on VMS, set non-blocking with ioctl, not fcntl
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
45394607 15require Exporter;
16
ae6aa562 17our $VERSION = '0.25';
45394607 18our $PACKAGE = __PACKAGE__;
19
20our @ISA = qw(Exporter);
21
22our %EXPORT_TAGS = ();
23our @EXPORT_OK = ();
24our @EXPORT = ();
25
26(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27our $KeyFile = "allkeys.txt";
28
3164dd77 29our $UNICODE_VERSION;
30
327745dc 31eval { require Unicode::UCD };
32
33unless ($@) {
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
35}
9f1f04a1 36else { # Perl 5.6.1
3164dd77 37 my($f, $fh);
38 foreach my $d (@INC) {
3164dd77 39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
40 if (open($fh, $f)) {
327745dc 41 $UNICODE_VERSION = '3.0.1';
3164dd77 42 close $fh;
43 last;
44 }
45 }
46}
47
4d36a948 48# Perl's boolean
49use constant TRUE => 1;
50use constant FALSE => "";
51use constant NOMATCHPOS => -1;
52
53# A coderef to get combining class imported from Unicode::Normalize
54# (i.e. \&Unicode::Normalize::getCombinClass).
55# This is also used as a HAS_UNICODE_NORMALIZE flag.
56our $getCombinClass;
57
9f1f04a1 58# Supported Levels
59use constant MinLevel => 1;
60use constant MaxLevel => 4;
61
4d36a948 62# Minimum weights at level 2 and 3, respectively
9f1f04a1 63use constant Min2Wt => 0x20;
64use constant Min3Wt => 0x02;
4d36a948 65
66# Shifted weight at 4th level
9f1f04a1 67use constant Shift4Wt => 0xFFFF;
4d36a948 68
69# Variable weight at 1st level.
70# This is a negative value but should be regarded as zero on collation.
71# This is for distinction of variable chars from level 3 ignorable chars.
9f1f04a1 72use constant Var1Wt => -1;
4d36a948 73
74
75# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
76# PROBLEM: The Default Unicode Collation Element Table
77# has weights over 0xFFFF at the 4th level.
78# The tie-breaking in the variable weights
79# other than "shift" (as well as "shift-trimmed") is unreliable.
80use constant VCE_TEMPLATE => 'Cn4';
81
4d36a948 82# A sort key: 16-bit weights
83# See also the PROBLEM on VCE_TEMPLATE above.
84use constant KEY_TEMPLATE => 'n*';
85
86# Level separator in a sort key:
87# i.e. pack(KEY_TEMPLATE, 0)
88use constant LEVEL_SEP => "\0\0";
89
90# As Unicode code point separator for hash keys.
91# A joined code point string (denoted by JCPS below)
92# like "65;768" is used for internal processing
93# instead of Perl's Unicode string like "\x41\x{300}",
94# as the native code point is different from the Unicode code point
95# on EBCDIC platform.
96# This character must not be included in any stringified
97# representation of an integer.
98use constant CODE_SEP => ';';
99
100# boolean values of variable weights
0116f5dc 101use constant NON_VAR => 0; # Non-Variable character
102use constant VAR => 1; # Variable character
3164dd77 103
4d36a948 104# Logical_Order_Exception in PropList.txt
105# TODO: synchronization with change of PropList.txt.
0116f5dc 106our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 107
0116f5dc 108sub UCA_Version { "9" }
a7fbee98 109
0116f5dc 110sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
a7fbee98 111
9f1f04a1 112######
113
9f1f04a1 114sub pack_U {
ae6aa562 115 return pack('U*', @_);
9f1f04a1 116}
117
118sub unpack_U {
ae6aa562 119 return unpack('U*', pack('U*').shift);
9f1f04a1 120}
121
122######
123
0116f5dc 124my (%AlternateOK);
125@AlternateOK{ qw/
126 blanked non-ignorable shifted shift-trimmed
127 / } = ();
128
129our @ChangeOK = qw/
130 alternate backwards level normalization rearrange
131 katakana_before_hiragana upper_before_lower
132 overrideHangul overrideCJK preprocess UCA_Version
133 /;
134
135our @ChangeNG = qw/
9f1f04a1 136 entry entries table maxlength
0116f5dc 137 ignoreChar ignoreName undefChar undefName
138 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
4d36a948 139 derivCode normCode rearrangeHash L3_ignorable
9f1f04a1 140 backwardsFlag
0116f5dc 141 /;
9f1f04a1 142# The hash key 'ignored' is deleted at v 0.21.
143# The hash key 'isShift' is deleted at v 0.23.
144# The hash key 'combining' is deleted at v 0.24.
0116f5dc 145
146my (%ChangeOK, %ChangeNG);
147@ChangeOK{ @ChangeOK } = ();
148@ChangeNG{ @ChangeNG } = ();
149
150sub change {
151 my $self = shift;
152 my %hash = @_;
153 my %old;
154 foreach my $k (keys %hash) {
155 if (exists $ChangeOK{$k}) {
156 $old{$k} = $self->{$k};
157 $self->{$k} = $hash{$k};
158 }
159 elsif (exists $ChangeNG{$k}) {
160 croak "change of $k via change() is not allowed!";
161 }
162 # else => ignored
163 }
164 $self->checkCollator;
165 return wantarray ? %old : $self;
166}
a7fbee98 167
9f1f04a1 168sub _checkLevel {
169 my $level = shift;
170 my $key = shift;
171 croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
172 $level, $key, MinLevel if MinLevel > $level;
173 croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
174 $level, $key, MaxLevel if MaxLevel < $level;
175}
176
0116f5dc 177sub checkCollator {
178 my $self = shift;
9f1f04a1 179 _checkLevel($self->{level}, "level");
a7fbee98 180
0116f5dc 181 $self->{derivCode} =
4d36a948 182 $self->{UCA_Version} == 8 ? \&_derivCE_8 :
183 $self->{UCA_Version} == 9 ? \&_derivCE_9 :
0116f5dc 184 croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 185
0116f5dc 186 $self->{alternate} = lc($self->{alternate});
187 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
188 unless exists $AlternateOK{ $self->{alternate} };
189
9f1f04a1 190 if (! defined $self->{backwards}) {
191 $self->{backwardsFlag} = 0;
192 }
193 elsif (! ref $self->{backwards}) {
194 _checkLevel($self->{backwards}, "backwards");
195 $self->{backwardsFlag} = 1 << $self->{backwards};
196 }
197 else {
198 my %level;
199 $self->{backwardsFlag} = 0;
200 for my $b (@{ $self->{backwards} }) {
201 _checkLevel($b, "backwards");
202 $level{$b} = 1;
203 }
204 for my $v (sort keys %level) {
205 $self->{backwardsFlag} += 1 << $v;
206 }
207 }
0116f5dc 208
209 $self->{rearrange} = []
210 if ! defined $self->{rearrange};
211 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
212 if ! ref $self->{rearrange};
213
214 # keys of $self->{rearrangeHash} are $self->{rearrange}.
215 $self->{rearrangeHash} = undef;
216
217 if (@{ $self->{rearrange} }) {
218 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
219 }
220
221 $self->{normCode} = undef;
a7fbee98 222
223 if (defined $self->{normalization}) {
224 eval { require Unicode::Normalize };
225 croak "Unicode/Normalize.pm is required to normalize strings: $@"
226 if $@;
227
228 Unicode::Normalize->import();
229 $getCombinClass = \&Unicode::Normalize::getCombinClass
230 if ! $getCombinClass;
231
0116f5dc 232 $self->{normCode} =
a7fbee98 233 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
234 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
235 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
236 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
237 croak "$PACKAGE unknown normalization form name: "
238 . $self->{normalization};
239 }
0116f5dc 240 return;
241}
242
243sub new
244{
245 my $class = shift;
246 my $self = bless { @_ }, $class;
45394607 247
a7fbee98 248 # If undef is passed explicitly, no file is read.
0116f5dc 249 $self->{table} = $KeyFile if ! exists $self->{table};
250 $self->read_table if defined $self->{table};
905aa9f0 251
a7fbee98 252 if ($self->{entry}) {
253 $self->parseEntry($_) foreach split /\n/, $self->{entry};
254 }
905aa9f0 255
9f1f04a1 256 $self->{level} ||= MaxLevel;
0116f5dc 257 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 258
0116f5dc 259 $self->{overrideHangul} = ''
260 if ! exists $self->{overrideHangul};
261 $self->{overrideCJK} = ''
262 if ! exists $self->{overrideCJK};
263 $self->{normalization} = 'D'
264 if ! exists $self->{normalization};
265 $self->{alternate} = $self->{alternateTable} || 'shifted'
266 if ! exists $self->{alternate};
267 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 268 if ! exists $self->{rearrange};
0116f5dc 269 $self->{backwards} = $self->{backwardsTable}
270 if ! exists $self->{backwards};
a7fbee98 271
0116f5dc 272 $self->checkCollator;
a7fbee98 273
274 return $self;
275}
905aa9f0 276
277sub read_table {
a7fbee98 278 my $self = shift;
279 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
280
e69a2255 281 my $filepath = File::Spec->catfile($Path, $file);
282 open my $fk, "<$filepath"
283 or croak "File does not exist at $filepath";
a7fbee98 284
285 while (<$fk>) {
286 next if /^\s*#/;
287 if (/^\s*\@/) {
0116f5dc 288 if (/^\s*\@version\s*(\S*)/) {
289 $self->{versionTable} ||= $1;
290 }
291 elsif (/^\s*\@alternate\s+(\S*)/) {
292 $self->{alternateTable} ||= $1;
a7fbee98 293 }
0116f5dc 294 elsif (/^\s*\@backwards\s+(\S*)/) {
295 push @{ $self->{backwardsTable} }, $1;
a7fbee98 296 }
0116f5dc 297 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
298 push @{ $self->{forwardsTable} }, $1;
a7fbee98 299 }
0116f5dc 300 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
301 push @{ $self->{rearrangeTable} }, _getHexArray($1);
a7fbee98 302 }
303 next;
304 }
305 $self->parseEntry($_);
45394607 306 }
a7fbee98 307 close $fk;
45394607 308}
309
905aa9f0 310
45394607 311##
312## get $line, parse it, and write an entry in $self
313##
314sub parseEntry
315{
a7fbee98 316 my $self = shift;
317 my $line = shift;
4d36a948 318 my($name, $entry, @uv, @key);
a7fbee98 319
320 return if $line !~ /^\s*[0-9A-Fa-f]/;
321
322 # removes comment and gets name
323 $name = $1
324 if $line =~ s/[#%]\s*(.*)//;
325 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
326
327 # gets element
328 my($e, $k) = split /;/, $line;
329 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
330 if ! $k;
331
4d36a948 332 @uv = _getHexArray($e);
333 return if !@uv;
334
335 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 336
4d36a948 337 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 338 my $ele = pack_U(@uv);
a7fbee98 339
4d36a948 340 # regarded as if it were not entried in the table
341 return
342 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 343
4d36a948 344 # replaced as completely ignorable
345 $k = '[.0000.0000.0000.0000]'
346 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 347 }
0116f5dc 348
4d36a948 349 # replaced as completely ignorable
350 $k = '[.0000.0000.0000.0000]'
351 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
352
4d36a948 353 my $is_L3_ignorable;
354
caffd4cf 355 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
356 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948 357 my @wt = _getHexArray($arr);
358 push @key, pack(VCE_TEMPLATE, $var, @wt);
4d36a948 359 $is_L3_ignorable = TRUE
360 if $wt[0] + $wt[1] + $wt[2] == 0;
361 # if $arr !~ /[1-9A-Fa-f]/; NG
362 # Conformance Test shows L3-ignorable is completely ignorable.
a7fbee98 363 }
caffd4cf 364
4d36a948 365 $self->{entries}{$entry} = \@key;
caffd4cf 366
4d36a948 367 $self->{L3_ignorable}{$uv[0]} = TRUE
368 if @uv == 1 && $is_L3_ignorable;
caffd4cf 369
4d36a948 370 # Contraction is to be considered in the range of this maxlength.
371 $self->{maxlength}{$uv[0]} = scalar @uv
372 if @uv > 1;
45394607 373}
374
9f1f04a1 375
45394607 376##
9f1f04a1 377## arrayref[weights] = altCE(VCE)
45394607 378##
d16e9e3d 379sub altCE
45394607 380{
a7fbee98 381 my $self = shift;
4d36a948 382 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
a7fbee98 383
384 $self->{alternate} eq 'blanked' ?
9f1f04a1 385 $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
a7fbee98 386 $self->{alternate} eq 'non-ignorable' ?
4d36a948 387 \@wt :
a7fbee98 388 $self->{alternate} eq 'shifted' ?
9f1f04a1 389 $var ? [Var1Wt, 0, 0, $wt[0] ]
390 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
a7fbee98 391 $self->{alternate} eq 'shift-trimmed' ?
9f1f04a1 392 $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
a7fbee98 393 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 394}
395
45394607 396sub viewSortKey
397{
a7fbee98 398 my $self = shift;
9f1f04a1 399 $self->visualizeSortKey($self->getSortKey(@_));
400}
0116f5dc 401
9f1f04a1 402sub visualizeSortKey
403{
404 my $self = shift;
405 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 406
9f1f04a1 407 if ($self->{UCA_Version} <= 8) {
0116f5dc 408 $view =~ s/ ?0000 ?/|/g;
409 } else {
410 $view =~ s/\b0000\b/|/g;
411 }
a7fbee98 412 return "[$view]";
45394607 413}
414
d16e9e3d 415
45394607 416##
4d36a948 417## arrayref of JCPS = splitCE(string to be collated)
418## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
45394607 419##
d16e9e3d 420sub splitCE
45394607 421{
a7fbee98 422 my $self = shift;
4d36a948 423 my $wLen = $_[1];
424
a7fbee98 425 my $code = $self->{preprocess};
0116f5dc 426 my $norm = $self->{normCode};
a7fbee98 427 my $ent = $self->{entries};
428 my $max = $self->{maxlength};
429 my $reH = $self->{rearrangeHash};
4d36a948 430 my $ign = $self->{L3_ignorable};
0116f5dc 431 my $ver9 = $self->{UCA_Version} > 8;
a7fbee98 432
4d36a948 433 my ($str, @buf);
a7fbee98 434
4d36a948 435 if ($wLen) {
436 $code and croak "Preprocess breaks character positions. "
437 . "Don't use with index(), match(), etc.";
438 $norm and croak "Normalization breaks character positions. "
439 . "Don't use with index(), match(), etc.";
440 $str = $_[0];
441 }
442 else {
443 $str = $_[0];
444 $str = &$code($str) if ref $code;
445 $str = &$norm($str) if ref $norm;
446 }
a7fbee98 447
4d36a948 448 # get array of Unicode code point of string.
9f1f04a1 449 my @src = unpack_U($str);
4d36a948 450
451 # rearrangement:
452 # Character positions are not kept if rearranged,
453 # then neglected if $wLen is true.
454 if ($reH && ! $wLen) {
a7fbee98 455 for (my $i = 0; $i < @src; $i++) {
456 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
457 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
458 $i++;
459 }
460 }
45394607 461 }
45394607 462
0116f5dc 463 if ($ver9) {
4d36a948 464 # To remove a character marked as a completely ignorable.
465 for (my $i = 0; $i < @src; $i++) {
466 $src[$i] = undef if $ign->{ $src[$i] };
467 }
0116f5dc 468 }
469
a7fbee98 470 for (my $i = 0; $i < @src; $i++) {
4d36a948 471 next if _isNonCharacter($src[$i]);
472
473 my $i_orig = $i;
474 my $ce = $src[$i];
475
476 if ($max->{$ce}) { # contract
477 my $temp_ce = $ce;
478
479 for (my $p = $i + 1; $p < @src; $p++) {
480 next if ! defined $src[$p];
481 $temp_ce .= CODE_SEP . $src[$p];
482 if ($ent->{$temp_ce}) {
483 $ce = $temp_ce;
484 $i = $p;
485 }
486 }
487 }
488
489 # with Combining Char (UTS#10, 4.2.1).
490 # requires Unicode::Normalize.
491 # Not be $wLen, as not croaked due to $norm.
492 if ($getCombinClass) {
493 for (my $p = $i + 1; $p < @src; $p++) {
494 next if ! defined $src[$p];
495 last unless $getCombinClass->($src[$p]);
496 my $tail = CODE_SEP . $src[$p];
497 if ($ent->{$ce.$tail}) {
498 $ce .= $tail;
499 $src[$p] = undef;
500 }
a7fbee98 501 }
a7fbee98 502 }
503
4d36a948 504 if ($wLen) {
505 for (my $p = $i + 1; $p < @src; $p++) {
506 last if defined $src[$p];
507 $i = $p;
a7fbee98 508 }
509 }
4d36a948 510
511 push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
45394607 512 }
4d36a948 513 return \@buf;
d16e9e3d 514}
45394607 515
d16e9e3d 516
517##
4d36a948 518## list of arrayrefs of weights = getWt(JCPS)
d16e9e3d 519##
520sub getWt
521{
a7fbee98 522 my $self = shift;
4d36a948 523 my $ce = shift;
a7fbee98 524 my $ent = $self->{entries};
a7fbee98 525 my $cjk = $self->{overrideCJK};
526 my $hang = $self->{overrideHangul};
0116f5dc 527 my $der = $self->{derivCode};
a7fbee98 528
4d36a948 529 return if !defined $ce;
530 return map($self->altCE($_), @{ $ent->{$ce} })
531 if $ent->{$ce};
0116f5dc 532
4d36a948 533 # CE must not be a contraction, then it's a code point.
534 my $u = $ce;
a7fbee98 535
4d36a948 536 if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
0116f5dc 537 return map $self->altCE($_),
538 $hang
4d36a948 539 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u))
0116f5dc 540 : defined $hang
541 ? map({
4d36a948 542 $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
0116f5dc 543 } _decompHangul($u))
544 : $der->($u);
a7fbee98 545 }
546 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
547 0x4E00 <= $u && $u <= 0x9FA5 ||
0116f5dc 548 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
549 return map $self->altCE($_),
550 $cjk
4d36a948 551 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
0116f5dc 552 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
9f1f04a1 553 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
0116f5dc 554 : $der->($u);
a7fbee98 555 }
556 else {
0116f5dc 557 return map $self->altCE($_), $der->($u);
a7fbee98 558 }
d16e9e3d 559}
560
d16e9e3d 561
562##
563## string sortkey = getSortKey(string arg)
564##
565sub getSortKey
566{
a7fbee98 567 my $self = shift;
568 my $lev = $self->{level};
4d36a948 569 my $rCE = $self->splitCE(shift); # get an arrayref of JCPS
0116f5dc 570 my $ver9 = $self->{UCA_Version} > 8;
4d36a948 571 my $v2i = $self->{alternate} ne 'non-ignorable';
a7fbee98 572
573 # weight arrays
0116f5dc 574 my (@buf, $last_is_variable);
575
4d36a948 576 foreach my $wt (map $self->getWt($_), @$rCE) {
577 if ($v2i && $ver9) {
578 if ($wt->[0] == 0) { # ignorable
579 next if $last_is_variable;
0116f5dc 580 } else {
9f1f04a1 581 $last_is_variable = ($wt->[0] == Var1Wt);
0116f5dc 582 }
583 }
4d36a948 584 push @buf, $wt;
0116f5dc 585 }
a7fbee98 586
587 # make sort key
588 my @ret = ([],[],[],[]);
589 foreach my $v (0..$lev-1) {
590 foreach my $b (@buf) {
4d36a948 591 push @{ $ret[$v] }, $b->[$v]
592 if 0 < $b->[$v];
a7fbee98 593 }
594 }
45394607 595
a7fbee98 596 # modification of tertiary weights
597 if ($self->{upper_before_lower}) {
598 foreach (@{ $ret[2] }) {
599 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
600 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
601 elsif ($_ == 0x1C) { $_ += 1 } # square upper
602 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
603 }
45394607 604 }
a7fbee98 605 if ($self->{katakana_before_hiragana}) {
606 foreach (@{ $ret[2] }) {
607 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
608 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
609 }
45394607 610 }
9f1f04a1 611
612 if ($self->{backwardsFlag}) {
613 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
614 if ($self->{backwardsFlag} & (1 << $v)) {
615 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
616 }
617 }
618 }
619
4d36a948 620 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 621}
622
623
624##
d16e9e3d 625## int compare = cmp(string a, string b)
45394607 626##
5398038e 627sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
628sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
629sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
630sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
631sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
632sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
633sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 634
635##
d16e9e3d 636## list[strings] sorted = sort(list[strings] arg)
45394607 637##
a7fbee98 638sub sort {
639 my $obj = shift;
640 return
641 map { $_->[1] }
642 sort{ $a->[0] cmp $b->[0] }
643 map [ $obj->getSortKey($_), $_ ], @_;
45394607 644}
645
0116f5dc 646
4d36a948 647sub _derivCE_9 {
0116f5dc 648 my $u = shift;
649 my $base =
4d36a948 650 (0x4E00 <= $u && $u <= 0x9FA5)
651 ? 0xFB40 : # CJK
0116f5dc 652 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
4d36a948 653 ? 0xFB80 # CJK ext.
654 : 0xFBC0; # others
0116f5dc 655
656 my $aaaa = $base + ($u >> 15);
657 my $bbbb = ($u & 0x7FFF) | 0x8000;
658 return
9f1f04a1 659 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
660 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc 661}
662
4d36a948 663sub _derivCE_8 {
0116f5dc 664 my $code = shift;
665 my $aaaa = 0xFF80 + ($code >> 15);
666 my $bbbb = ($code & 0x7FFF) | 0x8000;
667 return
4d36a948 668 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
669 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607 670}
671
672##
673## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
674##
a7fbee98 675sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 676
a7fbee98 677#
4d36a948 678# $code *must* be in Hangul syllable.
a7fbee98 679# Check it before you enter here.
680#
5398038e 681sub _decompHangul {
682 my $code = shift;
5398038e 683 my $SIndex = $code - 0xAC00;
684 my $LIndex = int( $SIndex / 588);
685 my $VIndex = int(($SIndex % 588) / 28);
686 my $TIndex = $SIndex % 28;
687 return (
a7fbee98 688 0x1100 + $LIndex,
689 0x1161 + $VIndex,
690 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 691 );
45394607 692}
693
4d36a948 694sub _isNonCharacter {
695 my $code = shift;
696 return ! defined $code # removed
697 || ($code < 0 || 0x10FFFF < $code) # out of range
698 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
699 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
700 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
701 ;
702}
703
704
705##
706## bool _nonIgnorAtLevel(arrayref weights, int level)
707##
708sub _nonIgnorAtLevel($$)
709{
710 my $wt = shift;
711 return if ! defined $wt;
712 my $lv = shift;
9f1f04a1 713 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948 714}
715
716##
717## bool _eqArray(
718## arrayref of arrayref[weights] source,
719## arrayref of arrayref[weights] substr,
720## int level)
721## * comparison of graphemes vs graphemes.
722## @$source >= @$substr must be true (check it before call this);
723##
724sub _eqArray($$$)
725{
726 my $source = shift;
727 my $substr = shift;
728 my $lev = shift;
729
730 for my $g (0..@$substr-1){
731 # Do the $g'th graphemes have the same number of AV weigths?
732 return if @{ $source->[$g] } != @{ $substr->[$g] };
733
734 for my $w (0..@{ $substr->[$g] }-1) {
735 for my $v (0..$lev-1) {
736 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
737 }
738 }
739 }
740 return 1;
741}
742
743##
744## (int position, int length)
745## int position = index(string, substring, position, [undoc'ed grobal])
746##
747## With "grobal" (only for the list context),
748## returns list of arrayref[position, length].
749##
750sub index
751{
752 my $self = shift;
753 my $str = shift;
754 my $len = length($str);
755 my $subCE = $self->splitCE(shift);
756 my $pos = @_ ? shift : 0;
757 $pos = 0 if $pos < 0;
758 my $grob = shift;
759
4d36a948 760 my $lev = $self->{level};
761 my $ver9 = $self->{UCA_Version} > 8;
762 my $v2i = $self->{alternate} ne 'non-ignorable';
763
764 if (! @$subCE) {
765 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
766 return $grob
767 ? map([$_, 0], $temp..$len)
768 : wantarray ? ($temp,0) : $temp;
769 }
770 if ($len < $pos) {
771 return wantarray ? () : NOMATCHPOS;
772 }
773 my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
774 if (! @$strCE) {
775 return wantarray ? () : NOMATCHPOS;
776 }
777 my $last_is_variable;
778 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
779
780 $last_is_variable = FALSE;
781 for my $wt (map $self->getWt($_), @$subCE) {
782 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
783
784 if ($v2i && $ver9) {
785 if ($wt->[0] == 0) {
786 $to_be_pushed = FALSE if $last_is_variable;
787 } else {
9f1f04a1 788 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 789 }
790 }
791
792 if (@subWt && $wt->[0] == 0) {
793 push @{ $subWt[-1] }, $wt if $to_be_pushed;
794 } else {
9f1f04a1 795 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 796 push @subWt, [ $wt ];
797 }
798 }
799
800 my $count = 0;
801 my $end = @$strCE - 1;
802
803 $last_is_variable = FALSE;
804
805 for (my $i = 0; $i <= $end; ) { # no $i++
806 my $found_base = 0;
807
808 # fetch a grapheme
809 while ($i <= $end && $found_base == 0) {
810 for my $wt ($self->getWt($strCE->[$i][0])) {
811 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
812
813 if ($v2i && $ver9) {
814 if ($wt->[0] == 0) {
815 $to_be_pushed = FALSE if $last_is_variable;
816 } else {
9f1f04a1 817 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 818 }
819 }
820
821 if (@strWt && $wt->[0] == 0) {
822 push @{ $strWt[-1] }, $wt if $to_be_pushed;
823 $finPos[-1] = $strCE->[$i][2];
824 } elsif ($to_be_pushed) {
9f1f04a1 825 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 826 push @strWt, [ $wt ];
827 push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
828 $finPos[-1] = NOMATCHPOS if $found_base;
829 push @finPos, $strCE->[$i][2];
830 $found_base++;
831 }
832 # else ===> no-op
833 }
834 $i++;
835 }
836
837 # try to match
838 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
839 if ($iniPos[0] != NOMATCHPOS &&
840 $finPos[$#subWt] != NOMATCHPOS &&
841 _eqArray(\@strWt, \@subWt, $lev)) {
842 my $temp = $iniPos[0] + $pos;
843
844 if ($grob) {
845 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
846 splice @strWt, 0, $#subWt;
847 splice @iniPos, 0, $#subWt;
848 splice @finPos, 0, $#subWt;
849 }
850 else {
851 return wantarray
852 ? ($temp, $finPos[$#subWt] - $iniPos[0])
853 : $temp;
854 }
855 }
856 shift @strWt;
857 shift @iniPos;
858 shift @finPos;
859 }
860 }
861
862 return $grob
863 ? @g_ret
864 : wantarray ? () : NOMATCHPOS;
865}
866
867##
868## scalarref to matching part = match(string, substring)
869##
870sub match
871{
872 my $self = shift;
873 if (my($pos,$len) = $self->index($_[0], $_[1])) {
874 my $temp = substr($_[0], $pos, $len);
875 return wantarray ? $temp : \$temp;
876 # An lvalue ref \substr should be avoided,
877 # since its value is affected by modification of its referent.
878 }
879 else {
880 return;
881 }
882}
883
884##
885## arrayref matching parts = gmatch(string, substring)
886##
887sub gmatch
888{
889 my $self = shift;
890 my $str = shift;
891 my $sub = shift;
892 return map substr($str, $_->[0], $_->[1]),
893 $self->index($str, $sub, 0, 'g');
894}
895
896##
897## bool subst'ed = subst(string, substring, replace)
898##
899sub subst
900{
901 my $self = shift;
902 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
903
904 if (my($pos,$len) = $self->index($_[0], $_[1])) {
905 if ($code) {
906 my $mat = substr($_[0], $pos, $len);
907 substr($_[0], $pos, $len, $code->($mat));
908 } else {
909 substr($_[0], $pos, $len, $_[2]);
910 }
911 return TRUE;
912 }
913 else {
914 return FALSE;
915 }
916}
917
918##
919## int count = gsubst(string, substring, replace)
920##
921sub gsubst
922{
923 my $self = shift;
924 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
925 my $cnt = 0;
926
927 # Replacement is carried out from the end, then use reverse.
928 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
929 if ($code) {
930 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
931 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
932 } else {
933 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
934 }
935 $cnt++;
936 }
937 return $cnt;
938}
939
45394607 9401;
941__END__
942
943=head1 NAME
944
a7fbee98 945Unicode::Collate - Unicode Collation Algorithm
45394607 946
947=head1 SYNOPSIS
948
949 use Unicode::Collate;
950
951 #construct
5398038e 952 $Collator = Unicode::Collate->new(%tailoring);
45394607 953
954 #sort
5398038e 955 @sorted = $Collator->sort(@not_sorted);
45394607 956
957 #compare
a7fbee98 958 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 959
960=head1 DESCRIPTION
961
4d36a948 962This module is an implementation
963of Unicode Technical Standard #10 (UTS #10)
964"Unicode Collation Algorithm."
965
45394607 966=head2 Constructor and Tailoring
967
d16e9e3d 968The C<new> method returns a collator object.
969
5398038e 970 $Collator = Unicode::Collate->new(
0116f5dc 971 UCA_Version => $UCA_Version,
45394607 972 alternate => $alternate,
973 backwards => $levelNumber, # or \@levelNumbers
974 entry => $element,
975 normalization => $normalization_form,
976 ignoreName => qr/$ignoreName/,
977 ignoreChar => qr/$ignoreChar/,
978 katakana_before_hiragana => $bool,
979 level => $collationLevel,
980 overrideCJK => \&overrideCJK,
981 overrideHangul => \&overrideHangul,
982 preprocess => \&preprocess,
983 rearrange => \@charList,
984 table => $filename,
985 undefName => qr/$undefName/,
986 undefChar => qr/$undefChar/,
987 upper_before_lower => $bool,
988 );
a7fbee98 989 # if %tailoring is false (i.e. empty),
5398038e 990 # $Collator should do the default collation.
45394607 991
992=over 4
993
0116f5dc 994=item UCA_Version
995
996If the version number of the older UCA is given,
997the older behavior of that version is emulated on collating.
998If omitted, the return value of C<UCA_Version()> is used.
999
1000The supported version: 8 or 9.
1001
1002B<This parameter may be removed in the future version,
1003as switching the algorithm would affect the performance.>
1004
45394607 1005=item alternate
1006
4d36a948 1007-- see 3.2.2 Variable Weighting, UTS #10.
caffd4cf 1008
1009(the title in UCA version 8: Alternate Weighting)
45394607 1010
a7fbee98 1011This key allows to alternate weighting for variable collation elements,
1012which are marked with an ASTERISK in the table
1013(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1014
1015 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 1016
a7fbee98 1017These names are case-insensitive.
45394607 1018By default (if specification is omitted), 'shifted' is adopted.
1019
4d36a948 1020 'Blanked' Variable elements are made ignorable at levels 1 through 3;
a7fbee98 1021 considered at the 4th level.
1022
1023 'Non-ignorable' Variable elements are not reset to ignorable.
1024
4d36a948 1025 'Shifted' Variable elements are made ignorable at levels 1 through 3
a7fbee98 1026 their level 4 weight is replaced by the old level 1 weight.
1027 Level 4 weight for Non-Variable elements is 0xFFFF.
1028
1029 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1030 are trimmed.
1031
45394607 1032=item backwards
1033
4d36a948 1034-- see 3.1.2 French Accents, UTS #10.
45394607 1035
1036 backwards => $levelNumber or \@levelNumbers
1037
1038Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1039If omitted, forwards at all the levels.
1040
1041=item entry
1042
4d36a948 1043-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1044
a7fbee98 1045Overrides a default order or defines additional collation elements
45394607 1046
1047 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 104800E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 10490063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
10500043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1051ENTRIES
1052
4d36a948 1053B<NOTE:> The code point in the UCA file format (before C<';'>)
1054B<must> be a Unicode code point, but not a native code point.
1055So C<0063> must always denote C<U+0063>,
1056but not a character of C<"\x63">.
1057
45394607 1058=item ignoreName
1059
1060=item ignoreChar
1061
4d36a948 1062-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
45394607 1063
caffd4cf 1064Makes the entry in the table completely ignorable;
1065i.e. as if the weights were zero at all level.
45394607 1066
a7fbee98 1067E.g. when 'a' and 'e' are ignorable,
45394607 1068'element' is equal to 'lament' (or 'lmnt').
1069
1070=item level
1071
4d36a948 1072-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1073
1074Set the maximum level.
1075Any higher levels than the specified one are ignored.
1076
1077 Level 1: alphabetic ordering
1078 Level 2: diacritic ordering
1079 Level 3: case ordering
1080 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1081
1082 ex.level => 2,
1083
a7fbee98 1084If omitted, the maximum is the 4th.
1085
45394607 1086=item normalization
1087
4d36a948 1088-- see 4.1 Normalize each input string, UTS #10.
45394607 1089
905aa9f0 1090If specified, strings are normalized before preparation of sort keys
45394607 1091(the normalization is executed after preprocess).
1092
1093As a form name, one of the following names must be used.
1094
1095 'C' or 'NFC' for Normalization Form C
1096 'D' or 'NFD' for Normalization Form D
1097 'KC' or 'NFKC' for Normalization Form KC
1098 'KD' or 'NFKD' for Normalization Form KD
1099
1100If omitted, the string is put into Normalization Form D.
1101
a7fbee98 1102If C<undef> is passed explicitly as the value for this key,
45394607 1103any normalization is not carried out (this may make tailoring easier
1104if any normalization is not desired).
1105
1106see B<CAVEAT>.
1107
1108=item overrideCJK
1109
4d36a948 1110-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1111
1112By default, mapping of CJK Unified Ideographs
a7fbee98 1113uses the Unicode codepoint order.
1114But the mapping of CJK Unified Ideographs may be overrided.
45394607 1115
a7fbee98 1116ex. CJK Unified Ideographs in the JIS code point order.
45394607 1117
1118 overrideCJK => sub {
a7fbee98 1119 my $u = shift; # get a Unicode codepoint
1120 my $b = pack('n', $u); # to UTF-16BE
1121 my $s = your_unicode_to_sjis_converter($b); # convert
1122 my $n = unpack('n', $s); # convert sjis to short
1123 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1124 },
1125
a7fbee98 1126ex. ignores all CJK Unified Ideographs.
1127
1128 overrideCJK => sub {()}, # CODEREF returning empty list
1129
1130 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1131 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1132
1133If C<undef> is passed explicitly as the value for this key,
1134weights for CJK Unified Ideographs are treated as undefined.
1135But assignment of weight for CJK Unified Ideographs
1136in table or L<entry> is still valid.
1137
1138=item overrideHangul
1139
4d36a948 1140-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1141
1142By default, Hangul Syllables are decomposed into Hangul Jamo.
1143But the mapping of Hangul Syllables may be overrided.
1144
1145This tag works like L<overrideCJK>, so see there for examples.
1146
45394607 1147If you want to override the mapping of Hangul Syllables,
1148the Normalization Forms D and KD are not appropriate
1149(they will be decomposed before overriding).
1150
a7fbee98 1151If C<undef> is passed explicitly as the value for this key,
1152weight for Hangul Syllables is treated as undefined
1153without decomposition into Hangul Jamo.
1154But definition of weight for Hangul Syllables
1155in table or L<entry> is still valid.
1156
45394607 1157=item preprocess
1158
4d36a948 1159-- see 5.1 Preprocessing, UTS #10.
45394607 1160
1161If specified, the coderef is used to preprocess
1162before the formation of sort keys.
1163
a7fbee98 1164ex. dropping English articles, such as "a" or "the".
45394607 1165Then, "the pen" is before "a pencil".
1166
1167 preprocess => sub {
1168 my $str = shift;
a7fbee98 1169 $str =~ s/\b(?:an?|the)\s+//gi;
45394607 1170 $str;
1171 },
1172
1173=item rearrange
1174
4d36a948 1175-- see 3.1.3 Rearrangement, UTS #10.
45394607 1176
1177Characters that are not coded in logical order and to be rearranged.
a7fbee98 1178By default,
45394607 1179
1180 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1181
a7fbee98 1182If you want to disallow any rearrangement,
1183pass C<undef> or C<[]> (a reference to an empty list)
1184as the value for this key.
1185
0116f5dc 1186B<According to the version 9 of UCA, this parameter shall not be used;
1187but it is not warned at present.>
1188
45394607 1189=item table
1190
4d36a948 1191-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1192
1193You can use another element table if desired.
ae6aa562 1194The table file must be put into a directory
1195where F<Unicode/Collate.pm> is installed.
1196E.g. in F<perl/lib/Unicode/Collate> directory
1197when you have F<perl/lib/Unicode/Collate.pm>.
45394607 1198
ae6aa562 1199By default, the filename F<"allkeys.txt"> is used.
45394607 1200
a7fbee98 1201If C<undef> is passed explicitly as the value for this key,
1202no file is read (but you can define collation elements via L<entry>).
1203
1204A typical way to define a collation element table
1205without any file of table:
1206
1207 $onlyABC = Unicode::Collate->new(
1208 table => undef,
1209 entry => << 'ENTRIES',
12100061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
12110041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
12120062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
12130042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
12140063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
12150043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1216ENTRIES
1217 );
905aa9f0 1218
45394607 1219=item undefName
1220
1221=item undefChar
1222
4d36a948 1223-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1224
1225Undefines the collation element as if it were unassigned in the table.
1226This reduces the size of the table.
1227If an unassigned character appears in the string to be collated,
1228the sort key is made from its codepoint
1229as a single-character collation element,
1230as it is greater than any other assigned collation elements
1231(in the codepoint order among the unassigned characters).
1232But, it'd be better to ignore characters
1233unfamiliar to you and maybe never used.
1234
1235=item katakana_before_hiragana
1236
1237=item upper_before_lower
1238
4d36a948 1239-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
45394607 1240
1241By default, lowercase is before uppercase
1242and hiragana is before katakana.
1243
a7fbee98 1244If the tag is made true, this is reversed.
1245
1246B<NOTE>: These tags simplemindedly assume
1247any lowercase/uppercase or hiragana/katakana distinctions
9f1f04a1 1248must occur in level 3, and their weights at level 3
1249must be same as those mentioned in 7.3.1, UTS #10.
1250If you define your collation elements which violate this requirement,
4d36a948 1251these tags don't work validly.
45394607 1252
1253=back
1254
3164dd77 1255=head2 Methods for Collation
45394607 1256
1257=over 4
1258
5398038e 1259=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1260
1261Sorts a list of strings.
1262
5398038e 1263=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1264
1265Returns 1 (when C<$a> is greater than C<$b>)
1266or 0 (when C<$a> is equal to C<$b>)
1267or -1 (when C<$a> is lesser than C<$b>).
1268
5398038e 1269=item C<$result = $Collator-E<gt>eq($a, $b)>
1270
1271=item C<$result = $Collator-E<gt>ne($a, $b)>
1272
1273=item C<$result = $Collator-E<gt>lt($a, $b)>
1274
1275=item C<$result = $Collator-E<gt>le($a, $b)>
1276
1277=item C<$result = $Collator-E<gt>gt($a, $b)>
1278
1279=item C<$result = $Collator-E<gt>ge($a, $b)>
1280
a7fbee98 1281They works like the same name operators as theirs.
5398038e 1282
1283 eq : whether $a is equal to $b.
1284 ne : whether $a is not equal to $b.
1285 lt : whether $a is lesser than $b.
1286 le : whether $a is lesser than $b or equal to $b.
1287 gt : whether $a is greater than $b.
1288 ge : whether $a is greater than $b or equal to $b.
1289
1290=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1291
4d36a948 1292-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1293
1294Returns a sort key.
1295
1296You compare the sort keys using a binary comparison
1297and get the result of the comparison of the strings using UCA.
1298
5398038e 1299 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1300
1301 is equivalent to
1302
5398038e 1303 $Collator->cmp($a, $b)
45394607 1304
a7fbee98 1305=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1306
a7fbee98 1307 use Unicode::Collate;
1308 my $c = Unicode::Collate->new();
1309 print $c->viewSortKey("Perl"),"\n";
1310
0116f5dc 1311 # output:
1312 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1313 # Level 1 Level 2 Level 3 Level 4
1314
1315 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1316
4d36a948 1317=back
1318
1319=head2 Methods for Searching
d16e9e3d 1320
4d36a948 1321B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1322for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1323C<subst>, C<gsubst>) is croaked,
1324as the position and the length might differ
1325from those on the specified string.
1326(And the C<rearrange> tag is neglected.)
d16e9e3d 1327
4d36a948 1328The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1329like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1330but they are not aware of any pattern, but only a literal substring.
1331
1332=over 4
1333
1334=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1335
1336=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1337
1338If C<$substring> matches a part of C<$string>, returns
1339the position of the first occurrence of the matching part in scalar context;
1340in list context, returns a two-element list of
1341the position and the length of the matching part.
1342
d16e9e3d 1343If C<$substring> does not match any part of C<$string>,
1344returns C<-1> in scalar context and
1345an empty list in list context.
1346
1347e.g. you say
1348
5398038e 1349 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1350 # (normalization => undef) is REQUIRED.
1351 my $str = "Ich muß studieren Perl.";
1352 my $sub = "MÜSS";
d16e9e3d 1353 my $match;
a7fbee98 1354 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1355 $match = substr($str, $pos, $len);
d16e9e3d 1356 }
1357
4d36a948 1358and get C<"muß"> in C<$match> since C<"muß">
1359is primary equal to C<"MÜSS">.
1360
1361=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1362
1363=item C<($match) = $Collator-E<gt>match($string, $substring)>
1364
1365If C<$substring> matches a part of C<$string>, in scalar context, returns
1366B<a reference to> the first occurrence of the matching part
1367(C<$match_ref> is always true if matches,
1368since every reference is B<true>);
1369in list context, returns the first occurrence of the matching part.
1370
1371If C<$substring> does not match any part of C<$string>,
1372returns C<undef> in scalar context and
1373an empty list in list context.
1374
1375e.g.
1376
1377 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1378 print "matches [$$match_ref].\n";
1379 } else {
1380 print "doesn't match.\n";
1381 }
1382
1383 or
1384
1385 if (($match) = $Collator->match($str, $sub)) { # list context
1386 print "matches [$match].\n";
1387 } else {
1388 print "doesn't match.\n";
1389 }
1390
1391=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1392
1393If C<$substring> matches a part of C<$string>, returns
1394all the matching parts (or matching count in scalar context).
1395
1396If C<$substring> does not match any part of C<$string>,
1397returns an empty list.
1398
1399=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1400
1401If C<$substring> matches a part of C<$string>,
1402the first occurrence of the matching part is replaced by C<$replacement>
1403(C<$string> is modified) and return C<$count> (always equals to C<1>).
1404
1405C<$replacement> can be a C<CODEREF>,
1406taking the matching part as an argument,
1407and returning a string to replace the matching part
1408(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1409
1410=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1411
1412If C<$substring> matches a part of C<$string>,
1413all the occurrences of the matching part is replaced by C<$replacement>
1414(C<$string> is modified) and return C<$count>.
1415
1416C<$replacement> can be a C<CODEREF>,
1417taking the matching part as an argument,
1418and returning a string to replace the matching part
1419(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1420
1421e.g.
1422
1423 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1424 # (normalization => undef) is REQUIRED.
1425 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1426 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1427
1428 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1429 # i.e., all the camels are made bold-faced.
d16e9e3d 1430
45394607 1431=back
1432
3164dd77 1433=head2 Other Methods
1434
1435=over 4
1436
0116f5dc 1437=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1438
1439Change the value of specified keys and returns the changed part.
1440
1441 $Collator = Unicode::Collate->new(level => 4);
1442
1443 $Collator->eq("perl", "PERL"); # false
1444
1445 %old = $Collator->change(level => 2); # returns (level => 4).
1446
1447 $Collator->eq("perl", "PERL"); # true
1448
1449 $Collator->change(%old); # returns (level => 2).
1450
1451 $Collator->eq("perl", "PERL"); # false
1452
1453Not all C<(key,value)>s are allowed to be changed.
1454See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1455
1456In the scalar context, returns the modified collator
1457(but it is B<not> a clone from the original).
1458
1459 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1460
1461 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1462
1463 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1464
3164dd77 1465=item UCA_Version
1466
4d36a948 1467Returns the version number of UTS #10 this module consults.
3164dd77 1468
1469=item Base_Unicode_Version
1470
1471Returns the version number of the Unicode Standard
1472this module is based on.
1473
1474=back
1475
45394607 1476=head2 EXPORT
1477
1478None by default.
1479
1480=head2 CAVEAT
1481
1482Use of the C<normalization> parameter requires
1483the B<Unicode::Normalize> module.
1484
5398038e 1485If you need not it (say, in the case when you need not
45394607 1486handle any combining characters),
1487assign C<normalization =E<gt> undef> explicitly.
1488
4d36a948 1489-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1490
0116f5dc 1491=head2 Conformance Test
1492
1493The Conformance Test for the UCA is provided
1494in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1495and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1496
1497For F<CollationTest_SHIFTED.txt>,
1498a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1499for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1500C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1501
4d36a948 1502B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1503
45394607 1504=head1 AUTHOR
1505
1506SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1507
1508 http://homepage1.nifty.com/nomenclator/perl/
1509
ae6aa562 1510 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1511
a7fbee98 1512 This library is free software; you can redistribute it
1513 and/or modify it under the same terms as Perl itself.
45394607 1514
1515=head1 SEE ALSO
1516
1517=over 4
1518
0116f5dc 1519=item http://www.unicode.org/reports/tr10/
45394607 1520
4d36a948 1521Unicode Collation Algorithm - UTS #10
45394607 1522
0116f5dc 1523=item http://www.unicode.org/reports/tr10/allkeys.txt
a7fbee98 1524
1525The Default Unicode Collation Element Table
45394607 1526
0116f5dc 1527=item http://www.unicode.org/reports/tr10/CollationTest.html
1528http://www.unicode.org/reports/tr10/CollationTest.zip
1529
1530The latest versions of the conformance test for the UCA
1531
1532=item http://www.unicode.org/reports/tr15/
a7fbee98 1533
1534Unicode Normalization Forms - UAX #15
1535
a7fbee98 1536=item L<Unicode::Normalize>
45394607 1537
45394607 1538=back
1539
1540=cut