VMS %ENV fix (follow-up to 18852)
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Collate not ported to EBCDIC\n";
6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
45394607 15require Exporter;
16
4d36a948 17# Supporting on EBCDIC platform is not tested.
18# Tester(s) welcome!
19our $IsEBCDIC = ord("A") != 0x41;
20
21our $VERSION = '0.23';
45394607 22our $PACKAGE = __PACKAGE__;
23
24our @ISA = qw(Exporter);
25
26our %EXPORT_TAGS = ();
27our @EXPORT_OK = ();
28our @EXPORT = ();
29
30(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
31our $KeyFile = "allkeys.txt";
32
3164dd77 33our $UNICODE_VERSION;
34
327745dc 35eval { require Unicode::UCD };
36
37unless ($@) {
38 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
39}
40else { # XXX, Perl 5.6.1
3164dd77 41 my($f, $fh);
42 foreach my $d (@INC) {
3164dd77 43 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
44 if (open($fh, $f)) {
327745dc 45 $UNICODE_VERSION = '3.0.1';
3164dd77 46 close $fh;
47 last;
48 }
49 }
50}
51
4d36a948 52# Perl's boolean
53use constant TRUE => 1;
54use constant FALSE => "";
55use constant NOMATCHPOS => -1;
56
57# A coderef to get combining class imported from Unicode::Normalize
58# (i.e. \&Unicode::Normalize::getCombinClass).
59# This is also used as a HAS_UNICODE_NORMALIZE flag.
60our $getCombinClass;
61
62# Minimum weights at level 2 and 3, respectively
63use constant Min2 => 0x20;
64use constant Min3 => 0x02;
65
66# Shifted weight at 4th level
67use constant Shift4 => 0xFFFF;
68
69# Variable weight at 1st level.
70# This is a negative value but should be regarded as zero on collation.
71# This is for distinction of variable chars from level 3 ignorable chars.
72use constant Var1 => -1;
73
74
75# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
76# PROBLEM: The Default Unicode Collation Element Table
77# has weights over 0xFFFF at the 4th level.
78# The tie-breaking in the variable weights
79# other than "shift" (as well as "shift-trimmed") is unreliable.
80use constant VCE_TEMPLATE => 'Cn4';
81
82# Unicode encoding of strings to be collated
83# TODO: 'N*' for UTF-32BE, 'V*' for UTF-32LE.
84use constant UTF_TEMPLATE => 'U*';
85
86# A sort key: 16-bit weights
87# See also the PROBLEM on VCE_TEMPLATE above.
88use constant KEY_TEMPLATE => 'n*';
89
90# Level separator in a sort key:
91# i.e. pack(KEY_TEMPLATE, 0)
92use constant LEVEL_SEP => "\0\0";
93
94# As Unicode code point separator for hash keys.
95# A joined code point string (denoted by JCPS below)
96# like "65;768" is used for internal processing
97# instead of Perl's Unicode string like "\x41\x{300}",
98# as the native code point is different from the Unicode code point
99# on EBCDIC platform.
100# This character must not be included in any stringified
101# representation of an integer.
102use constant CODE_SEP => ';';
103
104# boolean values of variable weights
0116f5dc 105use constant NON_VAR => 0; # Non-Variable character
106use constant VAR => 1; # Variable character
3164dd77 107
4d36a948 108# Logical_Order_Exception in PropList.txt
109# TODO: synchronization with change of PropList.txt.
0116f5dc 110our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 111
0116f5dc 112sub UCA_Version { "9" }
a7fbee98 113
0116f5dc 114sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
a7fbee98 115
0116f5dc 116my (%AlternateOK);
117@AlternateOK{ qw/
118 blanked non-ignorable shifted shift-trimmed
119 / } = ();
120
121our @ChangeOK = qw/
122 alternate backwards level normalization rearrange
123 katakana_before_hiragana upper_before_lower
124 overrideHangul overrideCJK preprocess UCA_Version
125 /;
126
127our @ChangeNG = qw/
caffd4cf 128 entry entries table combining maxlength
0116f5dc 129 ignoreChar ignoreName undefChar undefName
130 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
4d36a948 131 derivCode normCode rearrangeHash L3_ignorable
0116f5dc 132 /;
caffd4cf 133# The hash key 'ignored' is deleted at VERSION 0.21.
4d36a948 134# The hash key 'isShift' are deleted at VERSION 0.23.
0116f5dc 135
136my (%ChangeOK, %ChangeNG);
137@ChangeOK{ @ChangeOK } = ();
138@ChangeNG{ @ChangeNG } = ();
139
140sub change {
141 my $self = shift;
142 my %hash = @_;
143 my %old;
144 foreach my $k (keys %hash) {
145 if (exists $ChangeOK{$k}) {
146 $old{$k} = $self->{$k};
147 $self->{$k} = $hash{$k};
148 }
149 elsif (exists $ChangeNG{$k}) {
150 croak "change of $k via change() is not allowed!";
151 }
152 # else => ignored
153 }
154 $self->checkCollator;
155 return wantarray ? %old : $self;
156}
a7fbee98 157
0116f5dc 158sub checkCollator {
159 my $self = shift;
a7fbee98 160 croak "Illegal level lower than 1 (passed $self->{level})."
161 if $self->{level} < 1;
162 croak "A level higher than 4 (passed $self->{level}) is not supported."
163 if 4 < $self->{level};
164
0116f5dc 165 $self->{derivCode} =
4d36a948 166 $self->{UCA_Version} == 8 ? \&_derivCE_8 :
167 $self->{UCA_Version} == 9 ? \&_derivCE_9 :
0116f5dc 168 croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 169
0116f5dc 170 $self->{alternate} = lc($self->{alternate});
171 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
172 unless exists $AlternateOK{ $self->{alternate} };
173
0116f5dc 174 $self->{backwards} = []
175 if ! defined $self->{backwards};
176 $self->{backwards} = [ $self->{backwards} ]
177 if ! ref $self->{backwards};
178
179 $self->{rearrange} = []
180 if ! defined $self->{rearrange};
181 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
182 if ! ref $self->{rearrange};
183
184 # keys of $self->{rearrangeHash} are $self->{rearrange}.
185 $self->{rearrangeHash} = undef;
186
187 if (@{ $self->{rearrange} }) {
188 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
189 }
190
191 $self->{normCode} = undef;
a7fbee98 192
193 if (defined $self->{normalization}) {
194 eval { require Unicode::Normalize };
195 croak "Unicode/Normalize.pm is required to normalize strings: $@"
196 if $@;
197
198 Unicode::Normalize->import();
199 $getCombinClass = \&Unicode::Normalize::getCombinClass
200 if ! $getCombinClass;
201
0116f5dc 202 $self->{normCode} =
a7fbee98 203 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
204 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
205 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
206 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
207 croak "$PACKAGE unknown normalization form name: "
208 . $self->{normalization};
209 }
0116f5dc 210 return;
211}
212
213sub new
214{
215 my $class = shift;
216 my $self = bless { @_ }, $class;
45394607 217
a7fbee98 218 # If undef is passed explicitly, no file is read.
0116f5dc 219 $self->{table} = $KeyFile if ! exists $self->{table};
220 $self->read_table if defined $self->{table};
905aa9f0 221
a7fbee98 222 if ($self->{entry}) {
223 $self->parseEntry($_) foreach split /\n/, $self->{entry};
224 }
905aa9f0 225
0116f5dc 226 $self->{level} ||= 4;
227 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 228
0116f5dc 229 $self->{overrideHangul} = ''
230 if ! exists $self->{overrideHangul};
231 $self->{overrideCJK} = ''
232 if ! exists $self->{overrideCJK};
233 $self->{normalization} = 'D'
234 if ! exists $self->{normalization};
235 $self->{alternate} = $self->{alternateTable} || 'shifted'
236 if ! exists $self->{alternate};
237 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 238 if ! exists $self->{rearrange};
0116f5dc 239 $self->{backwards} = $self->{backwardsTable}
240 if ! exists $self->{backwards};
a7fbee98 241
0116f5dc 242 $self->checkCollator;
a7fbee98 243
244 return $self;
245}
905aa9f0 246
247sub read_table {
a7fbee98 248 my $self = shift;
249 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
250
e69a2255 251 my $filepath = File::Spec->catfile($Path, $file);
252 open my $fk, "<$filepath"
253 or croak "File does not exist at $filepath";
a7fbee98 254
255 while (<$fk>) {
256 next if /^\s*#/;
257 if (/^\s*\@/) {
0116f5dc 258 if (/^\s*\@version\s*(\S*)/) {
259 $self->{versionTable} ||= $1;
260 }
261 elsif (/^\s*\@alternate\s+(\S*)/) {
262 $self->{alternateTable} ||= $1;
a7fbee98 263 }
0116f5dc 264 elsif (/^\s*\@backwards\s+(\S*)/) {
265 push @{ $self->{backwardsTable} }, $1;
a7fbee98 266 }
0116f5dc 267 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
268 push @{ $self->{forwardsTable} }, $1;
a7fbee98 269 }
0116f5dc 270 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
271 push @{ $self->{rearrangeTable} }, _getHexArray($1);
a7fbee98 272 }
273 next;
274 }
275 $self->parseEntry($_);
45394607 276 }
a7fbee98 277 close $fk;
45394607 278}
279
905aa9f0 280
45394607 281##
282## get $line, parse it, and write an entry in $self
283##
284sub parseEntry
285{
a7fbee98 286 my $self = shift;
287 my $line = shift;
4d36a948 288 my($name, $entry, @uv, @key);
a7fbee98 289
290 return if $line !~ /^\s*[0-9A-Fa-f]/;
291
292 # removes comment and gets name
293 $name = $1
294 if $line =~ s/[#%]\s*(.*)//;
295 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
296
297 # gets element
298 my($e, $k) = split /;/, $line;
299 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
300 if ! $k;
301
4d36a948 302 @uv = _getHexArray($e);
303 return if !@uv;
304
305 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 306
4d36a948 307 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
308 # Do not use UTF_TEMPLATE; Perl' RE is only for utf8.
309 my $ele = $IsEBCDIC
310 ? pack('U*', map utf8::unicode_to_native($_), @uv)
311 : pack('U*', @uv);
a7fbee98 312
4d36a948 313 # regarded as if it were not entried in the table
314 return
315 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 316
4d36a948 317 # replaced as completely ignorable
318 $k = '[.0000.0000.0000.0000]'
319 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 320 }
0116f5dc 321
4d36a948 322 # replaced as completely ignorable
323 $k = '[.0000.0000.0000.0000]'
324 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
325
326 my $combining = TRUE; # primary = 0, secondary != 0;
327 my $is_L3_ignorable;
328
caffd4cf 329 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
330 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948 331 my @wt = _getHexArray($arr);
332 push @key, pack(VCE_TEMPLATE, $var, @wt);
333 $combining = FALSE
334 unless $wt[0] == 0 && $wt[1] != 0;
335 $is_L3_ignorable = TRUE
336 if $wt[0] + $wt[1] + $wt[2] == 0;
337 # if $arr !~ /[1-9A-Fa-f]/; NG
338 # Conformance Test shows L3-ignorable is completely ignorable.
a7fbee98 339 }
caffd4cf 340
4d36a948 341 $self->{entries}{$entry} = \@key;
caffd4cf 342
4d36a948 343 $self->{combining}{$entry} = TRUE
caffd4cf 344 if $combining;
345
4d36a948 346 # The key is a string representing a numeral code point.
347
348 $self->{L3_ignorable}{$uv[0]} = TRUE
349 if @uv == 1 && $is_L3_ignorable;
caffd4cf 350
4d36a948 351 # Contraction is to be considered in the range of this maxlength.
352 $self->{maxlength}{$uv[0]} = scalar @uv
353 if @uv > 1;
45394607 354}
355
45394607 356##
4d36a948 357## arrayref[weights] = altCE(bool variable?, list[num] weights)
45394607 358##
d16e9e3d 359sub altCE
45394607 360{
a7fbee98 361 my $self = shift;
4d36a948 362 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
a7fbee98 363
364 $self->{alternate} eq 'blanked' ?
4d36a948 365 $var ? [Var1, 0, 0, $wt[3]] : \@wt :
a7fbee98 366 $self->{alternate} eq 'non-ignorable' ?
4d36a948 367 \@wt :
a7fbee98 368 $self->{alternate} eq 'shifted' ?
4d36a948 369 $var ? [Var1, 0, 0, $wt[0] ]
370 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4 : 0 ] :
a7fbee98 371 $self->{alternate} eq 'shift-trimmed' ?
4d36a948 372 $var ? [Var1, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
a7fbee98 373 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 374}
375
45394607 376sub viewSortKey
377{
a7fbee98 378 my $self = shift;
0116f5dc 379 my $ver = $self->{UCA_Version};
380
a7fbee98 381 my $key = $self->getSortKey(@_);
4d36a948 382 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, $key);
383
0116f5dc 384 if ($ver <= 8) {
385 $view =~ s/ ?0000 ?/|/g;
386 } else {
387 $view =~ s/\b0000\b/|/g;
388 }
a7fbee98 389 return "[$view]";
45394607 390}
391
d16e9e3d 392
45394607 393##
4d36a948 394## arrayref of JCPS = splitCE(string to be collated)
395## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
45394607 396##
d16e9e3d 397sub splitCE
45394607 398{
a7fbee98 399 my $self = shift;
4d36a948 400 my $wLen = $_[1];
401
a7fbee98 402 my $code = $self->{preprocess};
0116f5dc 403 my $norm = $self->{normCode};
a7fbee98 404 my $ent = $self->{entries};
405 my $max = $self->{maxlength};
406 my $reH = $self->{rearrangeHash};
4d36a948 407 my $ign = $self->{L3_ignorable};
0116f5dc 408 my $ver9 = $self->{UCA_Version} > 8;
a7fbee98 409
4d36a948 410 my ($str, @buf);
a7fbee98 411
4d36a948 412 if ($wLen) {
413 $code and croak "Preprocess breaks character positions. "
414 . "Don't use with index(), match(), etc.";
415 $norm and croak "Normalization breaks character positions. "
416 . "Don't use with index(), match(), etc.";
417 $str = $_[0];
418 }
419 else {
420 $str = $_[0];
421 $str = &$code($str) if ref $code;
422 $str = &$norm($str) if ref $norm;
423 }
a7fbee98 424
4d36a948 425 # get array of Unicode code point of string.
426 my @src = $IsEBCDIC
427 ? map(utf8::native_to_unicode($_), unpack UTF_TEMPLATE, $str)
428 : unpack(UTF_TEMPLATE, $str);
429
430 # rearrangement:
431 # Character positions are not kept if rearranged,
432 # then neglected if $wLen is true.
433 if ($reH && ! $wLen) {
a7fbee98 434 for (my $i = 0; $i < @src; $i++) {
435 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
436 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
437 $i++;
438 }
439 }
45394607 440 }
45394607 441
0116f5dc 442 if ($ver9) {
4d36a948 443 # To remove a character marked as a completely ignorable.
444 for (my $i = 0; $i < @src; $i++) {
445 $src[$i] = undef if $ign->{ $src[$i] };
446 }
0116f5dc 447 }
448
a7fbee98 449 for (my $i = 0; $i < @src; $i++) {
4d36a948 450 next if _isNonCharacter($src[$i]);
451
452 my $i_orig = $i;
453 my $ce = $src[$i];
454
455 if ($max->{$ce}) { # contract
456 my $temp_ce = $ce;
457
458 for (my $p = $i + 1; $p < @src; $p++) {
459 next if ! defined $src[$p];
460 $temp_ce .= CODE_SEP . $src[$p];
461 if ($ent->{$temp_ce}) {
462 $ce = $temp_ce;
463 $i = $p;
464 }
465 }
466 }
467
468 # with Combining Char (UTS#10, 4.2.1).
469 # requires Unicode::Normalize.
470 # Not be $wLen, as not croaked due to $norm.
471 if ($getCombinClass) {
472 for (my $p = $i + 1; $p < @src; $p++) {
473 next if ! defined $src[$p];
474 last unless $getCombinClass->($src[$p]);
475 my $tail = CODE_SEP . $src[$p];
476 if ($ent->{$ce.$tail}) {
477 $ce .= $tail;
478 $src[$p] = undef;
479 }
a7fbee98 480 }
a7fbee98 481 }
482
4d36a948 483 if ($wLen) {
484 for (my $p = $i + 1; $p < @src; $p++) {
485 last if defined $src[$p];
486 $i = $p;
a7fbee98 487 }
488 }
4d36a948 489
490 push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
45394607 491 }
4d36a948 492 return \@buf;
d16e9e3d 493}
45394607 494
d16e9e3d 495
496##
4d36a948 497## list of arrayrefs of weights = getWt(JCPS)
d16e9e3d 498##
499sub getWt
500{
a7fbee98 501 my $self = shift;
4d36a948 502 my $ce = shift;
a7fbee98 503 my $ent = $self->{entries};
a7fbee98 504 my $cjk = $self->{overrideCJK};
505 my $hang = $self->{overrideHangul};
0116f5dc 506 my $der = $self->{derivCode};
a7fbee98 507
4d36a948 508 return if !defined $ce;
509 return map($self->altCE($_), @{ $ent->{$ce} })
510 if $ent->{$ce};
0116f5dc 511
4d36a948 512 # CE must not be a contraction, then it's a code point.
513 my $u = $ce;
a7fbee98 514
4d36a948 515 if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
0116f5dc 516 return map $self->altCE($_),
517 $hang
4d36a948 518 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u))
0116f5dc 519 : defined $hang
520 ? map({
4d36a948 521 $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
0116f5dc 522 } _decompHangul($u))
523 : $der->($u);
a7fbee98 524 }
525 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
526 0x4E00 <= $u && $u <= 0x9FA5 ||
0116f5dc 527 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
528 return map $self->altCE($_),
529 $cjk
4d36a948 530 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
0116f5dc 531 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
4d36a948 532 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2, Min3, $u)
0116f5dc 533 : $der->($u);
a7fbee98 534 }
535 else {
0116f5dc 536 return map $self->altCE($_), $der->($u);
a7fbee98 537 }
d16e9e3d 538}
539
d16e9e3d 540
541##
542## string sortkey = getSortKey(string arg)
543##
544sub getSortKey
545{
a7fbee98 546 my $self = shift;
547 my $lev = $self->{level};
4d36a948 548 my $rCE = $self->splitCE(shift); # get an arrayref of JCPS
0116f5dc 549 my $ver9 = $self->{UCA_Version} > 8;
4d36a948 550 my $v2i = $self->{alternate} ne 'non-ignorable';
a7fbee98 551
552 # weight arrays
0116f5dc 553 my (@buf, $last_is_variable);
554
4d36a948 555 foreach my $wt (map $self->getWt($_), @$rCE) {
556 if ($v2i && $ver9) {
557 if ($wt->[0] == 0) { # ignorable
558 next if $last_is_variable;
0116f5dc 559 } else {
4d36a948 560 $last_is_variable = ($wt->[0] == Var1);
0116f5dc 561 }
562 }
4d36a948 563 push @buf, $wt;
0116f5dc 564 }
a7fbee98 565
566 # make sort key
567 my @ret = ([],[],[],[]);
568 foreach my $v (0..$lev-1) {
569 foreach my $b (@buf) {
4d36a948 570 push @{ $ret[$v] }, $b->[$v]
571 if 0 < $b->[$v];
a7fbee98 572 }
573 }
574 foreach (@{ $self->{backwards} }) {
575 my $v = $_ - 1;
576 @{ $ret[$v] } = reverse @{ $ret[$v] };
45394607 577 }
45394607 578
a7fbee98 579 # modification of tertiary weights
580 if ($self->{upper_before_lower}) {
581 foreach (@{ $ret[2] }) {
582 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
583 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
584 elsif ($_ == 0x1C) { $_ += 1 } # square upper
585 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
586 }
45394607 587 }
a7fbee98 588 if ($self->{katakana_before_hiragana}) {
589 foreach (@{ $ret[2] }) {
590 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
591 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
592 }
45394607 593 }
4d36a948 594 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 595}
596
597
598##
d16e9e3d 599## int compare = cmp(string a, string b)
45394607 600##
5398038e 601sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
602sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
603sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
604sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
605sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
606sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
607sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 608
609##
d16e9e3d 610## list[strings] sorted = sort(list[strings] arg)
45394607 611##
a7fbee98 612sub sort {
613 my $obj = shift;
614 return
615 map { $_->[1] }
616 sort{ $a->[0] cmp $b->[0] }
617 map [ $obj->getSortKey($_), $_ ], @_;
45394607 618}
619
0116f5dc 620
4d36a948 621sub _derivCE_9 {
0116f5dc 622 my $u = shift;
623 my $base =
4d36a948 624 (0x4E00 <= $u && $u <= 0x9FA5)
625 ? 0xFB40 : # CJK
0116f5dc 626 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
4d36a948 627 ? 0xFB80 # CJK ext.
628 : 0xFBC0; # others
0116f5dc 629
630 my $aaaa = $base + ($u >> 15);
631 my $bbbb = ($u & 0x7FFF) | 0x8000;
632 return
4d36a948 633 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2, Min3, $u),
634 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc 635}
636
4d36a948 637sub _derivCE_8 {
0116f5dc 638 my $code = shift;
639 my $aaaa = 0xFF80 + ($code >> 15);
640 my $bbbb = ($code & 0x7FFF) | 0x8000;
641 return
4d36a948 642 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
643 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607 644}
645
646##
647## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
648##
a7fbee98 649sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 650
a7fbee98 651#
4d36a948 652# $code *must* be in Hangul syllable.
a7fbee98 653# Check it before you enter here.
654#
5398038e 655sub _decompHangul {
656 my $code = shift;
5398038e 657 my $SIndex = $code - 0xAC00;
658 my $LIndex = int( $SIndex / 588);
659 my $VIndex = int(($SIndex % 588) / 28);
660 my $TIndex = $SIndex % 28;
661 return (
a7fbee98 662 0x1100 + $LIndex,
663 0x1161 + $VIndex,
664 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 665 );
45394607 666}
667
4d36a948 668sub _isNonCharacter {
669 my $code = shift;
670 return ! defined $code # removed
671 || ($code < 0 || 0x10FFFF < $code) # out of range
672 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
673 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
674 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
675 ;
676}
677
678
679##
680## bool _nonIgnorAtLevel(arrayref weights, int level)
681##
682sub _nonIgnorAtLevel($$)
683{
684 my $wt = shift;
685 return if ! defined $wt;
686 my $lv = shift;
687 return grep($wt->[$_] != 0, 0..$lv-1) ? TRUE : FALSE;
688}
689
690##
691## bool _eqArray(
692## arrayref of arrayref[weights] source,
693## arrayref of arrayref[weights] substr,
694## int level)
695## * comparison of graphemes vs graphemes.
696## @$source >= @$substr must be true (check it before call this);
697##
698sub _eqArray($$$)
699{
700 my $source = shift;
701 my $substr = shift;
702 my $lev = shift;
703
704 for my $g (0..@$substr-1){
705 # Do the $g'th graphemes have the same number of AV weigths?
706 return if @{ $source->[$g] } != @{ $substr->[$g] };
707
708 for my $w (0..@{ $substr->[$g] }-1) {
709 for my $v (0..$lev-1) {
710 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
711 }
712 }
713 }
714 return 1;
715}
716
717##
718## (int position, int length)
719## int position = index(string, substring, position, [undoc'ed grobal])
720##
721## With "grobal" (only for the list context),
722## returns list of arrayref[position, length].
723##
724sub index
725{
726 my $self = shift;
727 my $str = shift;
728 my $len = length($str);
729 my $subCE = $self->splitCE(shift);
730 my $pos = @_ ? shift : 0;
731 $pos = 0 if $pos < 0;
732 my $grob = shift;
733
734 my $comb = $self->{combining};
735 my $lev = $self->{level};
736 my $ver9 = $self->{UCA_Version} > 8;
737 my $v2i = $self->{alternate} ne 'non-ignorable';
738
739 if (! @$subCE) {
740 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
741 return $grob
742 ? map([$_, 0], $temp..$len)
743 : wantarray ? ($temp,0) : $temp;
744 }
745 if ($len < $pos) {
746 return wantarray ? () : NOMATCHPOS;
747 }
748 my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
749 if (! @$strCE) {
750 return wantarray ? () : NOMATCHPOS;
751 }
752 my $last_is_variable;
753 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
754
755 $last_is_variable = FALSE;
756 for my $wt (map $self->getWt($_), @$subCE) {
757 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
758
759 if ($v2i && $ver9) {
760 if ($wt->[0] == 0) {
761 $to_be_pushed = FALSE if $last_is_variable;
762 } else {
763 $last_is_variable = ($wt->[0] == Var1);
764 }
765 }
766
767 if (@subWt && $wt->[0] == 0) {
768 push @{ $subWt[-1] }, $wt if $to_be_pushed;
769 } else {
770 $wt->[0] = 0 if $wt->[0] == Var1;
771 push @subWt, [ $wt ];
772 }
773 }
774
775 my $count = 0;
776 my $end = @$strCE - 1;
777
778 $last_is_variable = FALSE;
779
780 for (my $i = 0; $i <= $end; ) { # no $i++
781 my $found_base = 0;
782
783 # fetch a grapheme
784 while ($i <= $end && $found_base == 0) {
785 for my $wt ($self->getWt($strCE->[$i][0])) {
786 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
787
788 if ($v2i && $ver9) {
789 if ($wt->[0] == 0) {
790 $to_be_pushed = FALSE if $last_is_variable;
791 } else {
792 $last_is_variable = ($wt->[0] == Var1);
793 }
794 }
795
796 if (@strWt && $wt->[0] == 0) {
797 push @{ $strWt[-1] }, $wt if $to_be_pushed;
798 $finPos[-1] = $strCE->[$i][2];
799 } elsif ($to_be_pushed) {
800 $wt->[0] = 0 if $wt->[0] == Var1;
801 push @strWt, [ $wt ];
802 push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
803 $finPos[-1] = NOMATCHPOS if $found_base;
804 push @finPos, $strCE->[$i][2];
805 $found_base++;
806 }
807 # else ===> no-op
808 }
809 $i++;
810 }
811
812 # try to match
813 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
814 if ($iniPos[0] != NOMATCHPOS &&
815 $finPos[$#subWt] != NOMATCHPOS &&
816 _eqArray(\@strWt, \@subWt, $lev)) {
817 my $temp = $iniPos[0] + $pos;
818
819 if ($grob) {
820 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
821 splice @strWt, 0, $#subWt;
822 splice @iniPos, 0, $#subWt;
823 splice @finPos, 0, $#subWt;
824 }
825 else {
826 return wantarray
827 ? ($temp, $finPos[$#subWt] - $iniPos[0])
828 : $temp;
829 }
830 }
831 shift @strWt;
832 shift @iniPos;
833 shift @finPos;
834 }
835 }
836
837 return $grob
838 ? @g_ret
839 : wantarray ? () : NOMATCHPOS;
840}
841
842##
843## scalarref to matching part = match(string, substring)
844##
845sub match
846{
847 my $self = shift;
848 if (my($pos,$len) = $self->index($_[0], $_[1])) {
849 my $temp = substr($_[0], $pos, $len);
850 return wantarray ? $temp : \$temp;
851 # An lvalue ref \substr should be avoided,
852 # since its value is affected by modification of its referent.
853 }
854 else {
855 return;
856 }
857}
858
859##
860## arrayref matching parts = gmatch(string, substring)
861##
862sub gmatch
863{
864 my $self = shift;
865 my $str = shift;
866 my $sub = shift;
867 return map substr($str, $_->[0], $_->[1]),
868 $self->index($str, $sub, 0, 'g');
869}
870
871##
872## bool subst'ed = subst(string, substring, replace)
873##
874sub subst
875{
876 my $self = shift;
877 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
878
879 if (my($pos,$len) = $self->index($_[0], $_[1])) {
880 if ($code) {
881 my $mat = substr($_[0], $pos, $len);
882 substr($_[0], $pos, $len, $code->($mat));
883 } else {
884 substr($_[0], $pos, $len, $_[2]);
885 }
886 return TRUE;
887 }
888 else {
889 return FALSE;
890 }
891}
892
893##
894## int count = gsubst(string, substring, replace)
895##
896sub gsubst
897{
898 my $self = shift;
899 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
900 my $cnt = 0;
901
902 # Replacement is carried out from the end, then use reverse.
903 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
904 if ($code) {
905 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
906 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
907 } else {
908 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
909 }
910 $cnt++;
911 }
912 return $cnt;
913}
914
45394607 9151;
916__END__
917
918=head1 NAME
919
a7fbee98 920Unicode::Collate - Unicode Collation Algorithm
45394607 921
922=head1 SYNOPSIS
923
924 use Unicode::Collate;
925
926 #construct
5398038e 927 $Collator = Unicode::Collate->new(%tailoring);
45394607 928
929 #sort
5398038e 930 @sorted = $Collator->sort(@not_sorted);
45394607 931
932 #compare
a7fbee98 933 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 934
935=head1 DESCRIPTION
936
4d36a948 937This module is an implementation
938of Unicode Technical Standard #10 (UTS #10)
939"Unicode Collation Algorithm."
940
45394607 941=head2 Constructor and Tailoring
942
d16e9e3d 943The C<new> method returns a collator object.
944
5398038e 945 $Collator = Unicode::Collate->new(
0116f5dc 946 UCA_Version => $UCA_Version,
45394607 947 alternate => $alternate,
948 backwards => $levelNumber, # or \@levelNumbers
949 entry => $element,
950 normalization => $normalization_form,
951 ignoreName => qr/$ignoreName/,
952 ignoreChar => qr/$ignoreChar/,
953 katakana_before_hiragana => $bool,
954 level => $collationLevel,
955 overrideCJK => \&overrideCJK,
956 overrideHangul => \&overrideHangul,
957 preprocess => \&preprocess,
958 rearrange => \@charList,
959 table => $filename,
960 undefName => qr/$undefName/,
961 undefChar => qr/$undefChar/,
962 upper_before_lower => $bool,
963 );
a7fbee98 964 # if %tailoring is false (i.e. empty),
5398038e 965 # $Collator should do the default collation.
45394607 966
967=over 4
968
0116f5dc 969=item UCA_Version
970
971If the version number of the older UCA is given,
972the older behavior of that version is emulated on collating.
973If omitted, the return value of C<UCA_Version()> is used.
974
975The supported version: 8 or 9.
976
977B<This parameter may be removed in the future version,
978as switching the algorithm would affect the performance.>
979
45394607 980=item alternate
981
4d36a948 982-- see 3.2.2 Variable Weighting, UTS #10.
caffd4cf 983
984(the title in UCA version 8: Alternate Weighting)
45394607 985
a7fbee98 986This key allows to alternate weighting for variable collation elements,
987which are marked with an ASTERISK in the table
988(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
989
990 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 991
a7fbee98 992These names are case-insensitive.
45394607 993By default (if specification is omitted), 'shifted' is adopted.
994
4d36a948 995 'Blanked' Variable elements are made ignorable at levels 1 through 3;
a7fbee98 996 considered at the 4th level.
997
998 'Non-ignorable' Variable elements are not reset to ignorable.
999
4d36a948 1000 'Shifted' Variable elements are made ignorable at levels 1 through 3
a7fbee98 1001 their level 4 weight is replaced by the old level 1 weight.
1002 Level 4 weight for Non-Variable elements is 0xFFFF.
1003
1004 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1005 are trimmed.
1006
45394607 1007=item backwards
1008
4d36a948 1009-- see 3.1.2 French Accents, UTS #10.
45394607 1010
1011 backwards => $levelNumber or \@levelNumbers
1012
1013Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1014If omitted, forwards at all the levels.
1015
1016=item entry
1017
4d36a948 1018-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1019
a7fbee98 1020Overrides a default order or defines additional collation elements
45394607 1021
1022 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 102300E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 10240063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
10250043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1026ENTRIES
1027
4d36a948 1028B<NOTE:> The code point in the UCA file format (before C<';'>)
1029B<must> be a Unicode code point, but not a native code point.
1030So C<0063> must always denote C<U+0063>,
1031but not a character of C<"\x63">.
1032
45394607 1033=item ignoreName
1034
1035=item ignoreChar
1036
4d36a948 1037-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
45394607 1038
caffd4cf 1039Makes the entry in the table completely ignorable;
1040i.e. as if the weights were zero at all level.
45394607 1041
a7fbee98 1042E.g. when 'a' and 'e' are ignorable,
45394607 1043'element' is equal to 'lament' (or 'lmnt').
1044
1045=item level
1046
4d36a948 1047-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1048
1049Set the maximum level.
1050Any higher levels than the specified one are ignored.
1051
1052 Level 1: alphabetic ordering
1053 Level 2: diacritic ordering
1054 Level 3: case ordering
1055 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1056
1057 ex.level => 2,
1058
a7fbee98 1059If omitted, the maximum is the 4th.
1060
45394607 1061=item normalization
1062
4d36a948 1063-- see 4.1 Normalize each input string, UTS #10.
45394607 1064
905aa9f0 1065If specified, strings are normalized before preparation of sort keys
45394607 1066(the normalization is executed after preprocess).
1067
1068As a form name, one of the following names must be used.
1069
1070 'C' or 'NFC' for Normalization Form C
1071 'D' or 'NFD' for Normalization Form D
1072 'KC' or 'NFKC' for Normalization Form KC
1073 'KD' or 'NFKD' for Normalization Form KD
1074
1075If omitted, the string is put into Normalization Form D.
1076
a7fbee98 1077If C<undef> is passed explicitly as the value for this key,
45394607 1078any normalization is not carried out (this may make tailoring easier
1079if any normalization is not desired).
1080
1081see B<CAVEAT>.
1082
1083=item overrideCJK
1084
4d36a948 1085-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1086
1087By default, mapping of CJK Unified Ideographs
a7fbee98 1088uses the Unicode codepoint order.
1089But the mapping of CJK Unified Ideographs may be overrided.
45394607 1090
a7fbee98 1091ex. CJK Unified Ideographs in the JIS code point order.
45394607 1092
1093 overrideCJK => sub {
a7fbee98 1094 my $u = shift; # get a Unicode codepoint
1095 my $b = pack('n', $u); # to UTF-16BE
1096 my $s = your_unicode_to_sjis_converter($b); # convert
1097 my $n = unpack('n', $s); # convert sjis to short
1098 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1099 },
1100
a7fbee98 1101ex. ignores all CJK Unified Ideographs.
1102
1103 overrideCJK => sub {()}, # CODEREF returning empty list
1104
1105 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1106 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1107
1108If C<undef> is passed explicitly as the value for this key,
1109weights for CJK Unified Ideographs are treated as undefined.
1110But assignment of weight for CJK Unified Ideographs
1111in table or L<entry> is still valid.
1112
1113=item overrideHangul
1114
4d36a948 1115-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1116
1117By default, Hangul Syllables are decomposed into Hangul Jamo.
1118But the mapping of Hangul Syllables may be overrided.
1119
1120This tag works like L<overrideCJK>, so see there for examples.
1121
45394607 1122If you want to override the mapping of Hangul Syllables,
1123the Normalization Forms D and KD are not appropriate
1124(they will be decomposed before overriding).
1125
a7fbee98 1126If C<undef> is passed explicitly as the value for this key,
1127weight for Hangul Syllables is treated as undefined
1128without decomposition into Hangul Jamo.
1129But definition of weight for Hangul Syllables
1130in table or L<entry> is still valid.
1131
45394607 1132=item preprocess
1133
4d36a948 1134-- see 5.1 Preprocessing, UTS #10.
45394607 1135
1136If specified, the coderef is used to preprocess
1137before the formation of sort keys.
1138
a7fbee98 1139ex. dropping English articles, such as "a" or "the".
45394607 1140Then, "the pen" is before "a pencil".
1141
1142 preprocess => sub {
1143 my $str = shift;
a7fbee98 1144 $str =~ s/\b(?:an?|the)\s+//gi;
45394607 1145 $str;
1146 },
1147
1148=item rearrange
1149
4d36a948 1150-- see 3.1.3 Rearrangement, UTS #10.
45394607 1151
1152Characters that are not coded in logical order and to be rearranged.
a7fbee98 1153By default,
45394607 1154
1155 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1156
a7fbee98 1157If you want to disallow any rearrangement,
1158pass C<undef> or C<[]> (a reference to an empty list)
1159as the value for this key.
1160
0116f5dc 1161B<According to the version 9 of UCA, this parameter shall not be used;
1162but it is not warned at present.>
1163
45394607 1164=item table
1165
4d36a948 1166-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1167
1168You can use another element table if desired.
1169The table file must be in your C<lib/Unicode/Collate> directory.
1170
1171By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
1172
a7fbee98 1173If C<undef> is passed explicitly as the value for this key,
1174no file is read (but you can define collation elements via L<entry>).
1175
1176A typical way to define a collation element table
1177without any file of table:
1178
1179 $onlyABC = Unicode::Collate->new(
1180 table => undef,
1181 entry => << 'ENTRIES',
11820061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
11830041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
11840062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
11850042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
11860063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
11870043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1188ENTRIES
1189 );
905aa9f0 1190
45394607 1191=item undefName
1192
1193=item undefChar
1194
4d36a948 1195-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1196
1197Undefines the collation element as if it were unassigned in the table.
1198This reduces the size of the table.
1199If an unassigned character appears in the string to be collated,
1200the sort key is made from its codepoint
1201as a single-character collation element,
1202as it is greater than any other assigned collation elements
1203(in the codepoint order among the unassigned characters).
1204But, it'd be better to ignore characters
1205unfamiliar to you and maybe never used.
1206
1207=item katakana_before_hiragana
1208
1209=item upper_before_lower
1210
4d36a948 1211-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
45394607 1212
1213By default, lowercase is before uppercase
1214and hiragana is before katakana.
1215
a7fbee98 1216If the tag is made true, this is reversed.
1217
1218B<NOTE>: These tags simplemindedly assume
1219any lowercase/uppercase or hiragana/katakana distinctions
1220should occur in level 3, and their weights at level 3
4d36a948 1221should be same as those mentioned in 7.3.1, UTS #10.
a7fbee98 1222If you define your collation elements which violates this,
4d36a948 1223these tags don't work validly.
45394607 1224
1225=back
1226
3164dd77 1227=head2 Methods for Collation
45394607 1228
1229=over 4
1230
5398038e 1231=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1232
1233Sorts a list of strings.
1234
5398038e 1235=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1236
1237Returns 1 (when C<$a> is greater than C<$b>)
1238or 0 (when C<$a> is equal to C<$b>)
1239or -1 (when C<$a> is lesser than C<$b>).
1240
5398038e 1241=item C<$result = $Collator-E<gt>eq($a, $b)>
1242
1243=item C<$result = $Collator-E<gt>ne($a, $b)>
1244
1245=item C<$result = $Collator-E<gt>lt($a, $b)>
1246
1247=item C<$result = $Collator-E<gt>le($a, $b)>
1248
1249=item C<$result = $Collator-E<gt>gt($a, $b)>
1250
1251=item C<$result = $Collator-E<gt>ge($a, $b)>
1252
a7fbee98 1253They works like the same name operators as theirs.
5398038e 1254
1255 eq : whether $a is equal to $b.
1256 ne : whether $a is not equal to $b.
1257 lt : whether $a is lesser than $b.
1258 le : whether $a is lesser than $b or equal to $b.
1259 gt : whether $a is greater than $b.
1260 ge : whether $a is greater than $b or equal to $b.
1261
1262=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1263
4d36a948 1264-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1265
1266Returns a sort key.
1267
1268You compare the sort keys using a binary comparison
1269and get the result of the comparison of the strings using UCA.
1270
5398038e 1271 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1272
1273 is equivalent to
1274
5398038e 1275 $Collator->cmp($a, $b)
45394607 1276
a7fbee98 1277=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1278
a7fbee98 1279 use Unicode::Collate;
1280 my $c = Unicode::Collate->new();
1281 print $c->viewSortKey("Perl"),"\n";
1282
0116f5dc 1283 # output:
1284 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1285 # Level 1 Level 2 Level 3 Level 4
1286
1287 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1288
4d36a948 1289=back
1290
1291=head2 Methods for Searching
d16e9e3d 1292
4d36a948 1293B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1294for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1295C<subst>, C<gsubst>) is croaked,
1296as the position and the length might differ
1297from those on the specified string.
1298(And the C<rearrange> tag is neglected.)
d16e9e3d 1299
4d36a948 1300The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1301like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1302but they are not aware of any pattern, but only a literal substring.
1303
1304=over 4
1305
1306=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1307
1308=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1309
1310If C<$substring> matches a part of C<$string>, returns
1311the position of the first occurrence of the matching part in scalar context;
1312in list context, returns a two-element list of
1313the position and the length of the matching part.
1314
d16e9e3d 1315If C<$substring> does not match any part of C<$string>,
1316returns C<-1> in scalar context and
1317an empty list in list context.
1318
1319e.g. you say
1320
5398038e 1321 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1322 # (normalization => undef) is REQUIRED.
1323 my $str = "Ich muß studieren Perl.";
1324 my $sub = "MÜSS";
d16e9e3d 1325 my $match;
a7fbee98 1326 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1327 $match = substr($str, $pos, $len);
d16e9e3d 1328 }
1329
4d36a948 1330and get C<"muß"> in C<$match> since C<"muß">
1331is primary equal to C<"MÜSS">.
1332
1333=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1334
1335=item C<($match) = $Collator-E<gt>match($string, $substring)>
1336
1337If C<$substring> matches a part of C<$string>, in scalar context, returns
1338B<a reference to> the first occurrence of the matching part
1339(C<$match_ref> is always true if matches,
1340since every reference is B<true>);
1341in list context, returns the first occurrence of the matching part.
1342
1343If C<$substring> does not match any part of C<$string>,
1344returns C<undef> in scalar context and
1345an empty list in list context.
1346
1347e.g.
1348
1349 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1350 print "matches [$$match_ref].\n";
1351 } else {
1352 print "doesn't match.\n";
1353 }
1354
1355 or
1356
1357 if (($match) = $Collator->match($str, $sub)) { # list context
1358 print "matches [$match].\n";
1359 } else {
1360 print "doesn't match.\n";
1361 }
1362
1363=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1364
1365If C<$substring> matches a part of C<$string>, returns
1366all the matching parts (or matching count in scalar context).
1367
1368If C<$substring> does not match any part of C<$string>,
1369returns an empty list.
1370
1371=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1372
1373If C<$substring> matches a part of C<$string>,
1374the first occurrence of the matching part is replaced by C<$replacement>
1375(C<$string> is modified) and return C<$count> (always equals to C<1>).
1376
1377C<$replacement> can be a C<CODEREF>,
1378taking the matching part as an argument,
1379and returning a string to replace the matching part
1380(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1381
1382=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1383
1384If C<$substring> matches a part of C<$string>,
1385all the occurrences of the matching part is replaced by C<$replacement>
1386(C<$string> is modified) and return C<$count>.
1387
1388C<$replacement> can be a C<CODEREF>,
1389taking the matching part as an argument,
1390and returning a string to replace the matching part
1391(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1392
1393e.g.
1394
1395 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1396 # (normalization => undef) is REQUIRED.
1397 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1398 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1399
1400 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1401 # i.e., all the camels are made bold-faced.
d16e9e3d 1402
45394607 1403=back
1404
3164dd77 1405=head2 Other Methods
1406
1407=over 4
1408
0116f5dc 1409=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1410
1411Change the value of specified keys and returns the changed part.
1412
1413 $Collator = Unicode::Collate->new(level => 4);
1414
1415 $Collator->eq("perl", "PERL"); # false
1416
1417 %old = $Collator->change(level => 2); # returns (level => 4).
1418
1419 $Collator->eq("perl", "PERL"); # true
1420
1421 $Collator->change(%old); # returns (level => 2).
1422
1423 $Collator->eq("perl", "PERL"); # false
1424
1425Not all C<(key,value)>s are allowed to be changed.
1426See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1427
1428In the scalar context, returns the modified collator
1429(but it is B<not> a clone from the original).
1430
1431 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1432
1433 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1434
1435 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1436
3164dd77 1437=item UCA_Version
1438
4d36a948 1439Returns the version number of UTS #10 this module consults.
3164dd77 1440
1441=item Base_Unicode_Version
1442
1443Returns the version number of the Unicode Standard
1444this module is based on.
1445
1446=back
1447
45394607 1448=head2 EXPORT
1449
1450None by default.
1451
4a2e806c 1452=head2 TODO
1453
4d36a948 1454Unicode::Collate has not been ported to EBCDIC.
1455IMHO, use of utf8::unicode_to_native()/utf8::native_to_unicode()
1456at the proper postions should allow
1457this module to work on EBCDIC platform...
4a2e806c 1458
45394607 1459=head2 CAVEAT
1460
1461Use of the C<normalization> parameter requires
1462the B<Unicode::Normalize> module.
1463
5398038e 1464If you need not it (say, in the case when you need not
45394607 1465handle any combining characters),
1466assign C<normalization =E<gt> undef> explicitly.
1467
4d36a948 1468-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1469
0116f5dc 1470=head2 Conformance Test
1471
1472The Conformance Test for the UCA is provided
1473in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1474and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1475
1476For F<CollationTest_SHIFTED.txt>,
1477a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1478for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1479C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1480
4d36a948 1481B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1482
45394607 1483=head1 AUTHOR
1484
1485SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1486
1487 http://homepage1.nifty.com/nomenclator/perl/
1488
a7fbee98 1489 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1490
a7fbee98 1491 This library is free software; you can redistribute it
1492 and/or modify it under the same terms as Perl itself.
45394607 1493
1494=head1 SEE ALSO
1495
1496=over 4
1497
0116f5dc 1498=item http://www.unicode.org/reports/tr10/
45394607 1499
4d36a948 1500Unicode Collation Algorithm - UTS #10
45394607 1501
0116f5dc 1502=item http://www.unicode.org/reports/tr10/allkeys.txt
a7fbee98 1503
1504The Default Unicode Collation Element Table
45394607 1505
0116f5dc 1506=item http://www.unicode.org/reports/tr10/CollationTest.html
1507http://www.unicode.org/reports/tr10/CollationTest.zip
1508
1509The latest versions of the conformance test for the UCA
1510
1511=item http://www.unicode.org/reports/tr15/
a7fbee98 1512
1513Unicode Normalization Forms - UAX #15
1514
a7fbee98 1515=item L<Unicode::Normalize>
45394607 1516
45394607 1517=back
1518
1519=cut