Re: [perl #23656] Safe reval bleeds local variable values
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
45394607 15require Exporter;
16
1d2654e1 17our $VERSION = '0.27';
45394607 18our $PACKAGE = __PACKAGE__;
19
20our @ISA = qw(Exporter);
21
22our %EXPORT_TAGS = ();
23our @EXPORT_OK = ();
24our @EXPORT = ();
25
26(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27our $KeyFile = "allkeys.txt";
28
3164dd77 29our $UNICODE_VERSION;
30
327745dc 31eval { require Unicode::UCD };
32
33unless ($@) {
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
35}
9f1f04a1 36else { # Perl 5.6.1
3164dd77 37 my($f, $fh);
38 foreach my $d (@INC) {
3164dd77 39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
40 if (open($fh, $f)) {
327745dc 41 $UNICODE_VERSION = '3.0.1';
3164dd77 42 close $fh;
43 last;
44 }
45 }
46}
47
4d36a948 48# Perl's boolean
49use constant TRUE => 1;
50use constant FALSE => "";
51use constant NOMATCHPOS => -1;
52
53# A coderef to get combining class imported from Unicode::Normalize
54# (i.e. \&Unicode::Normalize::getCombinClass).
55# This is also used as a HAS_UNICODE_NORMALIZE flag.
56our $getCombinClass;
57
9f1f04a1 58# Supported Levels
59use constant MinLevel => 1;
60use constant MaxLevel => 4;
61
4d36a948 62# Minimum weights at level 2 and 3, respectively
9f1f04a1 63use constant Min2Wt => 0x20;
64use constant Min3Wt => 0x02;
4d36a948 65
66# Shifted weight at 4th level
9f1f04a1 67use constant Shift4Wt => 0xFFFF;
4d36a948 68
69# Variable weight at 1st level.
70# This is a negative value but should be regarded as zero on collation.
71# This is for distinction of variable chars from level 3 ignorable chars.
9f1f04a1 72use constant Var1Wt => -1;
4d36a948 73
74
75# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
76# PROBLEM: The Default Unicode Collation Element Table
77# has weights over 0xFFFF at the 4th level.
78# The tie-breaking in the variable weights
79# other than "shift" (as well as "shift-trimmed") is unreliable.
80use constant VCE_TEMPLATE => 'Cn4';
81
4d36a948 82# A sort key: 16-bit weights
83# See also the PROBLEM on VCE_TEMPLATE above.
84use constant KEY_TEMPLATE => 'n*';
85
86# Level separator in a sort key:
87# i.e. pack(KEY_TEMPLATE, 0)
88use constant LEVEL_SEP => "\0\0";
89
90# As Unicode code point separator for hash keys.
91# A joined code point string (denoted by JCPS below)
92# like "65;768" is used for internal processing
93# instead of Perl's Unicode string like "\x41\x{300}",
94# as the native code point is different from the Unicode code point
95# on EBCDIC platform.
96# This character must not be included in any stringified
97# representation of an integer.
98use constant CODE_SEP => ';';
99
100# boolean values of variable weights
0116f5dc 101use constant NON_VAR => 0; # Non-Variable character
102use constant VAR => 1; # Variable character
3164dd77 103
4d36a948 104# Logical_Order_Exception in PropList.txt
105# TODO: synchronization with change of PropList.txt.
0116f5dc 106our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 107
0116f5dc 108sub UCA_Version { "9" }
a7fbee98 109
0116f5dc 110sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
a7fbee98 111
9f1f04a1 112######
113
9f1f04a1 114sub pack_U {
ae6aa562 115 return pack('U*', @_);
9f1f04a1 116}
117
118sub unpack_U {
ae6aa562 119 return unpack('U*', pack('U*').shift);
9f1f04a1 120}
121
122######
123
0116f5dc 124my (%AlternateOK);
125@AlternateOK{ qw/
126 blanked non-ignorable shifted shift-trimmed
127 / } = ();
128
129our @ChangeOK = qw/
130 alternate backwards level normalization rearrange
131 katakana_before_hiragana upper_before_lower
132 overrideHangul overrideCJK preprocess UCA_Version
133 /;
134
135our @ChangeNG = qw/
9f1f04a1 136 entry entries table maxlength
0116f5dc 137 ignoreChar ignoreName undefChar undefName
138 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
4d36a948 139 derivCode normCode rearrangeHash L3_ignorable
9f1f04a1 140 backwardsFlag
0116f5dc 141 /;
9f1f04a1 142# The hash key 'ignored' is deleted at v 0.21.
143# The hash key 'isShift' is deleted at v 0.23.
144# The hash key 'combining' is deleted at v 0.24.
0116f5dc 145
146my (%ChangeOK, %ChangeNG);
147@ChangeOK{ @ChangeOK } = ();
148@ChangeNG{ @ChangeNG } = ();
149
150sub change {
151 my $self = shift;
152 my %hash = @_;
153 my %old;
154 foreach my $k (keys %hash) {
155 if (exists $ChangeOK{$k}) {
156 $old{$k} = $self->{$k};
157 $self->{$k} = $hash{$k};
158 }
159 elsif (exists $ChangeNG{$k}) {
160 croak "change of $k via change() is not allowed!";
161 }
162 # else => ignored
163 }
164 $self->checkCollator;
165 return wantarray ? %old : $self;
166}
a7fbee98 167
9f1f04a1 168sub _checkLevel {
169 my $level = shift;
170 my $key = shift;
171 croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
172 $level, $key, MinLevel if MinLevel > $level;
173 croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
174 $level, $key, MaxLevel if MaxLevel < $level;
175}
176
0116f5dc 177sub checkCollator {
178 my $self = shift;
9f1f04a1 179 _checkLevel($self->{level}, "level");
a7fbee98 180
0116f5dc 181 $self->{derivCode} =
4d36a948 182 $self->{UCA_Version} == 8 ? \&_derivCE_8 :
183 $self->{UCA_Version} == 9 ? \&_derivCE_9 :
0116f5dc 184 croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 185
0116f5dc 186 $self->{alternate} = lc($self->{alternate});
187 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
188 unless exists $AlternateOK{ $self->{alternate} };
189
9f1f04a1 190 if (! defined $self->{backwards}) {
191 $self->{backwardsFlag} = 0;
192 }
193 elsif (! ref $self->{backwards}) {
194 _checkLevel($self->{backwards}, "backwards");
195 $self->{backwardsFlag} = 1 << $self->{backwards};
196 }
197 else {
198 my %level;
199 $self->{backwardsFlag} = 0;
200 for my $b (@{ $self->{backwards} }) {
201 _checkLevel($b, "backwards");
202 $level{$b} = 1;
203 }
204 for my $v (sort keys %level) {
205 $self->{backwardsFlag} += 1 << $v;
206 }
207 }
0116f5dc 208
209 $self->{rearrange} = []
210 if ! defined $self->{rearrange};
211 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
212 if ! ref $self->{rearrange};
213
214 # keys of $self->{rearrangeHash} are $self->{rearrange}.
215 $self->{rearrangeHash} = undef;
216
217 if (@{ $self->{rearrange} }) {
218 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
219 }
220
221 $self->{normCode} = undef;
a7fbee98 222
223 if (defined $self->{normalization}) {
224 eval { require Unicode::Normalize };
225 croak "Unicode/Normalize.pm is required to normalize strings: $@"
226 if $@;
227
a7fbee98 228 $getCombinClass = \&Unicode::Normalize::getCombinClass
229 if ! $getCombinClass;
230
1d2654e1 231 my $norm = $self->{normalization};
232 $self->{normCode} = sub {
233 Unicode::Normalize::normalize($norm, shift);
234 };
235
236 eval { $self->{normCode}->("") }; # try
237 if ($@) {
238 croak "$PACKAGE unknown normalization form name: $norm";
239 }
a7fbee98 240 }
0116f5dc 241 return;
242}
243
244sub new
245{
246 my $class = shift;
247 my $self = bless { @_ }, $class;
45394607 248
a7fbee98 249 # If undef is passed explicitly, no file is read.
0116f5dc 250 $self->{table} = $KeyFile if ! exists $self->{table};
251 $self->read_table if defined $self->{table};
905aa9f0 252
a7fbee98 253 if ($self->{entry}) {
254 $self->parseEntry($_) foreach split /\n/, $self->{entry};
255 }
905aa9f0 256
9f1f04a1 257 $self->{level} ||= MaxLevel;
0116f5dc 258 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 259
0116f5dc 260 $self->{overrideHangul} = ''
261 if ! exists $self->{overrideHangul};
262 $self->{overrideCJK} = ''
263 if ! exists $self->{overrideCJK};
264 $self->{normalization} = 'D'
265 if ! exists $self->{normalization};
266 $self->{alternate} = $self->{alternateTable} || 'shifted'
267 if ! exists $self->{alternate};
268 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 269 if ! exists $self->{rearrange};
0116f5dc 270 $self->{backwards} = $self->{backwardsTable}
271 if ! exists $self->{backwards};
a7fbee98 272
0116f5dc 273 $self->checkCollator;
a7fbee98 274
275 return $self;
276}
905aa9f0 277
278sub read_table {
a7fbee98 279 my $self = shift;
280 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
281
e69a2255 282 my $filepath = File::Spec->catfile($Path, $file);
283 open my $fk, "<$filepath"
284 or croak "File does not exist at $filepath";
a7fbee98 285
286 while (<$fk>) {
287 next if /^\s*#/;
288 if (/^\s*\@/) {
0116f5dc 289 if (/^\s*\@version\s*(\S*)/) {
290 $self->{versionTable} ||= $1;
291 }
292 elsif (/^\s*\@alternate\s+(\S*)/) {
293 $self->{alternateTable} ||= $1;
a7fbee98 294 }
0116f5dc 295 elsif (/^\s*\@backwards\s+(\S*)/) {
296 push @{ $self->{backwardsTable} }, $1;
a7fbee98 297 }
0116f5dc 298 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
299 push @{ $self->{forwardsTable} }, $1;
a7fbee98 300 }
0116f5dc 301 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
302 push @{ $self->{rearrangeTable} }, _getHexArray($1);
a7fbee98 303 }
304 next;
305 }
306 $self->parseEntry($_);
45394607 307 }
a7fbee98 308 close $fk;
45394607 309}
310
905aa9f0 311
45394607 312##
313## get $line, parse it, and write an entry in $self
314##
315sub parseEntry
316{
a7fbee98 317 my $self = shift;
318 my $line = shift;
4d36a948 319 my($name, $entry, @uv, @key);
a7fbee98 320
321 return if $line !~ /^\s*[0-9A-Fa-f]/;
322
323 # removes comment and gets name
324 $name = $1
325 if $line =~ s/[#%]\s*(.*)//;
326 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
327
328 # gets element
329 my($e, $k) = split /;/, $line;
330 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
331 if ! $k;
332
4d36a948 333 @uv = _getHexArray($e);
334 return if !@uv;
335
336 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 337
4d36a948 338 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 339 my $ele = pack_U(@uv);
a7fbee98 340
4d36a948 341 # regarded as if it were not entried in the table
342 return
343 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 344
4d36a948 345 # replaced as completely ignorable
346 $k = '[.0000.0000.0000.0000]'
347 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 348 }
0116f5dc 349
4d36a948 350 # replaced as completely ignorable
351 $k = '[.0000.0000.0000.0000]'
352 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
353
4c843366 354 my $is_L3_ignorable = TRUE;
4d36a948 355
caffd4cf 356 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
357 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948 358 my @wt = _getHexArray($arr);
359 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 360 $is_L3_ignorable = FALSE
361 if $wt[0] + $wt[1] + $wt[2] != 0;
4d36a948 362 # if $arr !~ /[1-9A-Fa-f]/; NG
363 # Conformance Test shows L3-ignorable is completely ignorable.
4c843366 364 # For expansion, an entry $is_L3_ignorable
365 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 366 }
caffd4cf 367
4d36a948 368 $self->{entries}{$entry} = \@key;
caffd4cf 369
4d36a948 370 $self->{L3_ignorable}{$uv[0]} = TRUE
371 if @uv == 1 && $is_L3_ignorable;
caffd4cf 372
4d36a948 373 # Contraction is to be considered in the range of this maxlength.
374 $self->{maxlength}{$uv[0]} = scalar @uv
375 if @uv > 1;
45394607 376}
377
9f1f04a1 378
45394607 379##
9f1f04a1 380## arrayref[weights] = altCE(VCE)
45394607 381##
d16e9e3d 382sub altCE
45394607 383{
a7fbee98 384 my $self = shift;
4d36a948 385 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
a7fbee98 386
387 $self->{alternate} eq 'blanked' ?
9f1f04a1 388 $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
a7fbee98 389 $self->{alternate} eq 'non-ignorable' ?
4d36a948 390 \@wt :
a7fbee98 391 $self->{alternate} eq 'shifted' ?
9f1f04a1 392 $var ? [Var1Wt, 0, 0, $wt[0] ]
393 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
a7fbee98 394 $self->{alternate} eq 'shift-trimmed' ?
9f1f04a1 395 $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
a7fbee98 396 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607 397}
398
45394607 399sub viewSortKey
400{
a7fbee98 401 my $self = shift;
9f1f04a1 402 $self->visualizeSortKey($self->getSortKey(@_));
403}
0116f5dc 404
9f1f04a1 405sub visualizeSortKey
406{
407 my $self = shift;
408 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 409
9f1f04a1 410 if ($self->{UCA_Version} <= 8) {
0116f5dc 411 $view =~ s/ ?0000 ?/|/g;
412 } else {
413 $view =~ s/\b0000\b/|/g;
414 }
a7fbee98 415 return "[$view]";
45394607 416}
417
d16e9e3d 418
45394607 419##
4d36a948 420## arrayref of JCPS = splitCE(string to be collated)
421## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
45394607 422##
d16e9e3d 423sub splitCE
45394607 424{
a7fbee98 425 my $self = shift;
4d36a948 426 my $wLen = $_[1];
427
a7fbee98 428 my $code = $self->{preprocess};
0116f5dc 429 my $norm = $self->{normCode};
a7fbee98 430 my $ent = $self->{entries};
431 my $max = $self->{maxlength};
432 my $reH = $self->{rearrangeHash};
4d36a948 433 my $ign = $self->{L3_ignorable};
0116f5dc 434 my $ver9 = $self->{UCA_Version} > 8;
a7fbee98 435
4d36a948 436 my ($str, @buf);
a7fbee98 437
4d36a948 438 if ($wLen) {
439 $code and croak "Preprocess breaks character positions. "
440 . "Don't use with index(), match(), etc.";
441 $norm and croak "Normalization breaks character positions. "
442 . "Don't use with index(), match(), etc.";
443 $str = $_[0];
444 }
445 else {
446 $str = $_[0];
447 $str = &$code($str) if ref $code;
448 $str = &$norm($str) if ref $norm;
449 }
a7fbee98 450
4d36a948 451 # get array of Unicode code point of string.
9f1f04a1 452 my @src = unpack_U($str);
4d36a948 453
454 # rearrangement:
455 # Character positions are not kept if rearranged,
456 # then neglected if $wLen is true.
457 if ($reH && ! $wLen) {
a7fbee98 458 for (my $i = 0; $i < @src; $i++) {
459 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
460 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
461 $i++;
462 }
463 }
45394607 464 }
45394607 465
0116f5dc 466 if ($ver9) {
4d36a948 467 # To remove a character marked as a completely ignorable.
468 for (my $i = 0; $i < @src; $i++) {
469 $src[$i] = undef if $ign->{ $src[$i] };
470 }
0116f5dc 471 }
472
a7fbee98 473 for (my $i = 0; $i < @src; $i++) {
4d36a948 474 next if _isNonCharacter($src[$i]);
475
476 my $i_orig = $i;
477 my $ce = $src[$i];
478
479 if ($max->{$ce}) { # contract
480 my $temp_ce = $ce;
1d2654e1 481 my $ceLen = 1;
482 my $maxLen = $max->{$ce};
4d36a948 483
1d2654e1 484 for (my $p = $i + 1; $ceLen < $maxLen && $p < @src; $p++) {
4d36a948 485 next if ! defined $src[$p];
486 $temp_ce .= CODE_SEP . $src[$p];
1d2654e1 487 $ceLen++;
4d36a948 488 if ($ent->{$temp_ce}) {
489 $ce = $temp_ce;
490 $i = $p;
491 }
492 }
493 }
494
495 # with Combining Char (UTS#10, 4.2.1).
496 # requires Unicode::Normalize.
497 # Not be $wLen, as not croaked due to $norm.
498 if ($getCombinClass) {
499 for (my $p = $i + 1; $p < @src; $p++) {
500 next if ! defined $src[$p];
501 last unless $getCombinClass->($src[$p]);
502 my $tail = CODE_SEP . $src[$p];
503 if ($ent->{$ce.$tail}) {
504 $ce .= $tail;
505 $src[$p] = undef;
506 }
a7fbee98 507 }
a7fbee98 508 }
509
4d36a948 510 if ($wLen) {
511 for (my $p = $i + 1; $p < @src; $p++) {
512 last if defined $src[$p];
513 $i = $p;
a7fbee98 514 }
515 }
4d36a948 516
517 push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
45394607 518 }
4d36a948 519 return \@buf;
d16e9e3d 520}
45394607 521
d16e9e3d 522
523##
4d36a948 524## list of arrayrefs of weights = getWt(JCPS)
d16e9e3d 525##
526sub getWt
527{
a7fbee98 528 my $self = shift;
4d36a948 529 my $ce = shift;
a7fbee98 530 my $ent = $self->{entries};
0116f5dc 531 my $der = $self->{derivCode};
a7fbee98 532
4d36a948 533 return if !defined $ce;
534 return map($self->altCE($_), @{ $ent->{$ce} })
535 if $ent->{$ce};
0116f5dc 536
4d36a948 537 # CE must not be a contraction, then it's a code point.
538 my $u = $ce;
a7fbee98 539
4d36a948 540 if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
1d2654e1 541 my $hang = $self->{overrideHangul};
542 my @hangulCE;
543 if ($hang) {
544 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
545 }
546 elsif (!defined $hang) {
547 @hangulCE = $der->($u);
548 }
549 else {
550 my $max = $self->{maxlength};
551 my @decH = _decompHangul($u);
552
553 if (@decH == 2) {
554 my $contract = join(CODE_SEP, @decH);
555 @decH = ($contract) if $ent->{$contract};
556 } else { # must be <@decH == 3>
557 if ($max->{$decH[0]}) {
558 my $contract = join(CODE_SEP, @decH);
559 if ($ent->{$contract}) {
560 @decH = ($contract);
561 } else {
562 $contract = join(CODE_SEP, @decH[0,1]);
563 $ent->{$contract} and @decH = ($contract, $decH[2]);
564 }
565 # even if V's ignorable, LT contraction is not supported.
566 # If such a situatution were required, NFD should be used.
567 }
568 if (@decH == 3 && $max->{$decH[1]}) {
569 my $contract = join(CODE_SEP, @decH[1,2]);
570 $ent->{$contract} and @decH = ($decH[0], $contract);
571 }
572 }
573
574 @hangulCE = map({
575 $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
576 } @decH);
577 }
578 return map $self->altCE($_), @hangulCE;
a7fbee98 579 }
580 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
581 0x4E00 <= $u && $u <= 0x9FA5 ||
1d2654e1 582 0x20000 <= $u && $u <= 0x2A6D6) # CJK Ideograph
583 {
584 my $cjk = $self->{overrideCJK};
0116f5dc 585 return map $self->altCE($_),
586 $cjk
4d36a948 587 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
0116f5dc 588 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
9f1f04a1 589 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
0116f5dc 590 : $der->($u);
a7fbee98 591 }
592 else {
0116f5dc 593 return map $self->altCE($_), $der->($u);
a7fbee98 594 }
d16e9e3d 595}
596
d16e9e3d 597
598##
599## string sortkey = getSortKey(string arg)
600##
601sub getSortKey
602{
a7fbee98 603 my $self = shift;
604 my $lev = $self->{level};
4d36a948 605 my $rCE = $self->splitCE(shift); # get an arrayref of JCPS
0116f5dc 606 my $ver9 = $self->{UCA_Version} > 8;
4d36a948 607 my $v2i = $self->{alternate} ne 'non-ignorable';
a7fbee98 608
609 # weight arrays
0116f5dc 610 my (@buf, $last_is_variable);
611
4d36a948 612 foreach my $wt (map $self->getWt($_), @$rCE) {
613 if ($v2i && $ver9) {
614 if ($wt->[0] == 0) { # ignorable
615 next if $last_is_variable;
0116f5dc 616 } else {
9f1f04a1 617 $last_is_variable = ($wt->[0] == Var1Wt);
0116f5dc 618 }
619 }
4d36a948 620 push @buf, $wt;
0116f5dc 621 }
a7fbee98 622
623 # make sort key
624 my @ret = ([],[],[],[]);
625 foreach my $v (0..$lev-1) {
626 foreach my $b (@buf) {
4d36a948 627 push @{ $ret[$v] }, $b->[$v]
628 if 0 < $b->[$v];
a7fbee98 629 }
630 }
45394607 631
a7fbee98 632 # modification of tertiary weights
633 if ($self->{upper_before_lower}) {
634 foreach (@{ $ret[2] }) {
635 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
636 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
637 elsif ($_ == 0x1C) { $_ += 1 } # square upper
638 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
639 }
45394607 640 }
a7fbee98 641 if ($self->{katakana_before_hiragana}) {
642 foreach (@{ $ret[2] }) {
643 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
644 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
645 }
45394607 646 }
9f1f04a1 647
648 if ($self->{backwardsFlag}) {
649 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
650 if ($self->{backwardsFlag} & (1 << $v)) {
651 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
652 }
653 }
654 }
655
4d36a948 656 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 657}
658
659
660##
d16e9e3d 661## int compare = cmp(string a, string b)
45394607 662##
5398038e 663sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
664sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
665sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
666sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
667sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
668sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
669sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 670
671##
d16e9e3d 672## list[strings] sorted = sort(list[strings] arg)
45394607 673##
a7fbee98 674sub sort {
675 my $obj = shift;
676 return
677 map { $_->[1] }
678 sort{ $a->[0] cmp $b->[0] }
679 map [ $obj->getSortKey($_), $_ ], @_;
45394607 680}
681
0116f5dc 682
4d36a948 683sub _derivCE_9 {
0116f5dc 684 my $u = shift;
685 my $base =
4d36a948 686 (0x4E00 <= $u && $u <= 0x9FA5)
687 ? 0xFB40 : # CJK
0116f5dc 688 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
4d36a948 689 ? 0xFB80 # CJK ext.
690 : 0xFBC0; # others
0116f5dc 691
692 my $aaaa = $base + ($u >> 15);
693 my $bbbb = ($u & 0x7FFF) | 0x8000;
694 return
9f1f04a1 695 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
696 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc 697}
698
4d36a948 699sub _derivCE_8 {
0116f5dc 700 my $code = shift;
701 my $aaaa = 0xFF80 + ($code >> 15);
702 my $bbbb = ($code & 0x7FFF) | 0x8000;
703 return
4d36a948 704 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
705 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607 706}
707
708##
709## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
710##
a7fbee98 711sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 712
a7fbee98 713#
4d36a948 714# $code *must* be in Hangul syllable.
a7fbee98 715# Check it before you enter here.
716#
5398038e 717sub _decompHangul {
718 my $code = shift;
5398038e 719 my $SIndex = $code - 0xAC00;
720 my $LIndex = int( $SIndex / 588);
721 my $VIndex = int(($SIndex % 588) / 28);
722 my $TIndex = $SIndex % 28;
723 return (
a7fbee98 724 0x1100 + $LIndex,
725 0x1161 + $VIndex,
726 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 727 );
45394607 728}
729
4d36a948 730sub _isNonCharacter {
731 my $code = shift;
732 return ! defined $code # removed
733 || ($code < 0 || 0x10FFFF < $code) # out of range
734 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
735 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
736 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
737 ;
738}
739
740
741##
742## bool _nonIgnorAtLevel(arrayref weights, int level)
743##
744sub _nonIgnorAtLevel($$)
745{
746 my $wt = shift;
747 return if ! defined $wt;
748 my $lv = shift;
9f1f04a1 749 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948 750}
751
752##
753## bool _eqArray(
754## arrayref of arrayref[weights] source,
755## arrayref of arrayref[weights] substr,
756## int level)
757## * comparison of graphemes vs graphemes.
758## @$source >= @$substr must be true (check it before call this);
759##
760sub _eqArray($$$)
761{
762 my $source = shift;
763 my $substr = shift;
764 my $lev = shift;
765
766 for my $g (0..@$substr-1){
767 # Do the $g'th graphemes have the same number of AV weigths?
768 return if @{ $source->[$g] } != @{ $substr->[$g] };
769
770 for my $w (0..@{ $substr->[$g] }-1) {
771 for my $v (0..$lev-1) {
772 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
773 }
774 }
775 }
776 return 1;
777}
778
779##
780## (int position, int length)
781## int position = index(string, substring, position, [undoc'ed grobal])
782##
783## With "grobal" (only for the list context),
784## returns list of arrayref[position, length].
785##
786sub index
787{
788 my $self = shift;
789 my $str = shift;
790 my $len = length($str);
791 my $subCE = $self->splitCE(shift);
792 my $pos = @_ ? shift : 0;
793 $pos = 0 if $pos < 0;
794 my $grob = shift;
795
4d36a948 796 my $lev = $self->{level};
797 my $ver9 = $self->{UCA_Version} > 8;
798 my $v2i = $self->{alternate} ne 'non-ignorable';
799
800 if (! @$subCE) {
801 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
802 return $grob
803 ? map([$_, 0], $temp..$len)
804 : wantarray ? ($temp,0) : $temp;
805 }
806 if ($len < $pos) {
807 return wantarray ? () : NOMATCHPOS;
808 }
809 my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
810 if (! @$strCE) {
811 return wantarray ? () : NOMATCHPOS;
812 }
813 my $last_is_variable;
814 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
815
816 $last_is_variable = FALSE;
817 for my $wt (map $self->getWt($_), @$subCE) {
818 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
819
820 if ($v2i && $ver9) {
821 if ($wt->[0] == 0) {
822 $to_be_pushed = FALSE if $last_is_variable;
823 } else {
9f1f04a1 824 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 825 }
826 }
827
828 if (@subWt && $wt->[0] == 0) {
829 push @{ $subWt[-1] }, $wt if $to_be_pushed;
830 } else {
9f1f04a1 831 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 832 push @subWt, [ $wt ];
833 }
834 }
835
836 my $count = 0;
837 my $end = @$strCE - 1;
838
839 $last_is_variable = FALSE;
840
841 for (my $i = 0; $i <= $end; ) { # no $i++
842 my $found_base = 0;
843
844 # fetch a grapheme
845 while ($i <= $end && $found_base == 0) {
846 for my $wt ($self->getWt($strCE->[$i][0])) {
847 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
848
849 if ($v2i && $ver9) {
850 if ($wt->[0] == 0) {
851 $to_be_pushed = FALSE if $last_is_variable;
852 } else {
9f1f04a1 853 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 854 }
855 }
856
857 if (@strWt && $wt->[0] == 0) {
858 push @{ $strWt[-1] }, $wt if $to_be_pushed;
859 $finPos[-1] = $strCE->[$i][2];
860 } elsif ($to_be_pushed) {
9f1f04a1 861 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 862 push @strWt, [ $wt ];
863 push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
864 $finPos[-1] = NOMATCHPOS if $found_base;
865 push @finPos, $strCE->[$i][2];
866 $found_base++;
867 }
868 # else ===> no-op
869 }
870 $i++;
871 }
872
873 # try to match
874 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
875 if ($iniPos[0] != NOMATCHPOS &&
876 $finPos[$#subWt] != NOMATCHPOS &&
877 _eqArray(\@strWt, \@subWt, $lev)) {
878 my $temp = $iniPos[0] + $pos;
879
880 if ($grob) {
881 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
882 splice @strWt, 0, $#subWt;
883 splice @iniPos, 0, $#subWt;
884 splice @finPos, 0, $#subWt;
885 }
886 else {
887 return wantarray
888 ? ($temp, $finPos[$#subWt] - $iniPos[0])
889 : $temp;
890 }
891 }
892 shift @strWt;
893 shift @iniPos;
894 shift @finPos;
895 }
896 }
897
898 return $grob
899 ? @g_ret
900 : wantarray ? () : NOMATCHPOS;
901}
902
903##
904## scalarref to matching part = match(string, substring)
905##
906sub match
907{
908 my $self = shift;
909 if (my($pos,$len) = $self->index($_[0], $_[1])) {
910 my $temp = substr($_[0], $pos, $len);
911 return wantarray ? $temp : \$temp;
912 # An lvalue ref \substr should be avoided,
913 # since its value is affected by modification of its referent.
914 }
915 else {
916 return;
917 }
918}
919
920##
921## arrayref matching parts = gmatch(string, substring)
922##
923sub gmatch
924{
925 my $self = shift;
926 my $str = shift;
927 my $sub = shift;
928 return map substr($str, $_->[0], $_->[1]),
929 $self->index($str, $sub, 0, 'g');
930}
931
932##
933## bool subst'ed = subst(string, substring, replace)
934##
935sub subst
936{
937 my $self = shift;
938 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
939
940 if (my($pos,$len) = $self->index($_[0], $_[1])) {
941 if ($code) {
942 my $mat = substr($_[0], $pos, $len);
943 substr($_[0], $pos, $len, $code->($mat));
944 } else {
945 substr($_[0], $pos, $len, $_[2]);
946 }
947 return TRUE;
948 }
949 else {
950 return FALSE;
951 }
952}
953
954##
955## int count = gsubst(string, substring, replace)
956##
957sub gsubst
958{
959 my $self = shift;
960 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
961 my $cnt = 0;
962
963 # Replacement is carried out from the end, then use reverse.
964 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
965 if ($code) {
966 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
967 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
968 } else {
969 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
970 }
971 $cnt++;
972 }
973 return $cnt;
974}
975
45394607 9761;
977__END__
978
979=head1 NAME
980
a7fbee98 981Unicode::Collate - Unicode Collation Algorithm
45394607 982
983=head1 SYNOPSIS
984
985 use Unicode::Collate;
986
987 #construct
5398038e 988 $Collator = Unicode::Collate->new(%tailoring);
45394607 989
990 #sort
5398038e 991 @sorted = $Collator->sort(@not_sorted);
45394607 992
993 #compare
a7fbee98 994 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 995
996=head1 DESCRIPTION
997
4d36a948 998This module is an implementation
999of Unicode Technical Standard #10 (UTS #10)
1000"Unicode Collation Algorithm."
1001
45394607 1002=head2 Constructor and Tailoring
1003
d16e9e3d 1004The C<new> method returns a collator object.
1005
5398038e 1006 $Collator = Unicode::Collate->new(
0116f5dc 1007 UCA_Version => $UCA_Version,
45394607 1008 alternate => $alternate,
1009 backwards => $levelNumber, # or \@levelNumbers
1010 entry => $element,
1011 normalization => $normalization_form,
1012 ignoreName => qr/$ignoreName/,
1013 ignoreChar => qr/$ignoreChar/,
1014 katakana_before_hiragana => $bool,
1015 level => $collationLevel,
1016 overrideCJK => \&overrideCJK,
1017 overrideHangul => \&overrideHangul,
1018 preprocess => \&preprocess,
1019 rearrange => \@charList,
1020 table => $filename,
1021 undefName => qr/$undefName/,
1022 undefChar => qr/$undefChar/,
1023 upper_before_lower => $bool,
1024 );
a7fbee98 1025 # if %tailoring is false (i.e. empty),
5398038e 1026 # $Collator should do the default collation.
45394607 1027
1028=over 4
1029
0116f5dc 1030=item UCA_Version
1031
1032If the version number of the older UCA is given,
1033the older behavior of that version is emulated on collating.
1034If omitted, the return value of C<UCA_Version()> is used.
1035
1036The supported version: 8 or 9.
1037
1038B<This parameter may be removed in the future version,
1039as switching the algorithm would affect the performance.>
1040
45394607 1041=item alternate
1042
4d36a948 1043-- see 3.2.2 Variable Weighting, UTS #10.
caffd4cf 1044
1045(the title in UCA version 8: Alternate Weighting)
45394607 1046
a7fbee98 1047This key allows to alternate weighting for variable collation elements,
1048which are marked with an ASTERISK in the table
1049(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1050
1051 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 1052
a7fbee98 1053These names are case-insensitive.
45394607 1054By default (if specification is omitted), 'shifted' is adopted.
1055
4d36a948 1056 'Blanked' Variable elements are made ignorable at levels 1 through 3;
a7fbee98 1057 considered at the 4th level.
1058
1059 'Non-ignorable' Variable elements are not reset to ignorable.
1060
4d36a948 1061 'Shifted' Variable elements are made ignorable at levels 1 through 3
a7fbee98 1062 their level 4 weight is replaced by the old level 1 weight.
1063 Level 4 weight for Non-Variable elements is 0xFFFF.
1064
1065 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1066 are trimmed.
1067
45394607 1068=item backwards
1069
4d36a948 1070-- see 3.1.2 French Accents, UTS #10.
45394607 1071
1072 backwards => $levelNumber or \@levelNumbers
1073
1074Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1075If omitted, forwards at all the levels.
1076
1077=item entry
1078
4d36a948 1079-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1080
a7fbee98 1081Overrides a default order or defines additional collation elements
45394607 1082
1083 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 108400E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 10850063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
10860043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1087ENTRIES
1088
4d36a948 1089B<NOTE:> The code point in the UCA file format (before C<';'>)
1090B<must> be a Unicode code point, but not a native code point.
1091So C<0063> must always denote C<U+0063>,
1092but not a character of C<"\x63">.
1093
45394607 1094=item ignoreName
1095
1096=item ignoreChar
1097
4d36a948 1098-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
45394607 1099
caffd4cf 1100Makes the entry in the table completely ignorable;
1101i.e. as if the weights were zero at all level.
45394607 1102
a7fbee98 1103E.g. when 'a' and 'e' are ignorable,
45394607 1104'element' is equal to 'lament' (or 'lmnt').
1105
1106=item level
1107
4d36a948 1108-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1109
1110Set the maximum level.
1111Any higher levels than the specified one are ignored.
1112
1113 Level 1: alphabetic ordering
1114 Level 2: diacritic ordering
1115 Level 3: case ordering
1116 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1117
1118 ex.level => 2,
1119
a7fbee98 1120If omitted, the maximum is the 4th.
1121
45394607 1122=item normalization
1123
4d36a948 1124-- see 4.1 Normalize each input string, UTS #10.
45394607 1125
905aa9f0 1126If specified, strings are normalized before preparation of sort keys
45394607 1127(the normalization is executed after preprocess).
1128
1d2654e1 1129A form name C<Unicode::Normalize::normalize()> accepts will be applied
1130as C<$normalization_form>.
1131See C<Unicode::Normalize::normalize()> for detail.
1132If omitted, C<'NFD'> is used.
45394607 1133
1d2654e1 1134L<normalization> is performed after L<preprocess> (if defined).
45394607 1135
a7fbee98 1136If C<undef> is passed explicitly as the value for this key,
45394607 1137any normalization is not carried out (this may make tailoring easier
1138if any normalization is not desired).
1139
1140see B<CAVEAT>.
1141
1142=item overrideCJK
1143
4d36a948 1144-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1145
1146By default, mapping of CJK Unified Ideographs
a7fbee98 1147uses the Unicode codepoint order.
1148But the mapping of CJK Unified Ideographs may be overrided.
45394607 1149
a7fbee98 1150ex. CJK Unified Ideographs in the JIS code point order.
45394607 1151
1152 overrideCJK => sub {
a7fbee98 1153 my $u = shift; # get a Unicode codepoint
1154 my $b = pack('n', $u); # to UTF-16BE
1155 my $s = your_unicode_to_sjis_converter($b); # convert
1156 my $n = unpack('n', $s); # convert sjis to short
1157 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1158 },
1159
a7fbee98 1160ex. ignores all CJK Unified Ideographs.
1161
1162 overrideCJK => sub {()}, # CODEREF returning empty list
1163
1164 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1165 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1166
1167If C<undef> is passed explicitly as the value for this key,
1168weights for CJK Unified Ideographs are treated as undefined.
1169But assignment of weight for CJK Unified Ideographs
1170in table or L<entry> is still valid.
1171
1172=item overrideHangul
1173
4d36a948 1174-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1175
1176By default, Hangul Syllables are decomposed into Hangul Jamo.
1177But the mapping of Hangul Syllables may be overrided.
1178
1179This tag works like L<overrideCJK>, so see there for examples.
1180
45394607 1181If you want to override the mapping of Hangul Syllables,
1182the Normalization Forms D and KD are not appropriate
1183(they will be decomposed before overriding).
1184
a7fbee98 1185If C<undef> is passed explicitly as the value for this key,
1186weight for Hangul Syllables is treated as undefined
1187without decomposition into Hangul Jamo.
1188But definition of weight for Hangul Syllables
1189in table or L<entry> is still valid.
1190
45394607 1191=item preprocess
1192
4d36a948 1193-- see 5.1 Preprocessing, UTS #10.
45394607 1194
1195If specified, the coderef is used to preprocess
1196before the formation of sort keys.
1197
a7fbee98 1198ex. dropping English articles, such as "a" or "the".
45394607 1199Then, "the pen" is before "a pencil".
1200
1201 preprocess => sub {
1202 my $str = shift;
a7fbee98 1203 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1204 return $str;
45394607 1205 },
1206
1d2654e1 1207L<preprocess> is performed before L<normalization> (if defined).
1208
45394607 1209=item rearrange
1210
4d36a948 1211-- see 3.1.3 Rearrangement, UTS #10.
45394607 1212
1213Characters that are not coded in logical order and to be rearranged.
a7fbee98 1214By default,
45394607 1215
1216 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1217
a7fbee98 1218If you want to disallow any rearrangement,
1219pass C<undef> or C<[]> (a reference to an empty list)
1220as the value for this key.
1221
0116f5dc 1222B<According to the version 9 of UCA, this parameter shall not be used;
1223but it is not warned at present.>
1224
45394607 1225=item table
1226
4d36a948 1227-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1228
1229You can use another element table if desired.
ae6aa562 1230The table file must be put into a directory
1231where F<Unicode/Collate.pm> is installed.
1232E.g. in F<perl/lib/Unicode/Collate> directory
1233when you have F<perl/lib/Unicode/Collate.pm>.
45394607 1234
ae6aa562 1235By default, the filename F<"allkeys.txt"> is used.
45394607 1236
a7fbee98 1237If C<undef> is passed explicitly as the value for this key,
1238no file is read (but you can define collation elements via L<entry>).
1239
1240A typical way to define a collation element table
1241without any file of table:
1242
1243 $onlyABC = Unicode::Collate->new(
1244 table => undef,
1245 entry => << 'ENTRIES',
12460061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
12470041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
12480062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
12490042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
12500063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
12510043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1252ENTRIES
1253 );
905aa9f0 1254
45394607 1255=item undefName
1256
1257=item undefChar
1258
4d36a948 1259-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1260
1261Undefines the collation element as if it were unassigned in the table.
1262This reduces the size of the table.
1263If an unassigned character appears in the string to be collated,
1264the sort key is made from its codepoint
1265as a single-character collation element,
1266as it is greater than any other assigned collation elements
1267(in the codepoint order among the unassigned characters).
1268But, it'd be better to ignore characters
1269unfamiliar to you and maybe never used.
1270
1271=item katakana_before_hiragana
1272
1273=item upper_before_lower
1274
4d36a948 1275-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
45394607 1276
1277By default, lowercase is before uppercase
1278and hiragana is before katakana.
1279
a7fbee98 1280If the tag is made true, this is reversed.
1281
1282B<NOTE>: These tags simplemindedly assume
1283any lowercase/uppercase or hiragana/katakana distinctions
9f1f04a1 1284must occur in level 3, and their weights at level 3
1285must be same as those mentioned in 7.3.1, UTS #10.
1286If you define your collation elements which violate this requirement,
4d36a948 1287these tags don't work validly.
45394607 1288
1289=back
1290
3164dd77 1291=head2 Methods for Collation
45394607 1292
1293=over 4
1294
5398038e 1295=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1296
1297Sorts a list of strings.
1298
5398038e 1299=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1300
1301Returns 1 (when C<$a> is greater than C<$b>)
1302or 0 (when C<$a> is equal to C<$b>)
1303or -1 (when C<$a> is lesser than C<$b>).
1304
5398038e 1305=item C<$result = $Collator-E<gt>eq($a, $b)>
1306
1307=item C<$result = $Collator-E<gt>ne($a, $b)>
1308
1309=item C<$result = $Collator-E<gt>lt($a, $b)>
1310
1311=item C<$result = $Collator-E<gt>le($a, $b)>
1312
1313=item C<$result = $Collator-E<gt>gt($a, $b)>
1314
1315=item C<$result = $Collator-E<gt>ge($a, $b)>
1316
a7fbee98 1317They works like the same name operators as theirs.
5398038e 1318
1319 eq : whether $a is equal to $b.
1320 ne : whether $a is not equal to $b.
1321 lt : whether $a is lesser than $b.
1322 le : whether $a is lesser than $b or equal to $b.
1323 gt : whether $a is greater than $b.
1324 ge : whether $a is greater than $b or equal to $b.
1325
1326=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1327
4d36a948 1328-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1329
1330Returns a sort key.
1331
1332You compare the sort keys using a binary comparison
1333and get the result of the comparison of the strings using UCA.
1334
5398038e 1335 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1336
1337 is equivalent to
1338
5398038e 1339 $Collator->cmp($a, $b)
45394607 1340
a7fbee98 1341=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1342
a7fbee98 1343 use Unicode::Collate;
1344 my $c = Unicode::Collate->new();
1345 print $c->viewSortKey("Perl"),"\n";
1346
0116f5dc 1347 # output:
1348 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1349 # Level 1 Level 2 Level 3 Level 4
1350
1351 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1352
4d36a948 1353=back
1354
1355=head2 Methods for Searching
d16e9e3d 1356
4d36a948 1357B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1358for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1359C<subst>, C<gsubst>) is croaked,
1360as the position and the length might differ
1361from those on the specified string.
1362(And the C<rearrange> tag is neglected.)
d16e9e3d 1363
4d36a948 1364The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1365like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1366but they are not aware of any pattern, but only a literal substring.
1367
1368=over 4
1369
1370=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1371
1372=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1373
1374If C<$substring> matches a part of C<$string>, returns
1375the position of the first occurrence of the matching part in scalar context;
1376in list context, returns a two-element list of
1377the position and the length of the matching part.
1378
d16e9e3d 1379If C<$substring> does not match any part of C<$string>,
1380returns C<-1> in scalar context and
1381an empty list in list context.
1382
1383e.g. you say
1384
5398038e 1385 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1386 # (normalization => undef) is REQUIRED.
1387 my $str = "Ich muß studieren Perl.";
1388 my $sub = "MÜSS";
d16e9e3d 1389 my $match;
a7fbee98 1390 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1391 $match = substr($str, $pos, $len);
d16e9e3d 1392 }
1393
4d36a948 1394and get C<"muß"> in C<$match> since C<"muß">
1395is primary equal to C<"MÜSS">.
1396
1397=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1398
1399=item C<($match) = $Collator-E<gt>match($string, $substring)>
1400
1401If C<$substring> matches a part of C<$string>, in scalar context, returns
1402B<a reference to> the first occurrence of the matching part
1403(C<$match_ref> is always true if matches,
1404since every reference is B<true>);
1405in list context, returns the first occurrence of the matching part.
1406
1407If C<$substring> does not match any part of C<$string>,
1408returns C<undef> in scalar context and
1409an empty list in list context.
1410
1411e.g.
1412
1413 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1414 print "matches [$$match_ref].\n";
1415 } else {
1416 print "doesn't match.\n";
1417 }
1418
1419 or
1420
1421 if (($match) = $Collator->match($str, $sub)) { # list context
1422 print "matches [$match].\n";
1423 } else {
1424 print "doesn't match.\n";
1425 }
1426
1427=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1428
1429If C<$substring> matches a part of C<$string>, returns
1430all the matching parts (or matching count in scalar context).
1431
1432If C<$substring> does not match any part of C<$string>,
1433returns an empty list.
1434
1435=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1436
1437If C<$substring> matches a part of C<$string>,
1438the first occurrence of the matching part is replaced by C<$replacement>
1439(C<$string> is modified) and return C<$count> (always equals to C<1>).
1440
1441C<$replacement> can be a C<CODEREF>,
1442taking the matching part as an argument,
1443and returning a string to replace the matching part
1444(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1445
1446=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1447
1448If C<$substring> matches a part of C<$string>,
1449all the occurrences of the matching part is replaced by C<$replacement>
1450(C<$string> is modified) and return C<$count>.
1451
1452C<$replacement> can be a C<CODEREF>,
1453taking the matching part as an argument,
1454and returning a string to replace the matching part
1455(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1456
1457e.g.
1458
1459 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1460 # (normalization => undef) is REQUIRED.
1461 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1462 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1463
1464 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1465 # i.e., all the camels are made bold-faced.
d16e9e3d 1466
45394607 1467=back
1468
3164dd77 1469=head2 Other Methods
1470
1471=over 4
1472
0116f5dc 1473=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1474
1475Change the value of specified keys and returns the changed part.
1476
1477 $Collator = Unicode::Collate->new(level => 4);
1478
1479 $Collator->eq("perl", "PERL"); # false
1480
1481 %old = $Collator->change(level => 2); # returns (level => 4).
1482
1483 $Collator->eq("perl", "PERL"); # true
1484
1485 $Collator->change(%old); # returns (level => 2).
1486
1487 $Collator->eq("perl", "PERL"); # false
1488
1489Not all C<(key,value)>s are allowed to be changed.
1490See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1491
1492In the scalar context, returns the modified collator
1493(but it is B<not> a clone from the original).
1494
1495 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1496
1497 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1498
1499 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1500
3164dd77 1501=item UCA_Version
1502
4d36a948 1503Returns the version number of UTS #10 this module consults.
3164dd77 1504
1505=item Base_Unicode_Version
1506
1507Returns the version number of the Unicode Standard
1508this module is based on.
1509
1510=back
1511
45394607 1512=head2 EXPORT
1513
1514None by default.
1515
1516=head2 CAVEAT
1517
1518Use of the C<normalization> parameter requires
1519the B<Unicode::Normalize> module.
1520
5398038e 1521If you need not it (say, in the case when you need not
45394607 1522handle any combining characters),
1523assign C<normalization =E<gt> undef> explicitly.
1524
4d36a948 1525-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1526
0116f5dc 1527=head2 Conformance Test
1528
1529The Conformance Test for the UCA is provided
1530in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1531and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1532
1533For F<CollationTest_SHIFTED.txt>,
1534a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1535for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1536C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1537
4d36a948 1538B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1539
45394607 1540=head1 AUTHOR
1541
1d2654e1 1542SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
45394607 1543
1544 http://homepage1.nifty.com/nomenclator/perl/
1545
ae6aa562 1546 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1547
a7fbee98 1548 This library is free software; you can redistribute it
1549 and/or modify it under the same terms as Perl itself.
45394607 1550
1551=head1 SEE ALSO
1552
1553=over 4
1554
0116f5dc 1555=item http://www.unicode.org/reports/tr10/
45394607 1556
4d36a948 1557Unicode Collation Algorithm - UTS #10
45394607 1558
0116f5dc 1559=item http://www.unicode.org/reports/tr10/allkeys.txt
a7fbee98 1560
1561The Default Unicode Collation Element Table
45394607 1562
0116f5dc 1563=item http://www.unicode.org/reports/tr10/CollationTest.html
1564http://www.unicode.org/reports/tr10/CollationTest.zip
1565
1566The latest versions of the conformance test for the UCA
1567
1568=item http://www.unicode.org/reports/tr15/
a7fbee98 1569
1570Unicode Normalization Forms - UAX #15
1571
a7fbee98 1572=item L<Unicode::Normalize>
45394607 1573
45394607 1574=back
1575
1576=cut