D'oh! Forgot to lib/Digest/base.pm
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
45394607 15require Exporter;
16
91ae00cb 17our $VERSION = '0.30';
45394607 18our $PACKAGE = __PACKAGE__;
19
20our @ISA = qw(Exporter);
21
22our %EXPORT_TAGS = ();
23our @EXPORT_OK = ();
24our @EXPORT = ();
25
26(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27our $KeyFile = "allkeys.txt";
28
4d36a948 29# Perl's boolean
30use constant TRUE => 1;
31use constant FALSE => "";
32use constant NOMATCHPOS => -1;
33
34# A coderef to get combining class imported from Unicode::Normalize
35# (i.e. \&Unicode::Normalize::getCombinClass).
36# This is also used as a HAS_UNICODE_NORMALIZE flag.
06c8fc8f 37our $CVgetCombinClass;
4d36a948 38
9f1f04a1 39# Supported Levels
40use constant MinLevel => 1;
41use constant MaxLevel => 4;
42
4d36a948 43# Minimum weights at level 2 and 3, respectively
9f1f04a1 44use constant Min2Wt => 0x20;
45use constant Min3Wt => 0x02;
4d36a948 46
47# Shifted weight at 4th level
9f1f04a1 48use constant Shift4Wt => 0xFFFF;
4d36a948 49
50# Variable weight at 1st level.
51# This is a negative value but should be regarded as zero on collation.
52# This is for distinction of variable chars from level 3 ignorable chars.
9f1f04a1 53use constant Var1Wt => -1;
4d36a948 54
55
56# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
57# PROBLEM: The Default Unicode Collation Element Table
58# has weights over 0xFFFF at the 4th level.
59# The tie-breaking in the variable weights
60# other than "shift" (as well as "shift-trimmed") is unreliable.
61use constant VCE_TEMPLATE => 'Cn4';
62
4d36a948 63# A sort key: 16-bit weights
64# See also the PROBLEM on VCE_TEMPLATE above.
65use constant KEY_TEMPLATE => 'n*';
66
67# Level separator in a sort key:
68# i.e. pack(KEY_TEMPLATE, 0)
69use constant LEVEL_SEP => "\0\0";
70
71# As Unicode code point separator for hash keys.
72# A joined code point string (denoted by JCPS below)
73# like "65;768" is used for internal processing
74# instead of Perl's Unicode string like "\x41\x{300}",
75# as the native code point is different from the Unicode code point
76# on EBCDIC platform.
77# This character must not be included in any stringified
78# representation of an integer.
79use constant CODE_SEP => ';';
80
81# boolean values of variable weights
0116f5dc 82use constant NON_VAR => 0; # Non-Variable character
83use constant VAR => 1; # Variable character
3164dd77 84
91ae00cb 85# specific code points
86use constant Hangul_LBase => 0x1100;
87use constant Hangul_LIni => 0x1100;
88use constant Hangul_LFin => 0x1159;
89use constant Hangul_LFill => 0x115F;
90use constant Hangul_VBase => 0x1161;
91use constant Hangul_VIni => 0x1160;
92use constant Hangul_VFin => 0x11A2;
93use constant Hangul_TBase => 0x11A7;
94use constant Hangul_TIni => 0x11A8;
95use constant Hangul_TFin => 0x11F9;
96use constant Hangul_TCount => 28;
97use constant Hangul_NCount => 588;
98use constant Hangul_SBase => 0xAC00;
99use constant Hangul_SIni => 0xAC00;
100use constant Hangul_SFin => 0xD7A3;
101use constant CJK_UidIni => 0x4E00;
102use constant CJK_UidFin => 0x9FA5;
103use constant CJK_ExtAIni => 0x3400;
104use constant CJK_ExtAFin => 0x4DB5;
105use constant CJK_ExtBIni => 0x20000;
106use constant CJK_ExtBFin => 0x2A6D6;
107use constant BMP_Max => 0xFFFF;
108
4d36a948 109# Logical_Order_Exception in PropList.txt
110# TODO: synchronization with change of PropList.txt.
0116f5dc 111our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 112
91ae00cb 113sub UCA_Version { "11" }
a7fbee98 114
91ae00cb 115sub Base_Unicode_Version { "4.0" }
a7fbee98 116
9f1f04a1 117######
118
9f1f04a1 119sub pack_U {
ae6aa562 120 return pack('U*', @_);
9f1f04a1 121}
122
123sub unpack_U {
ae6aa562 124 return unpack('U*', pack('U*').shift);
9f1f04a1 125}
126
127######
128
91ae00cb 129my (%VariableOK);
130@VariableOK{ qw/
0116f5dc 131 blanked non-ignorable shifted shift-trimmed
91ae00cb 132 / } = (); # keys lowercased
0116f5dc 133
134our @ChangeOK = qw/
135 alternate backwards level normalization rearrange
136 katakana_before_hiragana upper_before_lower
137 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 138 hangul_terminator variable
0116f5dc 139 /;
140
141our @ChangeNG = qw/
91ae00cb 142 entry mapping table maxlength
143 ignoreChar ignoreName undefChar undefName variableTable
0116f5dc 144 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
4d36a948 145 derivCode normCode rearrangeHash L3_ignorable
9f1f04a1 146 backwardsFlag
0116f5dc 147 /;
9f1f04a1 148# The hash key 'ignored' is deleted at v 0.21.
149# The hash key 'isShift' is deleted at v 0.23.
150# The hash key 'combining' is deleted at v 0.24.
91ae00cb 151# The hash key 'entries' is deleted at v 0.30.
152
153sub version {
154 my $self = shift;
155 return $self->{versionTable} || 'unknown';
156}
0116f5dc 157
158my (%ChangeOK, %ChangeNG);
159@ChangeOK{ @ChangeOK } = ();
160@ChangeNG{ @ChangeNG } = ();
161
162sub change {
163 my $self = shift;
164 my %hash = @_;
165 my %old;
91ae00cb 166 if (exists $hash{variable} && exists $hash{alternate}) {
167 delete $hash{alternate};
168 }
169 elsif (!exists $hash{variable} && exists $hash{alternate}) {
170 $hash{variable} = $hash{alternate};
171 }
0116f5dc 172 foreach my $k (keys %hash) {
173 if (exists $ChangeOK{$k}) {
174 $old{$k} = $self->{$k};
175 $self->{$k} = $hash{$k};
176 }
177 elsif (exists $ChangeNG{$k}) {
178 croak "change of $k via change() is not allowed!";
179 }
180 # else => ignored
181 }
182 $self->checkCollator;
183 return wantarray ? %old : $self;
184}
a7fbee98 185
9f1f04a1 186sub _checkLevel {
187 my $level = shift;
188 my $key = shift;
189 croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
190 $level, $key, MinLevel if MinLevel > $level;
191 croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
192 $level, $key, MaxLevel if MaxLevel < $level;
193}
194
91ae00cb 195my %DerivCode = (
196 8 => \&_derivCE_8,
197 9 => \&_derivCE_9,
198 11 => \&_derivCE_9, # 11 == 9
199);
200
0116f5dc 201sub checkCollator {
202 my $self = shift;
9f1f04a1 203 _checkLevel($self->{level}, "level");
a7fbee98 204
91ae00cb 205 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
206 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 207
91ae00cb 208 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
209 $self->{alternateTable} || $self->{alternate} || 'shifted';
210 $self->{variable} = $self->{alternate} = lc($self->{variable});
211 exists $VariableOK{ $self->{variable} }
212 or croak "$PACKAGE unknown variable tag name: $self->{variable}";
0116f5dc 213
9f1f04a1 214 if (! defined $self->{backwards}) {
215 $self->{backwardsFlag} = 0;
216 }
217 elsif (! ref $self->{backwards}) {
218 _checkLevel($self->{backwards}, "backwards");
219 $self->{backwardsFlag} = 1 << $self->{backwards};
220 }
221 else {
222 my %level;
223 $self->{backwardsFlag} = 0;
224 for my $b (@{ $self->{backwards} }) {
225 _checkLevel($b, "backwards");
226 $level{$b} = 1;
227 }
228 for my $v (sort keys %level) {
229 $self->{backwardsFlag} += 1 << $v;
230 }
231 }
0116f5dc 232
91ae00cb 233 defined $self->{rearrange} or $self->{rearrange} = [];
234 ref $self->{rearrange}
235 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc 236
237 # keys of $self->{rearrangeHash} are $self->{rearrange}.
238 $self->{rearrangeHash} = undef;
239
240 if (@{ $self->{rearrange} }) {
241 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
242 }
243
244 $self->{normCode} = undef;
a7fbee98 245
246 if (defined $self->{normalization}) {
247 eval { require Unicode::Normalize };
91ae00cb 248 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 249
91ae00cb 250 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 251
91ae00cb 252 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
253 $self->{normCode} = \&Unicode::Normalize::NFD;
254 }
255 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f 256 my $norm = $self->{normalization};
257 $self->{normCode} = sub {
1d2654e1 258 Unicode::Normalize::normalize($norm, shift);
259 };
06c8fc8f 260 eval { $self->{normCode}->("") }; # try
261 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 262 }
a7fbee98 263 }
0116f5dc 264 return;
265}
266
267sub new
268{
269 my $class = shift;
270 my $self = bless { @_ }, $class;
45394607 271
a7fbee98 272 # If undef is passed explicitly, no file is read.
0116f5dc 273 $self->{table} = $KeyFile if ! exists $self->{table};
274 $self->read_table if defined $self->{table};
905aa9f0 275
a7fbee98 276 if ($self->{entry}) {
277 $self->parseEntry($_) foreach split /\n/, $self->{entry};
278 }
905aa9f0 279
9f1f04a1 280 $self->{level} ||= MaxLevel;
0116f5dc 281 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 282
0116f5dc 283 $self->{overrideHangul} = ''
284 if ! exists $self->{overrideHangul};
285 $self->{overrideCJK} = ''
286 if ! exists $self->{overrideCJK};
06c8fc8f 287 $self->{normalization} = 'NFD'
0116f5dc 288 if ! exists $self->{normalization};
0116f5dc 289 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 290 if ! exists $self->{rearrange};
0116f5dc 291 $self->{backwards} = $self->{backwardsTable}
292 if ! exists $self->{backwards};
a7fbee98 293
0116f5dc 294 $self->checkCollator;
a7fbee98 295
296 return $self;
297}
905aa9f0 298
299sub read_table {
a7fbee98 300 my $self = shift;
301 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
302
e69a2255 303 my $filepath = File::Spec->catfile($Path, $file);
304 open my $fk, "<$filepath"
305 or croak "File does not exist at $filepath";
a7fbee98 306
307 while (<$fk>) {
308 next if /^\s*#/;
309 if (/^\s*\@/) {
0116f5dc 310 if (/^\s*\@version\s*(\S*)/) {
311 $self->{versionTable} ||= $1;
312 }
91ae00cb 313 elsif (/^\s*\@variable\s+(\S*)/) { # since UTS #10-9
314 $self->{variableTable} ||= $1;
315 }
316 elsif (/^\s*\@alternate\s+(\S*)/) { # till UTS #10-8
0116f5dc 317 $self->{alternateTable} ||= $1;
a7fbee98 318 }
0116f5dc 319 elsif (/^\s*\@backwards\s+(\S*)/) {
320 push @{ $self->{backwardsTable} }, $1;
a7fbee98 321 }
0116f5dc 322 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
323 push @{ $self->{forwardsTable} }, $1;
a7fbee98 324 }
0116f5dc 325 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
326 push @{ $self->{rearrangeTable} }, _getHexArray($1);
a7fbee98 327 }
328 next;
329 }
330 $self->parseEntry($_);
45394607 331 }
a7fbee98 332 close $fk;
45394607 333}
334
905aa9f0 335
45394607 336##
337## get $line, parse it, and write an entry in $self
338##
339sub parseEntry
340{
a7fbee98 341 my $self = shift;
342 my $line = shift;
4d36a948 343 my($name, $entry, @uv, @key);
a7fbee98 344
345 return if $line !~ /^\s*[0-9A-Fa-f]/;
346
347 # removes comment and gets name
348 $name = $1
349 if $line =~ s/[#%]\s*(.*)//;
350 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
351
352 # gets element
353 my($e, $k) = split /;/, $line;
354 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
355 if ! $k;
356
4d36a948 357 @uv = _getHexArray($e);
358 return if !@uv;
359
360 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 361
4d36a948 362 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 363 my $ele = pack_U(@uv);
a7fbee98 364
4d36a948 365 # regarded as if it were not entried in the table
366 return
367 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 368
4d36a948 369 # replaced as completely ignorable
370 $k = '[.0000.0000.0000.0000]'
371 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 372 }
0116f5dc 373
4d36a948 374 # replaced as completely ignorable
375 $k = '[.0000.0000.0000.0000]'
376 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
377
4c843366 378 my $is_L3_ignorable = TRUE;
4d36a948 379
caffd4cf 380 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
381 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948 382 my @wt = _getHexArray($arr);
383 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 384 $is_L3_ignorable = FALSE
385 if $wt[0] + $wt[1] + $wt[2] != 0;
4d36a948 386 # if $arr !~ /[1-9A-Fa-f]/; NG
387 # Conformance Test shows L3-ignorable is completely ignorable.
4c843366 388 # For expansion, an entry $is_L3_ignorable
389 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 390 }
caffd4cf 391
91ae00cb 392 $self->{mapping}{$entry} = \@key;
caffd4cf 393
91ae00cb 394 if (@uv > 1) {
395 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
396 and $self->{maxlength}{$uv[0]} = @uv;
397 }
398 else {
399 $is_L3_ignorable
400 ? ($self->{L3_ignorable}{$uv[0]} = TRUE)
401 : ($self->{L3_ignorable}{$uv[0]} and
402 $self->{L3_ignorable}{$uv[0]} = FALSE); # &&= stores key.
403 }
45394607 404}
405
9f1f04a1 406
45394607 407##
91ae00cb 408## arrayref[weights] = varCE(VCE)
45394607 409##
91ae00cb 410sub varCE
45394607 411{
a7fbee98 412 my $self = shift;
4d36a948 413 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
a7fbee98 414
91ae00cb 415 $self->{variable} eq 'blanked' ?
9f1f04a1 416 $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
91ae00cb 417 $self->{variable} eq 'non-ignorable' ?
4d36a948 418 \@wt :
91ae00cb 419 $self->{variable} eq 'shifted' ?
9f1f04a1 420 $var ? [Var1Wt, 0, 0, $wt[0] ]
421 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
91ae00cb 422 $self->{variable} eq 'shift-trimmed' ?
9f1f04a1 423 $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
91ae00cb 424 croak "$PACKAGE unknown variable name: $self->{variable}";
45394607 425}
426
45394607 427sub viewSortKey
428{
a7fbee98 429 my $self = shift;
9f1f04a1 430 $self->visualizeSortKey($self->getSortKey(@_));
431}
0116f5dc 432
9f1f04a1 433sub visualizeSortKey
434{
435 my $self = shift;
436 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 437
9f1f04a1 438 if ($self->{UCA_Version} <= 8) {
0116f5dc 439 $view =~ s/ ?0000 ?/|/g;
440 } else {
441 $view =~ s/\b0000\b/|/g;
442 }
a7fbee98 443 return "[$view]";
45394607 444}
445
d16e9e3d 446
45394607 447##
91ae00cb 448## arrayref of JCPS = splitEnt(string to be collated)
449## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 450##
91ae00cb 451sub splitEnt
45394607 452{
a7fbee98 453 my $self = shift;
4d36a948 454 my $wLen = $_[1];
455
a7fbee98 456 my $code = $self->{preprocess};
0116f5dc 457 my $norm = $self->{normCode};
91ae00cb 458 my $map = $self->{mapping};
a7fbee98 459 my $max = $self->{maxlength};
460 my $reH = $self->{rearrangeHash};
4d36a948 461 my $ign = $self->{L3_ignorable};
91ae00cb 462 my $ver9 = $self->{UCA_Version} >= 9;
a7fbee98 463
4d36a948 464 my ($str, @buf);
a7fbee98 465
4d36a948 466 if ($wLen) {
467 $code and croak "Preprocess breaks character positions. "
468 . "Don't use with index(), match(), etc.";
469 $norm and croak "Normalization breaks character positions. "
470 . "Don't use with index(), match(), etc.";
471 $str = $_[0];
472 }
473 else {
474 $str = $_[0];
475 $str = &$code($str) if ref $code;
476 $str = &$norm($str) if ref $norm;
477 }
a7fbee98 478
4d36a948 479 # get array of Unicode code point of string.
9f1f04a1 480 my @src = unpack_U($str);
4d36a948 481
482 # rearrangement:
483 # Character positions are not kept if rearranged,
484 # then neglected if $wLen is true.
485 if ($reH && ! $wLen) {
a7fbee98 486 for (my $i = 0; $i < @src; $i++) {
487 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
488 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
489 $i++;
490 }
491 }
45394607 492 }
45394607 493
0116f5dc 494 if ($ver9) {
4d36a948 495 # To remove a character marked as a completely ignorable.
496 for (my $i = 0; $i < @src; $i++) {
497 $src[$i] = undef if $ign->{ $src[$i] };
498 }
0116f5dc 499 }
500
a7fbee98 501 for (my $i = 0; $i < @src; $i++) {
4d36a948 502 next if _isNonCharacter($src[$i]);
503
504 my $i_orig = $i;
91ae00cb 505 my $jcps = $src[$i];
4d36a948 506
91ae00cb 507 if ($max->{$jcps}) { # contract
508 my $temp_jcps = $jcps;
509 my $jcpsLen = 1;
510 my $maxLen = $max->{$jcps};
4d36a948 511
91ae00cb 512 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 513 next if ! defined $src[$p];
91ae00cb 514 $temp_jcps .= CODE_SEP . $src[$p];
515 $jcpsLen++;
516 if ($map->{$temp_jcps}) {
517 $jcps = $temp_jcps;
4d36a948 518 $i = $p;
519 }
520 }
4d36a948 521
06c8fc8f 522 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
523 # This process requires Unicode::Normalize.
91ae00cb 524 # If "normalization" is undef, here should be skipped *always*
06c8fc8f 525 # (in spite of bool value of $CVgetCombinClass),
526 # since canonical ordering cannot be expected.
527 # Blocked combining character should not be contracted.
528
529 if ($self->{normalization})
530 # $self->{normCode} is false in the case of "prenormalized".
531 {
532 my $preCC = 0;
533 my $curCC = 0;
534
535 for (my $p = $i + 1; $p < @src; $p++) {
536 next if ! defined $src[$p];
537 $curCC = $CVgetCombinClass->($src[$p]);
538 last unless $curCC;
539 my $tail = CODE_SEP . $src[$p];
91ae00cb 540 if ($preCC != $curCC && $map->{$jcps.$tail}) {
541 $jcps .= $tail;
06c8fc8f 542 $src[$p] = undef;
543 } else {
544 $preCC = $curCC;
545 }
4d36a948 546 }
a7fbee98 547 }
a7fbee98 548 }
549
4d36a948 550 if ($wLen) {
551 for (my $p = $i + 1; $p < @src; $p++) {
552 last if defined $src[$p];
553 $i = $p;
a7fbee98 554 }
555 }
4d36a948 556
91ae00cb 557 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 558 }
4d36a948 559 return \@buf;
d16e9e3d 560}
45394607 561
d16e9e3d 562
563##
4d36a948 564## list of arrayrefs of weights = getWt(JCPS)
d16e9e3d 565##
566sub getWt
567{
a7fbee98 568 my $self = shift;
91ae00cb 569 my $u = shift;
570 my $map = $self->{mapping};
0116f5dc 571 my $der = $self->{derivCode};
a7fbee98 572
91ae00cb 573 return if !defined $u;
574 return map($self->varCE($_), @{ $map->{$u} })
575 if $map->{$u};
a7fbee98 576
91ae00cb 577 # JCPS must not be a contraction, then it's a code point.
578 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1 579 my $hang = $self->{overrideHangul};
580 my @hangulCE;
581 if ($hang) {
582 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
583 }
584 elsif (!defined $hang) {
585 @hangulCE = $der->($u);
586 }
587 else {
588 my $max = $self->{maxlength};
589 my @decH = _decompHangul($u);
590
591 if (@decH == 2) {
592 my $contract = join(CODE_SEP, @decH);
91ae00cb 593 @decH = ($contract) if $map->{$contract};
1d2654e1 594 } else { # must be <@decH == 3>
595 if ($max->{$decH[0]}) {
596 my $contract = join(CODE_SEP, @decH);
91ae00cb 597 if ($map->{$contract}) {
1d2654e1 598 @decH = ($contract);
599 } else {
600 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 601 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1 602 }
603 # even if V's ignorable, LT contraction is not supported.
604 # If such a situatution were required, NFD should be used.
605 }
606 if (@decH == 3 && $max->{$decH[1]}) {
607 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 608 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1 609 }
610 }
611
612 @hangulCE = map({
91ae00cb 613 $map->{$_} ? @{ $map->{$_} } : $der->($_);
1d2654e1 614 } @decH);
615 }
91ae00cb 616 return map $self->varCE($_), @hangulCE;
a7fbee98 617 }
91ae00cb 618 elsif (CJK_UidIni <= $u && $u <= CJK_UidFin ||
619 CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
620 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
1d2654e1 621 my $cjk = $self->{overrideCJK};
91ae00cb 622 return map $self->varCE($_),
0116f5dc 623 $cjk
4d36a948 624 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
91ae00cb 625 : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
9f1f04a1 626 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
0116f5dc 627 : $der->($u);
a7fbee98 628 }
629 else {
91ae00cb 630 return map $self->varCE($_), $der->($u);
a7fbee98 631 }
d16e9e3d 632}
633
d16e9e3d 634
635##
636## string sortkey = getSortKey(string arg)
637##
638sub getSortKey
639{
a7fbee98 640 my $self = shift;
641 my $lev = $self->{level};
91ae00cb 642 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
643 my $ver9 = $self->{UCA_Version} >= 9;
644 my $v2i = $self->{variable} ne 'non-ignorable';
a7fbee98 645
646 # weight arrays
91ae00cb 647 my (@wts, @buf, $last_is_variable);
648
649 if ($self->{hangul_terminator}) {
650 my $preHST = '';
651 foreach my $jcps (@$rEnt) {
652 # weird things like VL, TL-contraction are not considered!
653 my $curHST = '';
654 foreach my $u (split /;/, $jcps) {
655 $curHST .= getHST($u);
656 }
657 if ($preHST && !$curHST || # hangul before non-hangul
658 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
659 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
660 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
0116f5dc 661
91ae00cb 662 push @wts, $self->varCE_HangulTerm;
663 }
664 $preHST = $curHST;
665
666 push @wts, $self->getWt($jcps);
667 }
668 $preHST # end at hangul
669 and push @wts, $self->varCE_HangulTerm;
670 }
671 else {
672 foreach my $jcps (@$rEnt) {
673 push @wts, $self->getWt($jcps);
674 }
675 }
676
677 foreach my $wt (@wts) {
4d36a948 678 if ($v2i && $ver9) {
679 if ($wt->[0] == 0) { # ignorable
680 next if $last_is_variable;
0116f5dc 681 } else {
9f1f04a1 682 $last_is_variable = ($wt->[0] == Var1Wt);
0116f5dc 683 }
684 }
4d36a948 685 push @buf, $wt;
0116f5dc 686 }
a7fbee98 687
688 # make sort key
689 my @ret = ([],[],[],[]);
690 foreach my $v (0..$lev-1) {
691 foreach my $b (@buf) {
4d36a948 692 push @{ $ret[$v] }, $b->[$v]
693 if 0 < $b->[$v];
a7fbee98 694 }
695 }
45394607 696
a7fbee98 697 # modification of tertiary weights
698 if ($self->{upper_before_lower}) {
699 foreach (@{ $ret[2] }) {
700 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
701 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
702 elsif ($_ == 0x1C) { $_ += 1 } # square upper
703 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
704 }
45394607 705 }
a7fbee98 706 if ($self->{katakana_before_hiragana}) {
707 foreach (@{ $ret[2] }) {
708 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
709 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
710 }
45394607 711 }
9f1f04a1 712
713 if ($self->{backwardsFlag}) {
714 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
715 if ($self->{backwardsFlag} & (1 << $v)) {
716 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
717 }
718 }
719 }
720
4d36a948 721 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 722}
723
724
725##
d16e9e3d 726## int compare = cmp(string a, string b)
45394607 727##
5398038e 728sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
729sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
730sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
731sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
732sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
733sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
734sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 735
736##
d16e9e3d 737## list[strings] sorted = sort(list[strings] arg)
45394607 738##
a7fbee98 739sub sort {
740 my $obj = shift;
741 return
742 map { $_->[1] }
743 sort{ $a->[0] cmp $b->[0] }
744 map [ $obj->getSortKey($_), $_ ], @_;
45394607 745}
746
0116f5dc 747
4d36a948 748sub _derivCE_9 {
0116f5dc 749 my $u = shift;
750 my $base =
91ae00cb 751 (CJK_UidIni <= $u && $u <= CJK_UidFin)
4d36a948 752 ? 0xFB40 : # CJK
91ae00cb 753 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
754 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
4d36a948 755 ? 0xFB80 # CJK ext.
756 : 0xFBC0; # others
0116f5dc 757
758 my $aaaa = $base + ($u >> 15);
759 my $bbbb = ($u & 0x7FFF) | 0x8000;
760 return
9f1f04a1 761 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
762 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc 763}
764
4d36a948 765sub _derivCE_8 {
0116f5dc 766 my $code = shift;
767 my $aaaa = 0xFF80 + ($code >> 15);
768 my $bbbb = ($code & 0x7FFF) | 0x8000;
769 return
4d36a948 770 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
771 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607 772}
773
91ae00cb 774
775sub varCE_HangulTerm {
776 my $self = shift;
777 return $self->varCE(pack(VCE_TEMPLATE,
778 NON_VAR, $self->{hangul_terminator}, 0,0,0));
779}
780
781
45394607 782##
783## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
784##
a7fbee98 785sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 786
a7fbee98 787#
4d36a948 788# $code *must* be in Hangul syllable.
a7fbee98 789# Check it before you enter here.
790#
5398038e 791sub _decompHangul {
792 my $code = shift;
91ae00cb 793 my $SIndex = $code - Hangul_SBase;
794 my $LIndex = int( $SIndex / Hangul_NCount);
795 my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
796 my $TIndex = $SIndex % Hangul_TCount;
5398038e 797 return (
91ae00cb 798 Hangul_LBase + $LIndex,
799 Hangul_VBase + $VIndex,
800 $TIndex ? (Hangul_TBase + $TIndex) : (),
5398038e 801 );
45394607 802}
803
4d36a948 804sub _isNonCharacter {
805 my $code = shift;
806 return ! defined $code # removed
807 || ($code < 0 || 0x10FFFF < $code) # out of range
808 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
809 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
810 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
811 ;
812}
813
91ae00cb 814# Hangul Syllable Type
815sub getHST {
816 my $u = shift;
817 return
818 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
819 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
820 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
821 Hangul_SIni <= $u && $u <= Hangul_SFin ?
822 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
823}
824
4d36a948 825
826##
827## bool _nonIgnorAtLevel(arrayref weights, int level)
828##
829sub _nonIgnorAtLevel($$)
830{
831 my $wt = shift;
832 return if ! defined $wt;
833 my $lv = shift;
9f1f04a1 834 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948 835}
836
837##
838## bool _eqArray(
839## arrayref of arrayref[weights] source,
840## arrayref of arrayref[weights] substr,
841## int level)
842## * comparison of graphemes vs graphemes.
843## @$source >= @$substr must be true (check it before call this);
844##
845sub _eqArray($$$)
846{
847 my $source = shift;
848 my $substr = shift;
849 my $lev = shift;
850
851 for my $g (0..@$substr-1){
852 # Do the $g'th graphemes have the same number of AV weigths?
853 return if @{ $source->[$g] } != @{ $substr->[$g] };
854
855 for my $w (0..@{ $substr->[$g] }-1) {
856 for my $v (0..$lev-1) {
857 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
858 }
859 }
860 }
861 return 1;
862}
863
864##
865## (int position, int length)
866## int position = index(string, substring, position, [undoc'ed grobal])
867##
868## With "grobal" (only for the list context),
869## returns list of arrayref[position, length].
870##
871sub index
872{
91ae00cb 873 my $self = shift;
874 my $str = shift;
875 my $len = length($str);
876 my $subE = $self->splitEnt(shift);
877 my $pos = @_ ? shift : 0;
878 $pos = 0 if $pos < 0;
879 my $grob = shift;
880
881 my $lev = $self->{level};
882 my $ver9 = $self->{UCA_Version} >= 9;
883 my $v2i = $self->{variable} ne 'non-ignorable';
884
885 if (! @$subE) {
4d36a948 886 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
887 return $grob
888 ? map([$_, 0], $temp..$len)
889 : wantarray ? ($temp,0) : $temp;
890 }
891 if ($len < $pos) {
892 return wantarray ? () : NOMATCHPOS;
893 }
91ae00cb 894 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
895 if (! @$strE) {
4d36a948 896 return wantarray ? () : NOMATCHPOS;
897 }
898 my $last_is_variable;
899 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
900
901 $last_is_variable = FALSE;
91ae00cb 902 for my $wt (map $self->getWt($_), @$subE) {
4d36a948 903 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
904
905 if ($v2i && $ver9) {
906 if ($wt->[0] == 0) {
907 $to_be_pushed = FALSE if $last_is_variable;
908 } else {
9f1f04a1 909 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 910 }
911 }
912
913 if (@subWt && $wt->[0] == 0) {
914 push @{ $subWt[-1] }, $wt if $to_be_pushed;
915 } else {
9f1f04a1 916 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 917 push @subWt, [ $wt ];
918 }
919 }
920
921 my $count = 0;
91ae00cb 922 my $end = @$strE - 1;
4d36a948 923
924 $last_is_variable = FALSE;
925
926 for (my $i = 0; $i <= $end; ) { # no $i++
927 my $found_base = 0;
928
929 # fetch a grapheme
930 while ($i <= $end && $found_base == 0) {
91ae00cb 931 for my $wt ($self->getWt($strE->[$i][0])) {
4d36a948 932 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
933
934 if ($v2i && $ver9) {
935 if ($wt->[0] == 0) {
936 $to_be_pushed = FALSE if $last_is_variable;
937 } else {
9f1f04a1 938 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 939 }
940 }
941
942 if (@strWt && $wt->[0] == 0) {
943 push @{ $strWt[-1] }, $wt if $to_be_pushed;
91ae00cb 944 $finPos[-1] = $strE->[$i][2];
4d36a948 945 } elsif ($to_be_pushed) {
9f1f04a1 946 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 947 push @strWt, [ $wt ];
91ae00cb 948 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 949 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 950 push @finPos, $strE->[$i][2];
4d36a948 951 $found_base++;
952 }
953 # else ===> no-op
954 }
955 $i++;
956 }
957
958 # try to match
959 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
960 if ($iniPos[0] != NOMATCHPOS &&
961 $finPos[$#subWt] != NOMATCHPOS &&
962 _eqArray(\@strWt, \@subWt, $lev)) {
963 my $temp = $iniPos[0] + $pos;
964
965 if ($grob) {
966 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
967 splice @strWt, 0, $#subWt;
968 splice @iniPos, 0, $#subWt;
969 splice @finPos, 0, $#subWt;
970 }
971 else {
972 return wantarray
973 ? ($temp, $finPos[$#subWt] - $iniPos[0])
974 : $temp;
975 }
976 }
977 shift @strWt;
978 shift @iniPos;
979 shift @finPos;
980 }
981 }
982
983 return $grob
984 ? @g_ret
985 : wantarray ? () : NOMATCHPOS;
986}
987
988##
989## scalarref to matching part = match(string, substring)
990##
991sub match
992{
993 my $self = shift;
994 if (my($pos,$len) = $self->index($_[0], $_[1])) {
995 my $temp = substr($_[0], $pos, $len);
996 return wantarray ? $temp : \$temp;
997 # An lvalue ref \substr should be avoided,
998 # since its value is affected by modification of its referent.
999 }
1000 else {
1001 return;
1002 }
1003}
1004
1005##
1006## arrayref matching parts = gmatch(string, substring)
1007##
1008sub gmatch
1009{
1010 my $self = shift;
1011 my $str = shift;
1012 my $sub = shift;
1013 return map substr($str, $_->[0], $_->[1]),
1014 $self->index($str, $sub, 0, 'g');
1015}
1016
1017##
1018## bool subst'ed = subst(string, substring, replace)
1019##
1020sub subst
1021{
1022 my $self = shift;
1023 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1024
1025 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1026 if ($code) {
1027 my $mat = substr($_[0], $pos, $len);
1028 substr($_[0], $pos, $len, $code->($mat));
1029 } else {
1030 substr($_[0], $pos, $len, $_[2]);
1031 }
1032 return TRUE;
1033 }
1034 else {
1035 return FALSE;
1036 }
1037}
1038
1039##
1040## int count = gsubst(string, substring, replace)
1041##
1042sub gsubst
1043{
1044 my $self = shift;
1045 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1046 my $cnt = 0;
1047
1048 # Replacement is carried out from the end, then use reverse.
1049 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1050 if ($code) {
1051 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1052 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1053 } else {
1054 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1055 }
1056 $cnt++;
1057 }
1058 return $cnt;
1059}
1060
45394607 10611;
1062__END__
1063
1064=head1 NAME
1065
a7fbee98 1066Unicode::Collate - Unicode Collation Algorithm
45394607 1067
1068=head1 SYNOPSIS
1069
1070 use Unicode::Collate;
1071
1072 #construct
5398038e 1073 $Collator = Unicode::Collate->new(%tailoring);
45394607 1074
1075 #sort
5398038e 1076 @sorted = $Collator->sort(@not_sorted);
45394607 1077
1078 #compare
a7fbee98 1079 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1080
91ae00cb 1081 # If %tailoring is false (i.e. empty),
1082 # $Collator should do the default collation.
1083
45394607 1084=head1 DESCRIPTION
1085
4d36a948 1086This module is an implementation
1087of Unicode Technical Standard #10 (UTS #10)
1088"Unicode Collation Algorithm."
1089
45394607 1090=head2 Constructor and Tailoring
1091
d16e9e3d 1092The C<new> method returns a collator object.
1093
5398038e 1094 $Collator = Unicode::Collate->new(
0116f5dc 1095 UCA_Version => $UCA_Version,
91ae00cb 1096 alternate => $alternate, # deprecated: use of 'variable' is recommended.
45394607 1097 backwards => $levelNumber, # or \@levelNumbers
1098 entry => $element,
91ae00cb 1099 hangul_terminator => $term_primary_weight,
45394607 1100 ignoreName => qr/$ignoreName/,
1101 ignoreChar => qr/$ignoreChar/,
1102 katakana_before_hiragana => $bool,
1103 level => $collationLevel,
91ae00cb 1104 normalization => $normalization_form,
45394607 1105 overrideCJK => \&overrideCJK,
1106 overrideHangul => \&overrideHangul,
1107 preprocess => \&preprocess,
1108 rearrange => \@charList,
1109 table => $filename,
1110 undefName => qr/$undefName/,
1111 undefChar => qr/$undefChar/,
1112 upper_before_lower => $bool,
91ae00cb 1113 variable => $variable,
45394607 1114 );
45394607 1115
1116=over 4
1117
0116f5dc 1118=item UCA_Version
1119
91ae00cb 1120If the tracking version number of the older UCA is given,
1121the older behavior of that tracking version is emulated on collating.
0116f5dc 1122If omitted, the return value of C<UCA_Version()> is used.
1123
91ae00cb 1124The supported tracking version: 8, 9, or 11.
0116f5dc 1125
1126B<This parameter may be removed in the future version,
1127as switching the algorithm would affect the performance.>
1128
45394607 1129=item backwards
1130
4d36a948 1131-- see 3.1.2 French Accents, UTS #10.
45394607 1132
1133 backwards => $levelNumber or \@levelNumbers
1134
1135Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1136If omitted, forwards at all the levels.
1137
1138=item entry
1139
4d36a948 1140-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1141
91ae00cb 1142If the same character (or a sequence of characters) exists
1143in the collation element table through C<table>,
1144mapping to collation elements is overrided.
1145If it does not exist, the mapping is defined additionally.
45394607 1146
1147 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 114800E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 11490063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
11500043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1151ENTRIES
1152
4d36a948 1153B<NOTE:> The code point in the UCA file format (before C<';'>)
1154B<must> be a Unicode code point, but not a native code point.
1155So C<0063> must always denote C<U+0063>,
1156but not a character of C<"\x63">.
1157
91ae00cb 1158=item hangul_terminator
1159
1160-- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
1161
1162If a true value is given (non-zero but should be positive),
1163it will be added as a terminator primary weight to the end of
1164every standard Hangul syllable. Secondary and any higher weights
1165for terminator are set to zero.
1166If the value is false or C<hangul_terminator> key does not exist,
1167insertion of terminator weights will not be performed.
1168
1169Boundaries of Hangul syllables are determined
1170according to conjoining Jamo behavior in F<the Unicode Standard>
1171and F<HangulSyllableType.txt>.
1172
1173B<Implementation Note:>
1174(1) For expansion mapping (Unicode character mapped
1175to a sequence of collation elements), a terminator will not be added
1176between collation elements, even if Hangul syllable boundary exists there.
1177Addition of terminator is restricted to the next position
1178to the last collation element.
1179
1180(2) Non-conjoining Hangul letters
1181(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1182automatically terminated with a terminator primary weight.
1183These characters may need terminator included in a collation element
1184table beforehand.
1185
45394607 1186=item ignoreName
1187
1188=item ignoreChar
1189
4d36a948 1190-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
45394607 1191
caffd4cf 1192Makes the entry in the table completely ignorable;
1193i.e. as if the weights were zero at all level.
45394607 1194
a7fbee98 1195E.g. when 'a' and 'e' are ignorable,
45394607 1196'element' is equal to 'lament' (or 'lmnt').
1197
1198=item level
1199
4d36a948 1200-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1201
1202Set the maximum level.
1203Any higher levels than the specified one are ignored.
1204
1205 Level 1: alphabetic ordering
1206 Level 2: diacritic ordering
1207 Level 3: case ordering
91ae00cb 1208 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607 1209
1210 ex.level => 2,
1211
a7fbee98 1212If omitted, the maximum is the 4th.
1213
45394607 1214=item normalization
1215
4d36a948 1216-- see 4.1 Normalize each input string, UTS #10.
45394607 1217
905aa9f0 1218If specified, strings are normalized before preparation of sort keys
45394607 1219(the normalization is executed after preprocess).
1220
1d2654e1 1221A form name C<Unicode::Normalize::normalize()> accepts will be applied
1222as C<$normalization_form>.
06c8fc8f 1223Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1 1224See C<Unicode::Normalize::normalize()> for detail.
1225If omitted, C<'NFD'> is used.
45394607 1226
91ae00cb 1227C<normalization> is performed after C<preprocess> (if defined).
45394607 1228
06c8fc8f 1229Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1230though they are not concerned with C<Unicode::Normalize::normalize()>.
1231
1232If C<undef> (not a string C<"undef">) is passed explicitly
1233as the value for this key,
45394607 1234any normalization is not carried out (this may make tailoring easier
1235if any normalization is not desired).
06c8fc8f 1236Under C<(normalization =E<gt> undef)>, only contiguous contractions
1237are resolved; e.g. C<A-cedilla-ring> would be primary equal to C<A>,
1238even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>.
1239In this point,
1240C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1241B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1242
1243In the case of C<(normalization =E<gt> "prenormalized")>,
1244any normalization is not performed, but
1245non-contiguous contractions with combining characters are performed.
1246Therefore
1247C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1248B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1249If source strings are finely prenormalized,
1250C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1251
1252Except C<(normalization =E<gt> undef)>,
1253B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607 1254
1255=item overrideCJK
1256
4d36a948 1257-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1258
91ae00cb 1259By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1260(but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than
1261C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
1262C<U+20000> to C<U+2A6D6>].
1263
1264Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
45394607 1265
a7fbee98 1266ex. CJK Unified Ideographs in the JIS code point order.
45394607 1267
1268 overrideCJK => sub {
a7fbee98 1269 my $u = shift; # get a Unicode codepoint
1270 my $b = pack('n', $u); # to UTF-16BE
1271 my $s = your_unicode_to_sjis_converter($b); # convert
1272 my $n = unpack('n', $s); # convert sjis to short
1273 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1274 },
1275
a7fbee98 1276ex. ignores all CJK Unified Ideographs.
1277
1278 overrideCJK => sub {()}, # CODEREF returning empty list
1279
1280 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1281 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1282
1283If C<undef> is passed explicitly as the value for this key,
1284weights for CJK Unified Ideographs are treated as undefined.
1285But assignment of weight for CJK Unified Ideographs
91ae00cb 1286in table or C<entry> is still valid.
a7fbee98 1287
1288=item overrideHangul
1289
4d36a948 1290-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1291
1292By default, Hangul Syllables are decomposed into Hangul Jamo.
1293But the mapping of Hangul Syllables may be overrided.
1294
91ae00cb 1295This tag works like C<overrideCJK>, so see there for examples.
a7fbee98 1296
45394607 1297If you want to override the mapping of Hangul Syllables,
1298the Normalization Forms D and KD are not appropriate
1299(they will be decomposed before overriding).
1300
a7fbee98 1301If C<undef> is passed explicitly as the value for this key,
1302weight for Hangul Syllables is treated as undefined
1303without decomposition into Hangul Jamo.
1304But definition of weight for Hangul Syllables
91ae00cb 1305in table or C<entry> is still valid.
a7fbee98 1306
45394607 1307=item preprocess
1308
4d36a948 1309-- see 5.1 Preprocessing, UTS #10.
45394607 1310
1311If specified, the coderef is used to preprocess
1312before the formation of sort keys.
1313
a7fbee98 1314ex. dropping English articles, such as "a" or "the".
45394607 1315Then, "the pen" is before "a pencil".
1316
1317 preprocess => sub {
1318 my $str = shift;
a7fbee98 1319 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1320 return $str;
45394607 1321 },
1322
91ae00cb 1323C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1324
45394607 1325=item rearrange
1326
4d36a948 1327-- see 3.1.3 Rearrangement, UTS #10.
45394607 1328
1329Characters that are not coded in logical order and to be rearranged.
a7fbee98 1330By default,
45394607 1331
1332 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1333
a7fbee98 1334If you want to disallow any rearrangement,
1335pass C<undef> or C<[]> (a reference to an empty list)
1336as the value for this key.
1337
0116f5dc 1338B<According to the version 9 of UCA, this parameter shall not be used;
1339but it is not warned at present.>
1340
45394607 1341=item table
1342
4d36a948 1343-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1344
91ae00cb 1345You can use another collation element table if desired.
ae6aa562 1346The table file must be put into a directory
1347where F<Unicode/Collate.pm> is installed.
1348E.g. in F<perl/lib/Unicode/Collate> directory
1349when you have F<perl/lib/Unicode/Collate.pm>.
45394607 1350
ae6aa562 1351By default, the filename F<"allkeys.txt"> is used.
45394607 1352
a7fbee98 1353If C<undef> is passed explicitly as the value for this key,
91ae00cb 1354no file is read (but you can define collation elements via C<entry>).
a7fbee98 1355
1356A typical way to define a collation element table
1357without any file of table:
1358
1359 $onlyABC = Unicode::Collate->new(
1360 table => undef,
1361 entry => << 'ENTRIES',
13620061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
13630041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
13640062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
13650042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
13660063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
13670043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1368ENTRIES
1369 );
905aa9f0 1370
45394607 1371=item undefName
1372
1373=item undefChar
1374
4d36a948 1375-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1376
1377Undefines the collation element as if it were unassigned in the table.
1378This reduces the size of the table.
1379If an unassigned character appears in the string to be collated,
1380the sort key is made from its codepoint
1381as a single-character collation element,
1382as it is greater than any other assigned collation elements
1383(in the codepoint order among the unassigned characters).
1384But, it'd be better to ignore characters
1385unfamiliar to you and maybe never used.
1386
1387=item katakana_before_hiragana
1388
1389=item upper_before_lower
1390
4d36a948 1391-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
45394607 1392
1393By default, lowercase is before uppercase
1394and hiragana is before katakana.
1395
a7fbee98 1396If the tag is made true, this is reversed.
1397
1398B<NOTE>: These tags simplemindedly assume
1399any lowercase/uppercase or hiragana/katakana distinctions
9f1f04a1 1400must occur in level 3, and their weights at level 3
1401must be same as those mentioned in 7.3.1, UTS #10.
1402If you define your collation elements which violate this requirement,
4d36a948 1403these tags don't work validly.
45394607 1404
91ae00cb 1405=item variable
1406
1407=item alternate
1408
1409-- see 3.2.2 Variable Weighting, UTS #10.
1410
1411(the title in UCA version 8: Alternate Weighting)
1412
1413This key allows to variable weighting for variable collation elements,
1414which are marked with an ASTERISK in the table
1415(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1416
1417 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1418
1419These names are case-insensitive.
1420By default (if specification is omitted), 'shifted' is adopted.
1421
1422 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1423 considered at the 4th level.
1424
1425 'Non-ignorable' Variable elements are not reset to ignorable.
1426
1427 'Shifted' Variable elements are made ignorable at levels 1 through 3
1428 their level 4 weight is replaced by the old level 1 weight.
1429 Level 4 weight for Non-Variable elements is 0xFFFF.
1430
1431 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1432 are trimmed.
1433
1434For backward compatibility, C<alternate> can be used as an alias
1435for C<variable>.
1436
45394607 1437=back
1438
3164dd77 1439=head2 Methods for Collation
45394607 1440
1441=over 4
1442
5398038e 1443=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1444
1445Sorts a list of strings.
1446
5398038e 1447=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1448
1449Returns 1 (when C<$a> is greater than C<$b>)
1450or 0 (when C<$a> is equal to C<$b>)
1451or -1 (when C<$a> is lesser than C<$b>).
1452
5398038e 1453=item C<$result = $Collator-E<gt>eq($a, $b)>
1454
1455=item C<$result = $Collator-E<gt>ne($a, $b)>
1456
1457=item C<$result = $Collator-E<gt>lt($a, $b)>
1458
1459=item C<$result = $Collator-E<gt>le($a, $b)>
1460
1461=item C<$result = $Collator-E<gt>gt($a, $b)>
1462
1463=item C<$result = $Collator-E<gt>ge($a, $b)>
1464
a7fbee98 1465They works like the same name operators as theirs.
5398038e 1466
1467 eq : whether $a is equal to $b.
1468 ne : whether $a is not equal to $b.
1469 lt : whether $a is lesser than $b.
1470 le : whether $a is lesser than $b or equal to $b.
1471 gt : whether $a is greater than $b.
1472 ge : whether $a is greater than $b or equal to $b.
1473
1474=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1475
4d36a948 1476-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1477
1478Returns a sort key.
1479
1480You compare the sort keys using a binary comparison
1481and get the result of the comparison of the strings using UCA.
1482
5398038e 1483 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1484
1485 is equivalent to
1486
5398038e 1487 $Collator->cmp($a, $b)
45394607 1488
a7fbee98 1489=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1490
a7fbee98 1491 use Unicode::Collate;
1492 my $c = Unicode::Collate->new();
1493 print $c->viewSortKey("Perl"),"\n";
1494
0116f5dc 1495 # output:
1496 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1497 # Level 1 Level 2 Level 3 Level 4
1498
1499 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1500
4d36a948 1501=back
1502
1503=head2 Methods for Searching
d16e9e3d 1504
4d36a948 1505B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1506for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1507C<subst>, C<gsubst>) is croaked,
1508as the position and the length might differ
1509from those on the specified string.
91ae00cb 1510(And C<rearrange> and C<hangul_terminator> tags are neglected.)
d16e9e3d 1511
4d36a948 1512The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1513like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1514but they are not aware of any pattern, but only a literal substring.
1515
1516=over 4
1517
1518=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1519
1520=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1521
1522If C<$substring> matches a part of C<$string>, returns
1523the position of the first occurrence of the matching part in scalar context;
1524in list context, returns a two-element list of
1525the position and the length of the matching part.
1526
d16e9e3d 1527If C<$substring> does not match any part of C<$string>,
1528returns C<-1> in scalar context and
1529an empty list in list context.
1530
1531e.g. you say
1532
5398038e 1533 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1534 # (normalization => undef) is REQUIRED.
1535 my $str = "Ich muß studieren Perl.";
1536 my $sub = "MÜSS";
d16e9e3d 1537 my $match;
a7fbee98 1538 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1539 $match = substr($str, $pos, $len);
d16e9e3d 1540 }
1541
4d36a948 1542and get C<"muß"> in C<$match> since C<"muß">
1543is primary equal to C<"MÜSS">.
1544
1545=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1546
1547=item C<($match) = $Collator-E<gt>match($string, $substring)>
1548
1549If C<$substring> matches a part of C<$string>, in scalar context, returns
1550B<a reference to> the first occurrence of the matching part
1551(C<$match_ref> is always true if matches,
1552since every reference is B<true>);
1553in list context, returns the first occurrence of the matching part.
1554
1555If C<$substring> does not match any part of C<$string>,
1556returns C<undef> in scalar context and
1557an empty list in list context.
1558
1559e.g.
1560
1561 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1562 print "matches [$$match_ref].\n";
1563 } else {
1564 print "doesn't match.\n";
1565 }
1566
1567 or
1568
1569 if (($match) = $Collator->match($str, $sub)) { # list context
1570 print "matches [$match].\n";
1571 } else {
1572 print "doesn't match.\n";
1573 }
1574
1575=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1576
1577If C<$substring> matches a part of C<$string>, returns
1578all the matching parts (or matching count in scalar context).
1579
1580If C<$substring> does not match any part of C<$string>,
1581returns an empty list.
1582
1583=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1584
1585If C<$substring> matches a part of C<$string>,
1586the first occurrence of the matching part is replaced by C<$replacement>
1587(C<$string> is modified) and return C<$count> (always equals to C<1>).
1588
1589C<$replacement> can be a C<CODEREF>,
1590taking the matching part as an argument,
1591and returning a string to replace the matching part
1592(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1593
1594=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1595
1596If C<$substring> matches a part of C<$string>,
1597all the occurrences of the matching part is replaced by C<$replacement>
1598(C<$string> is modified) and return C<$count>.
1599
1600C<$replacement> can be a C<CODEREF>,
1601taking the matching part as an argument,
1602and returning a string to replace the matching part
1603(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1604
1605e.g.
1606
1607 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1608 # (normalization => undef) is REQUIRED.
1609 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1610 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1611
1612 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1613 # i.e., all the camels are made bold-faced.
d16e9e3d 1614
45394607 1615=back
1616
3164dd77 1617=head2 Other Methods
1618
1619=over 4
1620
0116f5dc 1621=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1622
1623Change the value of specified keys and returns the changed part.
1624
1625 $Collator = Unicode::Collate->new(level => 4);
1626
1627 $Collator->eq("perl", "PERL"); # false
1628
1629 %old = $Collator->change(level => 2); # returns (level => 4).
1630
1631 $Collator->eq("perl", "PERL"); # true
1632
1633 $Collator->change(%old); # returns (level => 2).
1634
1635 $Collator->eq("perl", "PERL"); # false
1636
1637Not all C<(key,value)>s are allowed to be changed.
1638See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1639
1640In the scalar context, returns the modified collator
1641(but it is B<not> a clone from the original).
1642
1643 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1644
1645 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1646
1647 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1648
91ae00cb 1649=item C<$version = $Collator-E<gt>version()>
3164dd77 1650
91ae00cb 1651Returns the version number (a string) of the Unicode Standard
1652which the C<table> file used by the collator object is based on.
1653If the table does not include a version line (starting with C<@version>),
1654returns C<"unknown">.
1655
1656=item C<UCA_Version()>
3164dd77 1657
91ae00cb 1658Returns the tracking version number of UTS #10 this module consults.
3164dd77 1659
91ae00cb 1660=item C<Base_Unicode_Version()>
1661
1662Returns the version number of UTS #10 this module consults.
3164dd77 1663
1664=back
1665
45394607 1666=head2 EXPORT
1667
1668None by default.
1669
1670=head2 CAVEAT
1671
1672Use of the C<normalization> parameter requires
1673the B<Unicode::Normalize> module.
1674
5398038e 1675If you need not it (say, in the case when you need not
45394607 1676handle any combining characters),
1677assign C<normalization =E<gt> undef> explicitly.
1678
4d36a948 1679-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1680
0116f5dc 1681=head2 Conformance Test
1682
1683The Conformance Test for the UCA is provided
1684in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1685and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1686
1687For F<CollationTest_SHIFTED.txt>,
1688a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1689for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 1690C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 1691
4d36a948 1692B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1693
45394607 1694=head1 AUTHOR
1695
1d2654e1 1696SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
45394607 1697
1698 http://homepage1.nifty.com/nomenclator/perl/
1699
ae6aa562 1700 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1701
a7fbee98 1702 This library is free software; you can redistribute it
1703 and/or modify it under the same terms as Perl itself.
45394607 1704
1705=head1 SEE ALSO
1706
1707=over 4
1708
91ae00cb 1709=item Unicode Collation Algorithm - UTS #10
1710
1711L<http://www.unicode.org/reports/tr10/>
1712
1713=item The Default Unicode Collation Element Table (DUCET)
1714
1715L<http://www.unicode.org/reports/tr10/allkeys.txt>
45394607 1716
91ae00cb 1717=item The conformance test for the UCA
45394607 1718
91ae00cb 1719L<http://www.unicode.org/reports/tr10/CollationTest.html>
a7fbee98 1720
91ae00cb 1721L<http://www.unicode.org/reports/tr10/CollationTest.zip>
45394607 1722
91ae00cb 1723=item Hangul Syllable Type
0116f5dc 1724
91ae00cb 1725http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt
0116f5dc 1726
91ae00cb 1727=item Unicode Normalization Forms - UAX #15
a7fbee98 1728
91ae00cb 1729L<http://www.unicode.org/reports/tr15/>
a7fbee98 1730
a7fbee98 1731=item L<Unicode::Normalize>
45394607 1732
45394607 1733=back
1734
1735=cut