Assimilate PodParser-1.26
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
10d7ec48 15no warnings 'utf8';
16
45394607 17require Exporter;
18
10d7ec48 19our $VERSION = '0.31';
45394607 20our $PACKAGE = __PACKAGE__;
21
22our @ISA = qw(Exporter);
23
24our %EXPORT_TAGS = ();
25our @EXPORT_OK = ();
26our @EXPORT = ();
27
28(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
29our $KeyFile = "allkeys.txt";
30
4d36a948 31# Perl's boolean
32use constant TRUE => 1;
33use constant FALSE => "";
34use constant NOMATCHPOS => -1;
35
36# A coderef to get combining class imported from Unicode::Normalize
37# (i.e. \&Unicode::Normalize::getCombinClass).
38# This is also used as a HAS_UNICODE_NORMALIZE flag.
06c8fc8f 39our $CVgetCombinClass;
4d36a948 40
9f1f04a1 41# Supported Levels
42use constant MinLevel => 1;
43use constant MaxLevel => 4;
44
4d36a948 45# Minimum weights at level 2 and 3, respectively
9f1f04a1 46use constant Min2Wt => 0x20;
47use constant Min3Wt => 0x02;
4d36a948 48
49# Shifted weight at 4th level
9f1f04a1 50use constant Shift4Wt => 0xFFFF;
4d36a948 51
52# Variable weight at 1st level.
53# This is a negative value but should be regarded as zero on collation.
54# This is for distinction of variable chars from level 3 ignorable chars.
9f1f04a1 55use constant Var1Wt => -1;
4d36a948 56
57
58# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
59# PROBLEM: The Default Unicode Collation Element Table
60# has weights over 0xFFFF at the 4th level.
61# The tie-breaking in the variable weights
62# other than "shift" (as well as "shift-trimmed") is unreliable.
63use constant VCE_TEMPLATE => 'Cn4';
64
4d36a948 65# A sort key: 16-bit weights
66# See also the PROBLEM on VCE_TEMPLATE above.
67use constant KEY_TEMPLATE => 'n*';
68
69# Level separator in a sort key:
70# i.e. pack(KEY_TEMPLATE, 0)
71use constant LEVEL_SEP => "\0\0";
72
73# As Unicode code point separator for hash keys.
74# A joined code point string (denoted by JCPS below)
75# like "65;768" is used for internal processing
76# instead of Perl's Unicode string like "\x41\x{300}",
77# as the native code point is different from the Unicode code point
78# on EBCDIC platform.
79# This character must not be included in any stringified
80# representation of an integer.
81use constant CODE_SEP => ';';
82
83# boolean values of variable weights
0116f5dc 84use constant NON_VAR => 0; # Non-Variable character
85use constant VAR => 1; # Variable character
3164dd77 86
91ae00cb 87# specific code points
88use constant Hangul_LBase => 0x1100;
89use constant Hangul_LIni => 0x1100;
90use constant Hangul_LFin => 0x1159;
91use constant Hangul_LFill => 0x115F;
92use constant Hangul_VBase => 0x1161;
93use constant Hangul_VIni => 0x1160;
94use constant Hangul_VFin => 0x11A2;
95use constant Hangul_TBase => 0x11A7;
96use constant Hangul_TIni => 0x11A8;
97use constant Hangul_TFin => 0x11F9;
98use constant Hangul_TCount => 28;
99use constant Hangul_NCount => 588;
100use constant Hangul_SBase => 0xAC00;
101use constant Hangul_SIni => 0xAC00;
102use constant Hangul_SFin => 0xD7A3;
103use constant CJK_UidIni => 0x4E00;
104use constant CJK_UidFin => 0x9FA5;
105use constant CJK_ExtAIni => 0x3400;
106use constant CJK_ExtAFin => 0x4DB5;
107use constant CJK_ExtBIni => 0x20000;
108use constant CJK_ExtBFin => 0x2A6D6;
109use constant BMP_Max => 0xFFFF;
110
4d36a948 111# Logical_Order_Exception in PropList.txt
112# TODO: synchronization with change of PropList.txt.
0116f5dc 113our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 114
91ae00cb 115sub UCA_Version { "11" }
a7fbee98 116
91ae00cb 117sub Base_Unicode_Version { "4.0" }
a7fbee98 118
9f1f04a1 119######
120
9f1f04a1 121sub pack_U {
ae6aa562 122 return pack('U*', @_);
9f1f04a1 123}
124
125sub unpack_U {
ae6aa562 126 return unpack('U*', pack('U*').shift);
9f1f04a1 127}
128
129######
130
91ae00cb 131my (%VariableOK);
132@VariableOK{ qw/
0116f5dc 133 blanked non-ignorable shifted shift-trimmed
91ae00cb 134 / } = (); # keys lowercased
0116f5dc 135
136our @ChangeOK = qw/
137 alternate backwards level normalization rearrange
138 katakana_before_hiragana upper_before_lower
139 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 140 hangul_terminator variable
0116f5dc 141 /;
142
143our @ChangeNG = qw/
91ae00cb 144 entry mapping table maxlength
145 ignoreChar ignoreName undefChar undefName variableTable
0116f5dc 146 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
4d36a948 147 derivCode normCode rearrangeHash L3_ignorable
9f1f04a1 148 backwardsFlag
0116f5dc 149 /;
9f1f04a1 150# The hash key 'ignored' is deleted at v 0.21.
151# The hash key 'isShift' is deleted at v 0.23.
152# The hash key 'combining' is deleted at v 0.24.
91ae00cb 153# The hash key 'entries' is deleted at v 0.30.
154
155sub version {
156 my $self = shift;
157 return $self->{versionTable} || 'unknown';
158}
0116f5dc 159
160my (%ChangeOK, %ChangeNG);
161@ChangeOK{ @ChangeOK } = ();
162@ChangeNG{ @ChangeNG } = ();
163
164sub change {
165 my $self = shift;
166 my %hash = @_;
167 my %old;
91ae00cb 168 if (exists $hash{variable} && exists $hash{alternate}) {
169 delete $hash{alternate};
170 }
171 elsif (!exists $hash{variable} && exists $hash{alternate}) {
172 $hash{variable} = $hash{alternate};
173 }
0116f5dc 174 foreach my $k (keys %hash) {
175 if (exists $ChangeOK{$k}) {
176 $old{$k} = $self->{$k};
177 $self->{$k} = $hash{$k};
178 }
179 elsif (exists $ChangeNG{$k}) {
180 croak "change of $k via change() is not allowed!";
181 }
182 # else => ignored
183 }
184 $self->checkCollator;
185 return wantarray ? %old : $self;
186}
a7fbee98 187
9f1f04a1 188sub _checkLevel {
189 my $level = shift;
190 my $key = shift;
191 croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
192 $level, $key, MinLevel if MinLevel > $level;
193 croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
194 $level, $key, MaxLevel if MaxLevel < $level;
195}
196
91ae00cb 197my %DerivCode = (
198 8 => \&_derivCE_8,
199 9 => \&_derivCE_9,
200 11 => \&_derivCE_9, # 11 == 9
201);
202
0116f5dc 203sub checkCollator {
204 my $self = shift;
9f1f04a1 205 _checkLevel($self->{level}, "level");
a7fbee98 206
91ae00cb 207 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
208 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 209
91ae00cb 210 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 211 $self->{alternateTable} || 'shifted';
91ae00cb 212 $self->{variable} = $self->{alternate} = lc($self->{variable});
213 exists $VariableOK{ $self->{variable} }
214 or croak "$PACKAGE unknown variable tag name: $self->{variable}";
0116f5dc 215
9f1f04a1 216 if (! defined $self->{backwards}) {
217 $self->{backwardsFlag} = 0;
218 }
219 elsif (! ref $self->{backwards}) {
220 _checkLevel($self->{backwards}, "backwards");
221 $self->{backwardsFlag} = 1 << $self->{backwards};
222 }
223 else {
224 my %level;
225 $self->{backwardsFlag} = 0;
226 for my $b (@{ $self->{backwards} }) {
227 _checkLevel($b, "backwards");
228 $level{$b} = 1;
229 }
230 for my $v (sort keys %level) {
231 $self->{backwardsFlag} += 1 << $v;
232 }
233 }
0116f5dc 234
91ae00cb 235 defined $self->{rearrange} or $self->{rearrange} = [];
236 ref $self->{rearrange}
237 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc 238
239 # keys of $self->{rearrangeHash} are $self->{rearrange}.
240 $self->{rearrangeHash} = undef;
241
242 if (@{ $self->{rearrange} }) {
243 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
244 }
245
246 $self->{normCode} = undef;
a7fbee98 247
248 if (defined $self->{normalization}) {
249 eval { require Unicode::Normalize };
91ae00cb 250 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 251
91ae00cb 252 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 253
91ae00cb 254 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
255 $self->{normCode} = \&Unicode::Normalize::NFD;
256 }
257 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f 258 my $norm = $self->{normalization};
259 $self->{normCode} = sub {
1d2654e1 260 Unicode::Normalize::normalize($norm, shift);
261 };
06c8fc8f 262 eval { $self->{normCode}->("") }; # try
263 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 264 }
a7fbee98 265 }
0116f5dc 266 return;
267}
268
269sub new
270{
271 my $class = shift;
272 my $self = bless { @_ }, $class;
45394607 273
a7fbee98 274 # If undef is passed explicitly, no file is read.
0116f5dc 275 $self->{table} = $KeyFile if ! exists $self->{table};
276 $self->read_table if defined $self->{table};
905aa9f0 277
a7fbee98 278 if ($self->{entry}) {
279 $self->parseEntry($_) foreach split /\n/, $self->{entry};
280 }
905aa9f0 281
9f1f04a1 282 $self->{level} ||= MaxLevel;
0116f5dc 283 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 284
0116f5dc 285 $self->{overrideHangul} = ''
286 if ! exists $self->{overrideHangul};
287 $self->{overrideCJK} = ''
288 if ! exists $self->{overrideCJK};
06c8fc8f 289 $self->{normalization} = 'NFD'
0116f5dc 290 if ! exists $self->{normalization};
0116f5dc 291 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 292 if ! exists $self->{rearrange};
0116f5dc 293 $self->{backwards} = $self->{backwardsTable}
294 if ! exists $self->{backwards};
a7fbee98 295
0116f5dc 296 $self->checkCollator;
a7fbee98 297
298 return $self;
299}
905aa9f0 300
301sub read_table {
a7fbee98 302 my $self = shift;
303 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
304
e69a2255 305 my $filepath = File::Spec->catfile($Path, $file);
306 open my $fk, "<$filepath"
307 or croak "File does not exist at $filepath";
a7fbee98 308
309 while (<$fk>) {
310 next if /^\s*#/;
311 if (/^\s*\@/) {
0116f5dc 312 if (/^\s*\@version\s*(\S*)/) {
313 $self->{versionTable} ||= $1;
314 }
91ae00cb 315 elsif (/^\s*\@variable\s+(\S*)/) { # since UTS #10-9
316 $self->{variableTable} ||= $1;
317 }
318 elsif (/^\s*\@alternate\s+(\S*)/) { # till UTS #10-8
0116f5dc 319 $self->{alternateTable} ||= $1;
a7fbee98 320 }
0116f5dc 321 elsif (/^\s*\@backwards\s+(\S*)/) {
322 push @{ $self->{backwardsTable} }, $1;
a7fbee98 323 }
0116f5dc 324 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
325 push @{ $self->{forwardsTable} }, $1;
a7fbee98 326 }
0116f5dc 327 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
328 push @{ $self->{rearrangeTable} }, _getHexArray($1);
a7fbee98 329 }
330 next;
331 }
332 $self->parseEntry($_);
45394607 333 }
a7fbee98 334 close $fk;
45394607 335}
336
905aa9f0 337
45394607 338##
339## get $line, parse it, and write an entry in $self
340##
341sub parseEntry
342{
a7fbee98 343 my $self = shift;
344 my $line = shift;
4d36a948 345 my($name, $entry, @uv, @key);
a7fbee98 346
347 return if $line !~ /^\s*[0-9A-Fa-f]/;
348
349 # removes comment and gets name
350 $name = $1
351 if $line =~ s/[#%]\s*(.*)//;
352 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
353
354 # gets element
355 my($e, $k) = split /;/, $line;
356 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
357 if ! $k;
358
4d36a948 359 @uv = _getHexArray($e);
360 return if !@uv;
361
362 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 363
4d36a948 364 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 365 my $ele = pack_U(@uv);
a7fbee98 366
4d36a948 367 # regarded as if it were not entried in the table
368 return
369 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 370
4d36a948 371 # replaced as completely ignorable
372 $k = '[.0000.0000.0000.0000]'
373 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 374 }
0116f5dc 375
4d36a948 376 # replaced as completely ignorable
377 $k = '[.0000.0000.0000.0000]'
378 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
379
4c843366 380 my $is_L3_ignorable = TRUE;
4d36a948 381
caffd4cf 382 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
383 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948 384 my @wt = _getHexArray($arr);
385 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 386 $is_L3_ignorable = FALSE
387 if $wt[0] + $wt[1] + $wt[2] != 0;
4d36a948 388 # if $arr !~ /[1-9A-Fa-f]/; NG
389 # Conformance Test shows L3-ignorable is completely ignorable.
4c843366 390 # For expansion, an entry $is_L3_ignorable
391 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 392 }
caffd4cf 393
91ae00cb 394 $self->{mapping}{$entry} = \@key;
caffd4cf 395
91ae00cb 396 if (@uv > 1) {
397 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
398 and $self->{maxlength}{$uv[0]} = @uv;
399 }
400 else {
401 $is_L3_ignorable
402 ? ($self->{L3_ignorable}{$uv[0]} = TRUE)
403 : ($self->{L3_ignorable}{$uv[0]} and
404 $self->{L3_ignorable}{$uv[0]} = FALSE); # &&= stores key.
405 }
45394607 406}
407
9f1f04a1 408
45394607 409##
91ae00cb 410## arrayref[weights] = varCE(VCE)
45394607 411##
91ae00cb 412sub varCE
45394607 413{
a7fbee98 414 my $self = shift;
4d36a948 415 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
a7fbee98 416
91ae00cb 417 $self->{variable} eq 'blanked' ?
9f1f04a1 418 $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
91ae00cb 419 $self->{variable} eq 'non-ignorable' ?
4d36a948 420 \@wt :
91ae00cb 421 $self->{variable} eq 'shifted' ?
9f1f04a1 422 $var ? [Var1Wt, 0, 0, $wt[0] ]
423 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
91ae00cb 424 $self->{variable} eq 'shift-trimmed' ?
9f1f04a1 425 $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
91ae00cb 426 croak "$PACKAGE unknown variable name: $self->{variable}";
45394607 427}
428
45394607 429sub viewSortKey
430{
a7fbee98 431 my $self = shift;
9f1f04a1 432 $self->visualizeSortKey($self->getSortKey(@_));
433}
0116f5dc 434
9f1f04a1 435sub visualizeSortKey
436{
437 my $self = shift;
438 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 439
9f1f04a1 440 if ($self->{UCA_Version} <= 8) {
0116f5dc 441 $view =~ s/ ?0000 ?/|/g;
442 } else {
443 $view =~ s/\b0000\b/|/g;
444 }
a7fbee98 445 return "[$view]";
45394607 446}
447
d16e9e3d 448
45394607 449##
91ae00cb 450## arrayref of JCPS = splitEnt(string to be collated)
451## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 452##
91ae00cb 453sub splitEnt
45394607 454{
a7fbee98 455 my $self = shift;
4d36a948 456 my $wLen = $_[1];
457
a7fbee98 458 my $code = $self->{preprocess};
0116f5dc 459 my $norm = $self->{normCode};
91ae00cb 460 my $map = $self->{mapping};
a7fbee98 461 my $max = $self->{maxlength};
462 my $reH = $self->{rearrangeHash};
4d36a948 463 my $ign = $self->{L3_ignorable};
91ae00cb 464 my $ver9 = $self->{UCA_Version} >= 9;
a7fbee98 465
4d36a948 466 my ($str, @buf);
a7fbee98 467
4d36a948 468 if ($wLen) {
469 $code and croak "Preprocess breaks character positions. "
470 . "Don't use with index(), match(), etc.";
471 $norm and croak "Normalization breaks character positions. "
472 . "Don't use with index(), match(), etc.";
473 $str = $_[0];
474 }
475 else {
476 $str = $_[0];
477 $str = &$code($str) if ref $code;
478 $str = &$norm($str) if ref $norm;
479 }
a7fbee98 480
4d36a948 481 # get array of Unicode code point of string.
9f1f04a1 482 my @src = unpack_U($str);
4d36a948 483
484 # rearrangement:
485 # Character positions are not kept if rearranged,
486 # then neglected if $wLen is true.
487 if ($reH && ! $wLen) {
a7fbee98 488 for (my $i = 0; $i < @src; $i++) {
489 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
490 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
491 $i++;
492 }
493 }
45394607 494 }
45394607 495
0116f5dc 496 if ($ver9) {
4d36a948 497 # To remove a character marked as a completely ignorable.
498 for (my $i = 0; $i < @src; $i++) {
499 $src[$i] = undef if $ign->{ $src[$i] };
500 }
0116f5dc 501 }
502
a7fbee98 503 for (my $i = 0; $i < @src; $i++) {
10d7ec48 504 next if _isIllegal($src[$i]);
4d36a948 505
506 my $i_orig = $i;
91ae00cb 507 my $jcps = $src[$i];
4d36a948 508
91ae00cb 509 if ($max->{$jcps}) { # contract
510 my $temp_jcps = $jcps;
511 my $jcpsLen = 1;
512 my $maxLen = $max->{$jcps};
4d36a948 513
91ae00cb 514 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 515 next if ! defined $src[$p];
91ae00cb 516 $temp_jcps .= CODE_SEP . $src[$p];
517 $jcpsLen++;
518 if ($map->{$temp_jcps}) {
519 $jcps = $temp_jcps;
4d36a948 520 $i = $p;
521 }
522 }
4d36a948 523
06c8fc8f 524 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
525 # This process requires Unicode::Normalize.
91ae00cb 526 # If "normalization" is undef, here should be skipped *always*
06c8fc8f 527 # (in spite of bool value of $CVgetCombinClass),
528 # since canonical ordering cannot be expected.
529 # Blocked combining character should not be contracted.
530
531 if ($self->{normalization})
532 # $self->{normCode} is false in the case of "prenormalized".
533 {
534 my $preCC = 0;
535 my $curCC = 0;
536
537 for (my $p = $i + 1; $p < @src; $p++) {
538 next if ! defined $src[$p];
539 $curCC = $CVgetCombinClass->($src[$p]);
540 last unless $curCC;
541 my $tail = CODE_SEP . $src[$p];
91ae00cb 542 if ($preCC != $curCC && $map->{$jcps.$tail}) {
543 $jcps .= $tail;
06c8fc8f 544 $src[$p] = undef;
545 } else {
546 $preCC = $curCC;
547 }
4d36a948 548 }
a7fbee98 549 }
a7fbee98 550 }
551
4d36a948 552 if ($wLen) {
553 for (my $p = $i + 1; $p < @src; $p++) {
554 last if defined $src[$p];
555 $i = $p;
a7fbee98 556 }
557 }
4d36a948 558
91ae00cb 559 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 560 }
4d36a948 561 return \@buf;
d16e9e3d 562}
45394607 563
d16e9e3d 564
565##
4d36a948 566## list of arrayrefs of weights = getWt(JCPS)
d16e9e3d 567##
568sub getWt
569{
a7fbee98 570 my $self = shift;
91ae00cb 571 my $u = shift;
572 my $map = $self->{mapping};
0116f5dc 573 my $der = $self->{derivCode};
a7fbee98 574
91ae00cb 575 return if !defined $u;
576 return map($self->varCE($_), @{ $map->{$u} })
577 if $map->{$u};
a7fbee98 578
91ae00cb 579 # JCPS must not be a contraction, then it's a code point.
580 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1 581 my $hang = $self->{overrideHangul};
582 my @hangulCE;
583 if ($hang) {
584 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
585 }
586 elsif (!defined $hang) {
587 @hangulCE = $der->($u);
588 }
589 else {
590 my $max = $self->{maxlength};
591 my @decH = _decompHangul($u);
592
593 if (@decH == 2) {
594 my $contract = join(CODE_SEP, @decH);
91ae00cb 595 @decH = ($contract) if $map->{$contract};
1d2654e1 596 } else { # must be <@decH == 3>
597 if ($max->{$decH[0]}) {
598 my $contract = join(CODE_SEP, @decH);
91ae00cb 599 if ($map->{$contract}) {
1d2654e1 600 @decH = ($contract);
601 } else {
602 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 603 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1 604 }
605 # even if V's ignorable, LT contraction is not supported.
606 # If such a situatution were required, NFD should be used.
607 }
608 if (@decH == 3 && $max->{$decH[1]}) {
609 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 610 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1 611 }
612 }
613
614 @hangulCE = map({
91ae00cb 615 $map->{$_} ? @{ $map->{$_} } : $der->($_);
1d2654e1 616 } @decH);
617 }
91ae00cb 618 return map $self->varCE($_), @hangulCE;
a7fbee98 619 }
91ae00cb 620 elsif (CJK_UidIni <= $u && $u <= CJK_UidFin ||
621 CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
622 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
1d2654e1 623 my $cjk = $self->{overrideCJK};
91ae00cb 624 return map $self->varCE($_),
0116f5dc 625 $cjk
4d36a948 626 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
91ae00cb 627 : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
9f1f04a1 628 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
0116f5dc 629 : $der->($u);
a7fbee98 630 }
631 else {
91ae00cb 632 return map $self->varCE($_), $der->($u);
a7fbee98 633 }
d16e9e3d 634}
635
d16e9e3d 636
637##
638## string sortkey = getSortKey(string arg)
639##
640sub getSortKey
641{
a7fbee98 642 my $self = shift;
643 my $lev = $self->{level};
91ae00cb 644 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
645 my $ver9 = $self->{UCA_Version} >= 9;
646 my $v2i = $self->{variable} ne 'non-ignorable';
a7fbee98 647
648 # weight arrays
91ae00cb 649 my (@wts, @buf, $last_is_variable);
650
651 if ($self->{hangul_terminator}) {
652 my $preHST = '';
653 foreach my $jcps (@$rEnt) {
654 # weird things like VL, TL-contraction are not considered!
655 my $curHST = '';
656 foreach my $u (split /;/, $jcps) {
657 $curHST .= getHST($u);
658 }
659 if ($preHST && !$curHST || # hangul before non-hangul
660 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
661 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
662 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
0116f5dc 663
91ae00cb 664 push @wts, $self->varCE_HangulTerm;
665 }
666 $preHST = $curHST;
667
668 push @wts, $self->getWt($jcps);
669 }
670 $preHST # end at hangul
671 and push @wts, $self->varCE_HangulTerm;
672 }
673 else {
674 foreach my $jcps (@$rEnt) {
675 push @wts, $self->getWt($jcps);
676 }
677 }
678
679 foreach my $wt (@wts) {
4d36a948 680 if ($v2i && $ver9) {
681 if ($wt->[0] == 0) { # ignorable
682 next if $last_is_variable;
0116f5dc 683 } else {
9f1f04a1 684 $last_is_variable = ($wt->[0] == Var1Wt);
0116f5dc 685 }
686 }
4d36a948 687 push @buf, $wt;
0116f5dc 688 }
a7fbee98 689
690 # make sort key
691 my @ret = ([],[],[],[]);
692 foreach my $v (0..$lev-1) {
693 foreach my $b (@buf) {
4d36a948 694 push @{ $ret[$v] }, $b->[$v]
695 if 0 < $b->[$v];
a7fbee98 696 }
697 }
45394607 698
a7fbee98 699 # modification of tertiary weights
700 if ($self->{upper_before_lower}) {
701 foreach (@{ $ret[2] }) {
702 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
703 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
704 elsif ($_ == 0x1C) { $_ += 1 } # square upper
705 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
706 }
45394607 707 }
a7fbee98 708 if ($self->{katakana_before_hiragana}) {
709 foreach (@{ $ret[2] }) {
710 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
711 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
712 }
45394607 713 }
9f1f04a1 714
715 if ($self->{backwardsFlag}) {
716 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
717 if ($self->{backwardsFlag} & (1 << $v)) {
718 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
719 }
720 }
721 }
722
4d36a948 723 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 724}
725
726
727##
d16e9e3d 728## int compare = cmp(string a, string b)
45394607 729##
5398038e 730sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
731sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
732sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
733sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
734sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
735sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
736sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 737
738##
d16e9e3d 739## list[strings] sorted = sort(list[strings] arg)
45394607 740##
a7fbee98 741sub sort {
742 my $obj = shift;
743 return
744 map { $_->[1] }
745 sort{ $a->[0] cmp $b->[0] }
746 map [ $obj->getSortKey($_), $_ ], @_;
45394607 747}
748
0116f5dc 749
4d36a948 750sub _derivCE_9 {
0116f5dc 751 my $u = shift;
752 my $base =
91ae00cb 753 (CJK_UidIni <= $u && $u <= CJK_UidFin)
4d36a948 754 ? 0xFB40 : # CJK
91ae00cb 755 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
756 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
4d36a948 757 ? 0xFB80 # CJK ext.
758 : 0xFBC0; # others
0116f5dc 759
760 my $aaaa = $base + ($u >> 15);
761 my $bbbb = ($u & 0x7FFF) | 0x8000;
762 return
9f1f04a1 763 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
764 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc 765}
766
4d36a948 767sub _derivCE_8 {
0116f5dc 768 my $code = shift;
769 my $aaaa = 0xFF80 + ($code >> 15);
770 my $bbbb = ($code & 0x7FFF) | 0x8000;
771 return
4d36a948 772 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
773 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607 774}
775
91ae00cb 776
777sub varCE_HangulTerm {
778 my $self = shift;
779 return $self->varCE(pack(VCE_TEMPLATE,
780 NON_VAR, $self->{hangul_terminator}, 0,0,0));
781}
782
783
45394607 784##
785## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
786##
a7fbee98 787sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 788
a7fbee98 789#
4d36a948 790# $code *must* be in Hangul syllable.
a7fbee98 791# Check it before you enter here.
792#
5398038e 793sub _decompHangul {
794 my $code = shift;
91ae00cb 795 my $SIndex = $code - Hangul_SBase;
796 my $LIndex = int( $SIndex / Hangul_NCount);
797 my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
798 my $TIndex = $SIndex % Hangul_TCount;
5398038e 799 return (
91ae00cb 800 Hangul_LBase + $LIndex,
801 Hangul_VBase + $VIndex,
802 $TIndex ? (Hangul_TBase + $TIndex) : (),
5398038e 803 );
45394607 804}
805
10d7ec48 806sub _isIllegal {
4d36a948 807 my $code = shift;
808 return ! defined $code # removed
809 || ($code < 0 || 0x10FFFF < $code) # out of range
810 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
811 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
812 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
813 ;
814}
815
91ae00cb 816# Hangul Syllable Type
817sub getHST {
818 my $u = shift;
819 return
820 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
821 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
822 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
823 Hangul_SIni <= $u && $u <= Hangul_SFin ?
824 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
825}
826
4d36a948 827
828##
829## bool _nonIgnorAtLevel(arrayref weights, int level)
830##
831sub _nonIgnorAtLevel($$)
832{
833 my $wt = shift;
834 return if ! defined $wt;
835 my $lv = shift;
9f1f04a1 836 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948 837}
838
839##
840## bool _eqArray(
841## arrayref of arrayref[weights] source,
842## arrayref of arrayref[weights] substr,
843## int level)
844## * comparison of graphemes vs graphemes.
845## @$source >= @$substr must be true (check it before call this);
846##
847sub _eqArray($$$)
848{
849 my $source = shift;
850 my $substr = shift;
851 my $lev = shift;
852
853 for my $g (0..@$substr-1){
854 # Do the $g'th graphemes have the same number of AV weigths?
855 return if @{ $source->[$g] } != @{ $substr->[$g] };
856
857 for my $w (0..@{ $substr->[$g] }-1) {
858 for my $v (0..$lev-1) {
859 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
860 }
861 }
862 }
863 return 1;
864}
865
866##
867## (int position, int length)
868## int position = index(string, substring, position, [undoc'ed grobal])
869##
870## With "grobal" (only for the list context),
871## returns list of arrayref[position, length].
872##
873sub index
874{
91ae00cb 875 my $self = shift;
876 my $str = shift;
877 my $len = length($str);
878 my $subE = $self->splitEnt(shift);
879 my $pos = @_ ? shift : 0;
880 $pos = 0 if $pos < 0;
881 my $grob = shift;
882
883 my $lev = $self->{level};
884 my $ver9 = $self->{UCA_Version} >= 9;
885 my $v2i = $self->{variable} ne 'non-ignorable';
886
887 if (! @$subE) {
4d36a948 888 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
889 return $grob
890 ? map([$_, 0], $temp..$len)
891 : wantarray ? ($temp,0) : $temp;
892 }
893 if ($len < $pos) {
894 return wantarray ? () : NOMATCHPOS;
895 }
91ae00cb 896 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
897 if (! @$strE) {
4d36a948 898 return wantarray ? () : NOMATCHPOS;
899 }
900 my $last_is_variable;
901 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
902
903 $last_is_variable = FALSE;
91ae00cb 904 for my $wt (map $self->getWt($_), @$subE) {
4d36a948 905 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
906
907 if ($v2i && $ver9) {
908 if ($wt->[0] == 0) {
909 $to_be_pushed = FALSE if $last_is_variable;
910 } else {
9f1f04a1 911 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 912 }
913 }
914
915 if (@subWt && $wt->[0] == 0) {
916 push @{ $subWt[-1] }, $wt if $to_be_pushed;
917 } else {
9f1f04a1 918 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 919 push @subWt, [ $wt ];
920 }
921 }
922
923 my $count = 0;
91ae00cb 924 my $end = @$strE - 1;
4d36a948 925
926 $last_is_variable = FALSE;
927
928 for (my $i = 0; $i <= $end; ) { # no $i++
929 my $found_base = 0;
930
931 # fetch a grapheme
932 while ($i <= $end && $found_base == 0) {
91ae00cb 933 for my $wt ($self->getWt($strE->[$i][0])) {
4d36a948 934 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
935
936 if ($v2i && $ver9) {
937 if ($wt->[0] == 0) {
938 $to_be_pushed = FALSE if $last_is_variable;
939 } else {
9f1f04a1 940 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948 941 }
942 }
943
944 if (@strWt && $wt->[0] == 0) {
945 push @{ $strWt[-1] }, $wt if $to_be_pushed;
91ae00cb 946 $finPos[-1] = $strE->[$i][2];
4d36a948 947 } elsif ($to_be_pushed) {
9f1f04a1 948 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948 949 push @strWt, [ $wt ];
91ae00cb 950 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 951 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 952 push @finPos, $strE->[$i][2];
4d36a948 953 $found_base++;
954 }
955 # else ===> no-op
956 }
957 $i++;
958 }
959
960 # try to match
961 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
962 if ($iniPos[0] != NOMATCHPOS &&
963 $finPos[$#subWt] != NOMATCHPOS &&
964 _eqArray(\@strWt, \@subWt, $lev)) {
965 my $temp = $iniPos[0] + $pos;
966
967 if ($grob) {
968 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
969 splice @strWt, 0, $#subWt;
970 splice @iniPos, 0, $#subWt;
971 splice @finPos, 0, $#subWt;
972 }
973 else {
974 return wantarray
975 ? ($temp, $finPos[$#subWt] - $iniPos[0])
976 : $temp;
977 }
978 }
979 shift @strWt;
980 shift @iniPos;
981 shift @finPos;
982 }
983 }
984
985 return $grob
986 ? @g_ret
987 : wantarray ? () : NOMATCHPOS;
988}
989
990##
991## scalarref to matching part = match(string, substring)
992##
993sub match
994{
995 my $self = shift;
996 if (my($pos,$len) = $self->index($_[0], $_[1])) {
997 my $temp = substr($_[0], $pos, $len);
998 return wantarray ? $temp : \$temp;
999 # An lvalue ref \substr should be avoided,
1000 # since its value is affected by modification of its referent.
1001 }
1002 else {
1003 return;
1004 }
1005}
1006
1007##
1008## arrayref matching parts = gmatch(string, substring)
1009##
1010sub gmatch
1011{
1012 my $self = shift;
1013 my $str = shift;
1014 my $sub = shift;
1015 return map substr($str, $_->[0], $_->[1]),
1016 $self->index($str, $sub, 0, 'g');
1017}
1018
1019##
1020## bool subst'ed = subst(string, substring, replace)
1021##
1022sub subst
1023{
1024 my $self = shift;
1025 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1026
1027 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1028 if ($code) {
1029 my $mat = substr($_[0], $pos, $len);
1030 substr($_[0], $pos, $len, $code->($mat));
1031 } else {
1032 substr($_[0], $pos, $len, $_[2]);
1033 }
1034 return TRUE;
1035 }
1036 else {
1037 return FALSE;
1038 }
1039}
1040
1041##
1042## int count = gsubst(string, substring, replace)
1043##
1044sub gsubst
1045{
1046 my $self = shift;
1047 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1048 my $cnt = 0;
1049
1050 # Replacement is carried out from the end, then use reverse.
1051 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1052 if ($code) {
1053 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1054 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1055 } else {
1056 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1057 }
1058 $cnt++;
1059 }
1060 return $cnt;
1061}
1062
45394607 10631;
1064__END__
1065
1066=head1 NAME
1067
a7fbee98 1068Unicode::Collate - Unicode Collation Algorithm
45394607 1069
1070=head1 SYNOPSIS
1071
1072 use Unicode::Collate;
1073
1074 #construct
5398038e 1075 $Collator = Unicode::Collate->new(%tailoring);
45394607 1076
1077 #sort
5398038e 1078 @sorted = $Collator->sort(@not_sorted);
45394607 1079
1080 #compare
a7fbee98 1081 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1082
91ae00cb 1083 # If %tailoring is false (i.e. empty),
1084 # $Collator should do the default collation.
1085
45394607 1086=head1 DESCRIPTION
1087
4d36a948 1088This module is an implementation
1089of Unicode Technical Standard #10 (UTS #10)
1090"Unicode Collation Algorithm."
1091
45394607 1092=head2 Constructor and Tailoring
1093
d16e9e3d 1094The C<new> method returns a collator object.
1095
5398038e 1096 $Collator = Unicode::Collate->new(
0116f5dc 1097 UCA_Version => $UCA_Version,
91ae00cb 1098 alternate => $alternate, # deprecated: use of 'variable' is recommended.
45394607 1099 backwards => $levelNumber, # or \@levelNumbers
1100 entry => $element,
91ae00cb 1101 hangul_terminator => $term_primary_weight,
45394607 1102 ignoreName => qr/$ignoreName/,
1103 ignoreChar => qr/$ignoreChar/,
1104 katakana_before_hiragana => $bool,
1105 level => $collationLevel,
91ae00cb 1106 normalization => $normalization_form,
45394607 1107 overrideCJK => \&overrideCJK,
1108 overrideHangul => \&overrideHangul,
1109 preprocess => \&preprocess,
1110 rearrange => \@charList,
1111 table => $filename,
1112 undefName => qr/$undefName/,
1113 undefChar => qr/$undefChar/,
1114 upper_before_lower => $bool,
91ae00cb 1115 variable => $variable,
45394607 1116 );
45394607 1117
1118=over 4
1119
0116f5dc 1120=item UCA_Version
1121
91ae00cb 1122If the tracking version number of the older UCA is given,
1123the older behavior of that tracking version is emulated on collating.
0116f5dc 1124If omitted, the return value of C<UCA_Version()> is used.
1125
91ae00cb 1126The supported tracking version: 8, 9, or 11.
0116f5dc 1127
1128B<This parameter may be removed in the future version,
1129as switching the algorithm would affect the performance.>
1130
45394607 1131=item backwards
1132
4d36a948 1133-- see 3.1.2 French Accents, UTS #10.
45394607 1134
1135 backwards => $levelNumber or \@levelNumbers
1136
1137Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1138If omitted, forwards at all the levels.
1139
1140=item entry
1141
4d36a948 1142-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1143
91ae00cb 1144If the same character (or a sequence of characters) exists
1145in the collation element table through C<table>,
1146mapping to collation elements is overrided.
1147If it does not exist, the mapping is defined additionally.
45394607 1148
1149 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 115000E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607 11510063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
11520043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1153ENTRIES
1154
4d36a948 1155B<NOTE:> The code point in the UCA file format (before C<';'>)
1156B<must> be a Unicode code point, but not a native code point.
1157So C<0063> must always denote C<U+0063>,
1158but not a character of C<"\x63">.
1159
91ae00cb 1160=item hangul_terminator
1161
1162-- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
1163
1164If a true value is given (non-zero but should be positive),
1165it will be added as a terminator primary weight to the end of
1166every standard Hangul syllable. Secondary and any higher weights
1167for terminator are set to zero.
1168If the value is false or C<hangul_terminator> key does not exist,
1169insertion of terminator weights will not be performed.
1170
1171Boundaries of Hangul syllables are determined
1172according to conjoining Jamo behavior in F<the Unicode Standard>
1173and F<HangulSyllableType.txt>.
1174
1175B<Implementation Note:>
1176(1) For expansion mapping (Unicode character mapped
1177to a sequence of collation elements), a terminator will not be added
1178between collation elements, even if Hangul syllable boundary exists there.
1179Addition of terminator is restricted to the next position
1180to the last collation element.
1181
1182(2) Non-conjoining Hangul letters
1183(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1184automatically terminated with a terminator primary weight.
1185These characters may need terminator included in a collation element
1186table beforehand.
1187
45394607 1188=item ignoreName
1189
1190=item ignoreChar
1191
4d36a948 1192-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
45394607 1193
caffd4cf 1194Makes the entry in the table completely ignorable;
1195i.e. as if the weights were zero at all level.
45394607 1196
a7fbee98 1197E.g. when 'a' and 'e' are ignorable,
45394607 1198'element' is equal to 'lament' (or 'lmnt').
1199
1200=item level
1201
4d36a948 1202-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1203
1204Set the maximum level.
1205Any higher levels than the specified one are ignored.
1206
1207 Level 1: alphabetic ordering
1208 Level 2: diacritic ordering
1209 Level 3: case ordering
91ae00cb 1210 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607 1211
1212 ex.level => 2,
1213
a7fbee98 1214If omitted, the maximum is the 4th.
1215
45394607 1216=item normalization
1217
4d36a948 1218-- see 4.1 Normalize each input string, UTS #10.
45394607 1219
905aa9f0 1220If specified, strings are normalized before preparation of sort keys
45394607 1221(the normalization is executed after preprocess).
1222
1d2654e1 1223A form name C<Unicode::Normalize::normalize()> accepts will be applied
1224as C<$normalization_form>.
06c8fc8f 1225Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1 1226See C<Unicode::Normalize::normalize()> for detail.
1227If omitted, C<'NFD'> is used.
45394607 1228
91ae00cb 1229C<normalization> is performed after C<preprocess> (if defined).
45394607 1230
06c8fc8f 1231Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1232though they are not concerned with C<Unicode::Normalize::normalize()>.
1233
1234If C<undef> (not a string C<"undef">) is passed explicitly
1235as the value for this key,
45394607 1236any normalization is not carried out (this may make tailoring easier
1237if any normalization is not desired).
06c8fc8f 1238Under C<(normalization =E<gt> undef)>, only contiguous contractions
1239are resolved; e.g. C<A-cedilla-ring> would be primary equal to C<A>,
1240even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>.
1241In this point,
1242C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1243B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1244
1245In the case of C<(normalization =E<gt> "prenormalized")>,
1246any normalization is not performed, but
1247non-contiguous contractions with combining characters are performed.
1248Therefore
1249C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1250B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1251If source strings are finely prenormalized,
1252C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1253
1254Except C<(normalization =E<gt> undef)>,
1255B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607 1256
1257=item overrideCJK
1258
4d36a948 1259-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1260
91ae00cb 1261By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1262(but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than
1263C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
1264C<U+20000> to C<U+2A6D6>].
1265
1266Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
45394607 1267
a7fbee98 1268ex. CJK Unified Ideographs in the JIS code point order.
45394607 1269
1270 overrideCJK => sub {
a7fbee98 1271 my $u = shift; # get a Unicode codepoint
1272 my $b = pack('n', $u); # to UTF-16BE
1273 my $s = your_unicode_to_sjis_converter($b); # convert
1274 my $n = unpack('n', $s); # convert sjis to short
1275 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1276 },
1277
a7fbee98 1278ex. ignores all CJK Unified Ideographs.
1279
1280 overrideCJK => sub {()}, # CODEREF returning empty list
1281
1282 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1283 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1284
1285If C<undef> is passed explicitly as the value for this key,
1286weights for CJK Unified Ideographs are treated as undefined.
1287But assignment of weight for CJK Unified Ideographs
91ae00cb 1288in table or C<entry> is still valid.
a7fbee98 1289
1290=item overrideHangul
1291
4d36a948 1292-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1293
1294By default, Hangul Syllables are decomposed into Hangul Jamo.
1295But the mapping of Hangul Syllables may be overrided.
1296
91ae00cb 1297This tag works like C<overrideCJK>, so see there for examples.
a7fbee98 1298
45394607 1299If you want to override the mapping of Hangul Syllables,
1300the Normalization Forms D and KD are not appropriate
1301(they will be decomposed before overriding).
1302
a7fbee98 1303If C<undef> is passed explicitly as the value for this key,
1304weight for Hangul Syllables is treated as undefined
1305without decomposition into Hangul Jamo.
1306But definition of weight for Hangul Syllables
91ae00cb 1307in table or C<entry> is still valid.
a7fbee98 1308
45394607 1309=item preprocess
1310
4d36a948 1311-- see 5.1 Preprocessing, UTS #10.
45394607 1312
1313If specified, the coderef is used to preprocess
1314before the formation of sort keys.
1315
a7fbee98 1316ex. dropping English articles, such as "a" or "the".
45394607 1317Then, "the pen" is before "a pencil".
1318
1319 preprocess => sub {
1320 my $str = shift;
a7fbee98 1321 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1322 return $str;
45394607 1323 },
1324
91ae00cb 1325C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1326
45394607 1327=item rearrange
1328
4d36a948 1329-- see 3.1.3 Rearrangement, UTS #10.
45394607 1330
1331Characters that are not coded in logical order and to be rearranged.
a7fbee98 1332By default,
45394607 1333
1334 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1335
a7fbee98 1336If you want to disallow any rearrangement,
1337pass C<undef> or C<[]> (a reference to an empty list)
1338as the value for this key.
1339
0116f5dc 1340B<According to the version 9 of UCA, this parameter shall not be used;
1341but it is not warned at present.>
1342
45394607 1343=item table
1344
4d36a948 1345-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1346
91ae00cb 1347You can use another collation element table if desired.
ae6aa562 1348The table file must be put into a directory
10d7ec48 1349where F<Unicode/Collate.pm> is installed; e.g. into
1350F<perl/lib/Unicode/Collate/> if you have F<perl/lib/Unicode/Collate.pm>.
45394607 1351
10d7ec48 1352By default, the filename F<allkeys.txt> is used.
45394607 1353
a7fbee98 1354If C<undef> is passed explicitly as the value for this key,
91ae00cb 1355no file is read (but you can define collation elements via C<entry>).
a7fbee98 1356
1357A typical way to define a collation element table
1358without any file of table:
1359
1360 $onlyABC = Unicode::Collate->new(
1361 table => undef,
1362 entry => << 'ENTRIES',
13630061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
13640041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
13650062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
13660042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
13670063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
13680043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1369ENTRIES
1370 );
905aa9f0 1371
45394607 1372=item undefName
1373
1374=item undefChar
1375
4d36a948 1376-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1377
1378Undefines the collation element as if it were unassigned in the table.
1379This reduces the size of the table.
1380If an unassigned character appears in the string to be collated,
1381the sort key is made from its codepoint
1382as a single-character collation element,
1383as it is greater than any other assigned collation elements
1384(in the codepoint order among the unassigned characters).
1385But, it'd be better to ignore characters
1386unfamiliar to you and maybe never used.
1387
1388=item katakana_before_hiragana
1389
1390=item upper_before_lower
1391
4d36a948 1392-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
45394607 1393
1394By default, lowercase is before uppercase
1395and hiragana is before katakana.
1396
a7fbee98 1397If the tag is made true, this is reversed.
1398
1399B<NOTE>: These tags simplemindedly assume
1400any lowercase/uppercase or hiragana/katakana distinctions
9f1f04a1 1401must occur in level 3, and their weights at level 3
1402must be same as those mentioned in 7.3.1, UTS #10.
1403If you define your collation elements which violate this requirement,
4d36a948 1404these tags don't work validly.
45394607 1405
91ae00cb 1406=item variable
1407
1408=item alternate
1409
1410-- see 3.2.2 Variable Weighting, UTS #10.
1411
1412(the title in UCA version 8: Alternate Weighting)
1413
1414This key allows to variable weighting for variable collation elements,
1415which are marked with an ASTERISK in the table
1416(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1417
1418 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1419
1420These names are case-insensitive.
1421By default (if specification is omitted), 'shifted' is adopted.
1422
1423 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1424 considered at the 4th level.
1425
1426 'Non-ignorable' Variable elements are not reset to ignorable.
1427
1428 'Shifted' Variable elements are made ignorable at levels 1 through 3
1429 their level 4 weight is replaced by the old level 1 weight.
1430 Level 4 weight for Non-Variable elements is 0xFFFF.
1431
1432 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1433 are trimmed.
1434
1435For backward compatibility, C<alternate> can be used as an alias
1436for C<variable>.
1437
45394607 1438=back
1439
3164dd77 1440=head2 Methods for Collation
45394607 1441
1442=over 4
1443
5398038e 1444=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1445
1446Sorts a list of strings.
1447
5398038e 1448=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1449
1450Returns 1 (when C<$a> is greater than C<$b>)
1451or 0 (when C<$a> is equal to C<$b>)
1452or -1 (when C<$a> is lesser than C<$b>).
1453
5398038e 1454=item C<$result = $Collator-E<gt>eq($a, $b)>
1455
1456=item C<$result = $Collator-E<gt>ne($a, $b)>
1457
1458=item C<$result = $Collator-E<gt>lt($a, $b)>
1459
1460=item C<$result = $Collator-E<gt>le($a, $b)>
1461
1462=item C<$result = $Collator-E<gt>gt($a, $b)>
1463
1464=item C<$result = $Collator-E<gt>ge($a, $b)>
1465
a7fbee98 1466They works like the same name operators as theirs.
5398038e 1467
1468 eq : whether $a is equal to $b.
1469 ne : whether $a is not equal to $b.
1470 lt : whether $a is lesser than $b.
1471 le : whether $a is lesser than $b or equal to $b.
1472 gt : whether $a is greater than $b.
1473 ge : whether $a is greater than $b or equal to $b.
1474
1475=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1476
4d36a948 1477-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1478
1479Returns a sort key.
1480
1481You compare the sort keys using a binary comparison
1482and get the result of the comparison of the strings using UCA.
1483
5398038e 1484 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1485
1486 is equivalent to
1487
5398038e 1488 $Collator->cmp($a, $b)
45394607 1489
a7fbee98 1490=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1491
a7fbee98 1492 use Unicode::Collate;
1493 my $c = Unicode::Collate->new();
1494 print $c->viewSortKey("Perl"),"\n";
1495
0116f5dc 1496 # output:
1497 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1498 # Level 1 Level 2 Level 3 Level 4
1499
1500 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1501
4d36a948 1502=back
1503
1504=head2 Methods for Searching
d16e9e3d 1505
4d36a948 1506B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1507for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1508C<subst>, C<gsubst>) is croaked,
1509as the position and the length might differ
1510from those on the specified string.
91ae00cb 1511(And C<rearrange> and C<hangul_terminator> tags are neglected.)
d16e9e3d 1512
4d36a948 1513The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1514like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1515but they are not aware of any pattern, but only a literal substring.
1516
1517=over 4
1518
1519=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1520
1521=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1522
1523If C<$substring> matches a part of C<$string>, returns
1524the position of the first occurrence of the matching part in scalar context;
1525in list context, returns a two-element list of
1526the position and the length of the matching part.
1527
d16e9e3d 1528If C<$substring> does not match any part of C<$string>,
1529returns C<-1> in scalar context and
1530an empty list in list context.
1531
1532e.g. you say
1533
5398038e 1534 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1535 # (normalization => undef) is REQUIRED.
1536 my $str = "Ich muß studieren Perl.";
1537 my $sub = "MÜSS";
d16e9e3d 1538 my $match;
a7fbee98 1539 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1540 $match = substr($str, $pos, $len);
d16e9e3d 1541 }
1542
4d36a948 1543and get C<"muß"> in C<$match> since C<"muß">
1544is primary equal to C<"MÜSS">.
1545
1546=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1547
1548=item C<($match) = $Collator-E<gt>match($string, $substring)>
1549
1550If C<$substring> matches a part of C<$string>, in scalar context, returns
1551B<a reference to> the first occurrence of the matching part
1552(C<$match_ref> is always true if matches,
1553since every reference is B<true>);
1554in list context, returns the first occurrence of the matching part.
1555
1556If C<$substring> does not match any part of C<$string>,
1557returns C<undef> in scalar context and
1558an empty list in list context.
1559
1560e.g.
1561
1562 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1563 print "matches [$$match_ref].\n";
1564 } else {
1565 print "doesn't match.\n";
1566 }
1567
1568 or
1569
1570 if (($match) = $Collator->match($str, $sub)) { # list context
1571 print "matches [$match].\n";
1572 } else {
1573 print "doesn't match.\n";
1574 }
1575
1576=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1577
1578If C<$substring> matches a part of C<$string>, returns
1579all the matching parts (or matching count in scalar context).
1580
1581If C<$substring> does not match any part of C<$string>,
1582returns an empty list.
1583
1584=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1585
1586If C<$substring> matches a part of C<$string>,
1587the first occurrence of the matching part is replaced by C<$replacement>
1588(C<$string> is modified) and return C<$count> (always equals to C<1>).
1589
1590C<$replacement> can be a C<CODEREF>,
1591taking the matching part as an argument,
1592and returning a string to replace the matching part
1593(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1594
1595=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1596
1597If C<$substring> matches a part of C<$string>,
1598all the occurrences of the matching part is replaced by C<$replacement>
1599(C<$string> is modified) and return C<$count>.
1600
1601C<$replacement> can be a C<CODEREF>,
1602taking the matching part as an argument,
1603and returning a string to replace the matching part
1604(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1605
1606e.g.
1607
1608 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1609 # (normalization => undef) is REQUIRED.
1610 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1611 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1612
1613 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1614 # i.e., all the camels are made bold-faced.
d16e9e3d 1615
45394607 1616=back
1617
3164dd77 1618=head2 Other Methods
1619
1620=over 4
1621
0116f5dc 1622=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1623
1624Change the value of specified keys and returns the changed part.
1625
1626 $Collator = Unicode::Collate->new(level => 4);
1627
1628 $Collator->eq("perl", "PERL"); # false
1629
1630 %old = $Collator->change(level => 2); # returns (level => 4).
1631
1632 $Collator->eq("perl", "PERL"); # true
1633
1634 $Collator->change(%old); # returns (level => 2).
1635
1636 $Collator->eq("perl", "PERL"); # false
1637
1638Not all C<(key,value)>s are allowed to be changed.
1639See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1640
1641In the scalar context, returns the modified collator
1642(but it is B<not> a clone from the original).
1643
1644 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1645
1646 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1647
1648 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1649
91ae00cb 1650=item C<$version = $Collator-E<gt>version()>
3164dd77 1651
91ae00cb 1652Returns the version number (a string) of the Unicode Standard
1653which the C<table> file used by the collator object is based on.
1654If the table does not include a version line (starting with C<@version>),
1655returns C<"unknown">.
1656
1657=item C<UCA_Version()>
3164dd77 1658
91ae00cb 1659Returns the tracking version number of UTS #10 this module consults.
3164dd77 1660
91ae00cb 1661=item C<Base_Unicode_Version()>
1662
1663Returns the version number of UTS #10 this module consults.
3164dd77 1664
1665=back
1666
45394607 1667=head2 EXPORT
1668
1669None by default.
1670
1671=head2 CAVEAT
1672
1673Use of the C<normalization> parameter requires
1674the B<Unicode::Normalize> module.
1675
5398038e 1676If you need not it (say, in the case when you need not
45394607 1677handle any combining characters),
1678assign C<normalization =E<gt> undef> explicitly.
1679
4d36a948 1680-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1681
0116f5dc 1682=head2 Conformance Test
1683
10d7ec48 1684The Conformance Test for the UCA is available
1685under L<http://www.unicode.org/Public/UCA/>.
0116f5dc 1686
1687For F<CollationTest_SHIFTED.txt>,
1688a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1689for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 1690C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 1691
4d36a948 1692B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1693
45394607 1694=head1 AUTHOR
1695
10d7ec48 1696SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
45394607 1697
1698 http://homepage1.nifty.com/nomenclator/perl/
1699
ae6aa562 1700 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1701
a7fbee98 1702 This library is free software; you can redistribute it
1703 and/or modify it under the same terms as Perl itself.
45394607 1704
1705=head1 SEE ALSO
1706
1707=over 4
1708
91ae00cb 1709=item Unicode Collation Algorithm - UTS #10
1710
1711L<http://www.unicode.org/reports/tr10/>
1712
1713=item The Default Unicode Collation Element Table (DUCET)
1714
10d7ec48 1715L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 1716
91ae00cb 1717=item The conformance test for the UCA
45394607 1718
10d7ec48 1719L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 1720
10d7ec48 1721L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 1722
91ae00cb 1723=item Hangul Syllable Type
0116f5dc 1724
10d7ec48 1725L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 1726
91ae00cb 1727=item Unicode Normalization Forms - UAX #15
a7fbee98 1728
91ae00cb 1729L<http://www.unicode.org/reports/tr15/>
a7fbee98 1730
a7fbee98 1731=item L<Unicode::Normalize>
45394607 1732
45394607 1733=back
1734
1735=cut