Upgrade to Time::Local 1.10.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
10d7ec48 15no warnings 'utf8';
16
45394607 17require Exporter;
18
abd1ec54 19our $VERSION = '0.33';
45394607 20our $PACKAGE = __PACKAGE__;
21
22our @ISA = qw(Exporter);
23
24our %EXPORT_TAGS = ();
25our @EXPORT_OK = ();
26our @EXPORT = ();
27
28(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
29our $KeyFile = "allkeys.txt";
30
4d36a948 31# Perl's boolean
32use constant TRUE => 1;
33use constant FALSE => "";
34use constant NOMATCHPOS => -1;
35
36# A coderef to get combining class imported from Unicode::Normalize
37# (i.e. \&Unicode::Normalize::getCombinClass).
38# This is also used as a HAS_UNICODE_NORMALIZE flag.
06c8fc8f 39our $CVgetCombinClass;
4d36a948 40
9f1f04a1 41# Supported Levels
42use constant MinLevel => 1;
43use constant MaxLevel => 4;
44
4d36a948 45# Minimum weights at level 2 and 3, respectively
9f1f04a1 46use constant Min2Wt => 0x20;
47use constant Min3Wt => 0x02;
4d36a948 48
49# Shifted weight at 4th level
9f1f04a1 50use constant Shift4Wt => 0xFFFF;
4d36a948 51
4d36a948 52# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
53# PROBLEM: The Default Unicode Collation Element Table
54# has weights over 0xFFFF at the 4th level.
55# The tie-breaking in the variable weights
56# other than "shift" (as well as "shift-trimmed") is unreliable.
57use constant VCE_TEMPLATE => 'Cn4';
58
4d36a948 59# A sort key: 16-bit weights
60# See also the PROBLEM on VCE_TEMPLATE above.
61use constant KEY_TEMPLATE => 'n*';
62
63# Level separator in a sort key:
64# i.e. pack(KEY_TEMPLATE, 0)
65use constant LEVEL_SEP => "\0\0";
66
67# As Unicode code point separator for hash keys.
68# A joined code point string (denoted by JCPS below)
69# like "65;768" is used for internal processing
70# instead of Perl's Unicode string like "\x41\x{300}",
71# as the native code point is different from the Unicode code point
72# on EBCDIC platform.
73# This character must not be included in any stringified
74# representation of an integer.
75use constant CODE_SEP => ';';
76
77# boolean values of variable weights
0116f5dc 78use constant NON_VAR => 0; # Non-Variable character
79use constant VAR => 1; # Variable character
3164dd77 80
91ae00cb 81# specific code points
82use constant Hangul_LBase => 0x1100;
83use constant Hangul_LIni => 0x1100;
84use constant Hangul_LFin => 0x1159;
85use constant Hangul_LFill => 0x115F;
86use constant Hangul_VBase => 0x1161;
87use constant Hangul_VIni => 0x1160;
88use constant Hangul_VFin => 0x11A2;
89use constant Hangul_TBase => 0x11A7;
90use constant Hangul_TIni => 0x11A8;
91use constant Hangul_TFin => 0x11F9;
92use constant Hangul_TCount => 28;
93use constant Hangul_NCount => 588;
94use constant Hangul_SBase => 0xAC00;
95use constant Hangul_SIni => 0xAC00;
96use constant Hangul_SFin => 0xD7A3;
97use constant CJK_UidIni => 0x4E00;
98use constant CJK_UidFin => 0x9FA5;
99use constant CJK_ExtAIni => 0x3400;
100use constant CJK_ExtAFin => 0x4DB5;
101use constant CJK_ExtBIni => 0x20000;
102use constant CJK_ExtBFin => 0x2A6D6;
103use constant BMP_Max => 0xFFFF;
104
4d36a948 105# Logical_Order_Exception in PropList.txt
106# TODO: synchronization with change of PropList.txt.
0116f5dc 107our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 108
91ae00cb 109sub UCA_Version { "11" }
a7fbee98 110
91ae00cb 111sub Base_Unicode_Version { "4.0" }
a7fbee98 112
9f1f04a1 113######
114
9f1f04a1 115sub pack_U {
ae6aa562 116 return pack('U*', @_);
9f1f04a1 117}
118
119sub unpack_U {
ae6aa562 120 return unpack('U*', pack('U*').shift);
9f1f04a1 121}
122
123######
124
91ae00cb 125my (%VariableOK);
126@VariableOK{ qw/
0116f5dc 127 blanked non-ignorable shifted shift-trimmed
91ae00cb 128 / } = (); # keys lowercased
0116f5dc 129
130our @ChangeOK = qw/
131 alternate backwards level normalization rearrange
132 katakana_before_hiragana upper_before_lower
133 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 134 hangul_terminator variable
0116f5dc 135 /;
136
137our @ChangeNG = qw/
91ae00cb 138 entry mapping table maxlength
139 ignoreChar ignoreName undefChar undefName variableTable
0116f5dc 140 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
4d36a948 141 derivCode normCode rearrangeHash L3_ignorable
9f1f04a1 142 backwardsFlag
0116f5dc 143 /;
9f1f04a1 144# The hash key 'ignored' is deleted at v 0.21.
145# The hash key 'isShift' is deleted at v 0.23.
146# The hash key 'combining' is deleted at v 0.24.
91ae00cb 147# The hash key 'entries' is deleted at v 0.30.
148
149sub version {
150 my $self = shift;
151 return $self->{versionTable} || 'unknown';
152}
0116f5dc 153
154my (%ChangeOK, %ChangeNG);
155@ChangeOK{ @ChangeOK } = ();
156@ChangeNG{ @ChangeNG } = ();
157
158sub change {
159 my $self = shift;
160 my %hash = @_;
161 my %old;
91ae00cb 162 if (exists $hash{variable} && exists $hash{alternate}) {
163 delete $hash{alternate};
164 }
165 elsif (!exists $hash{variable} && exists $hash{alternate}) {
166 $hash{variable} = $hash{alternate};
167 }
0116f5dc 168 foreach my $k (keys %hash) {
169 if (exists $ChangeOK{$k}) {
170 $old{$k} = $self->{$k};
171 $self->{$k} = $hash{$k};
172 }
173 elsif (exists $ChangeNG{$k}) {
174 croak "change of $k via change() is not allowed!";
175 }
176 # else => ignored
177 }
178 $self->checkCollator;
179 return wantarray ? %old : $self;
180}
a7fbee98 181
9f1f04a1 182sub _checkLevel {
183 my $level = shift;
abd1ec54 184 my $key = shift; # 'level' or 'backwards'
185 MinLevel <= $level or croak sprintf
186 "Illegal level %d (in value for key '%s') lower than %d.",
187 $level, $key, MinLevel;
188 $level <= MaxLevel or croak sprintf
189 "Unsupported level %d (in value for key '%s') higher than %d.",
190 $level, $key, MaxLevel;
9f1f04a1 191}
192
91ae00cb 193my %DerivCode = (
194 8 => \&_derivCE_8,
195 9 => \&_derivCE_9,
196 11 => \&_derivCE_9, # 11 == 9
197);
198
0116f5dc 199sub checkCollator {
200 my $self = shift;
9f1f04a1 201 _checkLevel($self->{level}, "level");
a7fbee98 202
91ae00cb 203 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
204 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 205
91ae00cb 206 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 207 $self->{alternateTable} || 'shifted';
91ae00cb 208 $self->{variable} = $self->{alternate} = lc($self->{variable});
209 exists $VariableOK{ $self->{variable} }
210 or croak "$PACKAGE unknown variable tag name: $self->{variable}";
0116f5dc 211
9f1f04a1 212 if (! defined $self->{backwards}) {
213 $self->{backwardsFlag} = 0;
214 }
215 elsif (! ref $self->{backwards}) {
216 _checkLevel($self->{backwards}, "backwards");
217 $self->{backwardsFlag} = 1 << $self->{backwards};
218 }
219 else {
220 my %level;
221 $self->{backwardsFlag} = 0;
222 for my $b (@{ $self->{backwards} }) {
223 _checkLevel($b, "backwards");
224 $level{$b} = 1;
225 }
226 for my $v (sort keys %level) {
227 $self->{backwardsFlag} += 1 << $v;
228 }
229 }
0116f5dc 230
91ae00cb 231 defined $self->{rearrange} or $self->{rearrange} = [];
232 ref $self->{rearrange}
233 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc 234
235 # keys of $self->{rearrangeHash} are $self->{rearrange}.
236 $self->{rearrangeHash} = undef;
237
238 if (@{ $self->{rearrange} }) {
239 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
240 }
241
242 $self->{normCode} = undef;
a7fbee98 243
244 if (defined $self->{normalization}) {
245 eval { require Unicode::Normalize };
91ae00cb 246 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 247
91ae00cb 248 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 249
91ae00cb 250 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
251 $self->{normCode} = \&Unicode::Normalize::NFD;
252 }
253 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f 254 my $norm = $self->{normalization};
255 $self->{normCode} = sub {
1d2654e1 256 Unicode::Normalize::normalize($norm, shift);
257 };
06c8fc8f 258 eval { $self->{normCode}->("") }; # try
259 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 260 }
a7fbee98 261 }
0116f5dc 262 return;
263}
264
265sub new
266{
267 my $class = shift;
268 my $self = bless { @_ }, $class;
45394607 269
a7fbee98 270 # If undef is passed explicitly, no file is read.
0116f5dc 271 $self->{table} = $KeyFile if ! exists $self->{table};
272 $self->read_table if defined $self->{table};
905aa9f0 273
a7fbee98 274 if ($self->{entry}) {
275 $self->parseEntry($_) foreach split /\n/, $self->{entry};
276 }
905aa9f0 277
9f1f04a1 278 $self->{level} ||= MaxLevel;
0116f5dc 279 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 280
abd1ec54 281 $self->{overrideHangul} = FALSE
0116f5dc 282 if ! exists $self->{overrideHangul};
abd1ec54 283 $self->{overrideCJK} = FALSE
0116f5dc 284 if ! exists $self->{overrideCJK};
06c8fc8f 285 $self->{normalization} = 'NFD'
0116f5dc 286 if ! exists $self->{normalization};
0116f5dc 287 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 288 if ! exists $self->{rearrange};
0116f5dc 289 $self->{backwards} = $self->{backwardsTable}
290 if ! exists $self->{backwards};
a7fbee98 291
0116f5dc 292 $self->checkCollator;
a7fbee98 293
294 return $self;
295}
905aa9f0 296
297sub read_table {
a7fbee98 298 my $self = shift;
a7fbee98 299
abd1ec54 300 my $filepath = File::Spec->catfile($Path, $self->{table});
e69a2255 301 open my $fk, "<$filepath"
302 or croak "File does not exist at $filepath";
a7fbee98 303
304 while (<$fk>) {
305 next if /^\s*#/;
abd1ec54 306 unless (s/^\s*\@//) {
307 $self->parseEntry($_);
a7fbee98 308 next;
309 }
abd1ec54 310
311 if (/^version\s*(\S*)/) {
312 $self->{versionTable} ||= $1;
313 }
314 elsif (/^variable\s+(\S*)/) { # since UTS #10-9
315 $self->{variableTable} ||= $1;
316 }
317 elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
318 $self->{alternateTable} ||= $1;
319 }
320 elsif (/^backwards\s+(\S*)/) {
321 push @{ $self->{backwardsTable} }, $1;
322 }
323 elsif (/^forwards\s+(\S*)/) { # parhaps no use
324 push @{ $self->{forwardsTable} }, $1;
325 }
326 elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
327 push @{ $self->{rearrangeTable} }, _getHexArray($1);
328 }
45394607 329 }
a7fbee98 330 close $fk;
45394607 331}
332
905aa9f0 333
45394607 334##
335## get $line, parse it, and write an entry in $self
336##
337sub parseEntry
338{
a7fbee98 339 my $self = shift;
340 my $line = shift;
4d36a948 341 my($name, $entry, @uv, @key);
a7fbee98 342
343 return if $line !~ /^\s*[0-9A-Fa-f]/;
344
345 # removes comment and gets name
346 $name = $1
347 if $line =~ s/[#%]\s*(.*)//;
348 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
349
350 # gets element
351 my($e, $k) = split /;/, $line;
352 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
353 if ! $k;
354
4d36a948 355 @uv = _getHexArray($e);
356 return if !@uv;
357
358 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 359
4d36a948 360 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 361 my $ele = pack_U(@uv);
a7fbee98 362
4d36a948 363 # regarded as if it were not entried in the table
364 return
365 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 366
4d36a948 367 # replaced as completely ignorable
368 $k = '[.0000.0000.0000.0000]'
369 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 370 }
0116f5dc 371
4d36a948 372 # replaced as completely ignorable
373 $k = '[.0000.0000.0000.0000]'
374 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
375
4c843366 376 my $is_L3_ignorable = TRUE;
4d36a948 377
caffd4cf 378 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
379 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948 380 my @wt = _getHexArray($arr);
381 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 382 $is_L3_ignorable = FALSE
383 if $wt[0] + $wt[1] + $wt[2] != 0;
4d36a948 384 # if $arr !~ /[1-9A-Fa-f]/; NG
385 # Conformance Test shows L3-ignorable is completely ignorable.
4c843366 386 # For expansion, an entry $is_L3_ignorable
387 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 388 }
caffd4cf 389
91ae00cb 390 $self->{mapping}{$entry} = \@key;
caffd4cf 391
91ae00cb 392 if (@uv > 1) {
393 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
394 and $self->{maxlength}{$uv[0]} = @uv;
395 }
396 else {
397 $is_L3_ignorable
398 ? ($self->{L3_ignorable}{$uv[0]} = TRUE)
399 : ($self->{L3_ignorable}{$uv[0]} and
400 $self->{L3_ignorable}{$uv[0]} = FALSE); # &&= stores key.
401 }
45394607 402}
403
9f1f04a1 404
45394607 405##
abd1ec54 406## VCE = _varCE(variable term, VCE)
45394607 407##
abd1ec54 408sub _varCE
45394607 409{
abd1ec54 410 my $vbl = shift;
411 my $vce = shift;
412 if ($vbl eq 'non-ignorable') {
413 return $vce;
414 }
415 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
416
417 if ($var) {
418 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
419 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
420 }
421 elsif ($vbl eq 'blanked') {
422 return $vce;
423 }
424 else {
425 return pack(VCE_TEMPLATE, $var, @wt[0..2],
426 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
427 }
45394607 428}
429
45394607 430sub viewSortKey
431{
a7fbee98 432 my $self = shift;
9f1f04a1 433 $self->visualizeSortKey($self->getSortKey(@_));
434}
0116f5dc 435
9f1f04a1 436sub visualizeSortKey
437{
438 my $self = shift;
439 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 440
9f1f04a1 441 if ($self->{UCA_Version} <= 8) {
0116f5dc 442 $view =~ s/ ?0000 ?/|/g;
443 } else {
444 $view =~ s/\b0000\b/|/g;
445 }
a7fbee98 446 return "[$view]";
45394607 447}
448
d16e9e3d 449
45394607 450##
91ae00cb 451## arrayref of JCPS = splitEnt(string to be collated)
452## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 453##
91ae00cb 454sub splitEnt
45394607 455{
a7fbee98 456 my $self = shift;
4d36a948 457 my $wLen = $_[1];
458
a7fbee98 459 my $code = $self->{preprocess};
0116f5dc 460 my $norm = $self->{normCode};
91ae00cb 461 my $map = $self->{mapping};
a7fbee98 462 my $max = $self->{maxlength};
463 my $reH = $self->{rearrangeHash};
4d36a948 464 my $ign = $self->{L3_ignorable};
91ae00cb 465 my $ver9 = $self->{UCA_Version} >= 9;
a7fbee98 466
4d36a948 467 my ($str, @buf);
a7fbee98 468
4d36a948 469 if ($wLen) {
470 $code and croak "Preprocess breaks character positions. "
471 . "Don't use with index(), match(), etc.";
472 $norm and croak "Normalization breaks character positions. "
473 . "Don't use with index(), match(), etc.";
474 $str = $_[0];
475 }
476 else {
477 $str = $_[0];
478 $str = &$code($str) if ref $code;
479 $str = &$norm($str) if ref $norm;
480 }
a7fbee98 481
4d36a948 482 # get array of Unicode code point of string.
9f1f04a1 483 my @src = unpack_U($str);
4d36a948 484
485 # rearrangement:
486 # Character positions are not kept if rearranged,
487 # then neglected if $wLen is true.
488 if ($reH && ! $wLen) {
a7fbee98 489 for (my $i = 0; $i < @src; $i++) {
490 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
491 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
492 $i++;
493 }
494 }
45394607 495 }
45394607 496
abd1ec54 497 # To remove a character marked as a completely ignorable.
498 for (my $i = 0; $i < @src; $i++) {
499 $src[$i] = undef
500 if _isIllegal($src[$i]) || ($ver9 && $ign->{ $src[$i] });
0116f5dc 501 }
502
a7fbee98 503 for (my $i = 0; $i < @src; $i++) {
91ae00cb 504 my $jcps = $src[$i];
abd1ec54 505 next if ! defined $jcps;
506 my $i_orig = $i;
4d36a948 507
91ae00cb 508 if ($max->{$jcps}) { # contract
509 my $temp_jcps = $jcps;
510 my $jcpsLen = 1;
511 my $maxLen = $max->{$jcps};
4d36a948 512
91ae00cb 513 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 514 next if ! defined $src[$p];
91ae00cb 515 $temp_jcps .= CODE_SEP . $src[$p];
516 $jcpsLen++;
517 if ($map->{$temp_jcps}) {
518 $jcps = $temp_jcps;
4d36a948 519 $i = $p;
520 }
521 }
4d36a948 522
06c8fc8f 523 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
524 # This process requires Unicode::Normalize.
91ae00cb 525 # If "normalization" is undef, here should be skipped *always*
06c8fc8f 526 # (in spite of bool value of $CVgetCombinClass),
527 # since canonical ordering cannot be expected.
528 # Blocked combining character should not be contracted.
529
530 if ($self->{normalization})
531 # $self->{normCode} is false in the case of "prenormalized".
532 {
533 my $preCC = 0;
534 my $curCC = 0;
535
536 for (my $p = $i + 1; $p < @src; $p++) {
537 next if ! defined $src[$p];
538 $curCC = $CVgetCombinClass->($src[$p]);
539 last unless $curCC;
540 my $tail = CODE_SEP . $src[$p];
91ae00cb 541 if ($preCC != $curCC && $map->{$jcps.$tail}) {
542 $jcps .= $tail;
06c8fc8f 543 $src[$p] = undef;
544 } else {
545 $preCC = $curCC;
546 }
4d36a948 547 }
a7fbee98 548 }
a7fbee98 549 }
550
4d36a948 551 if ($wLen) {
abd1ec54 552 for (; $i + 1 < @src; $i++) {
553 last if defined $src[$i + 1];
a7fbee98 554 }
555 }
4d36a948 556
91ae00cb 557 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 558 }
4d36a948 559 return \@buf;
d16e9e3d 560}
45394607 561
d16e9e3d 562
563##
abd1ec54 564## list of VCE = getWt(JCPS)
d16e9e3d 565##
566sub getWt
567{
a7fbee98 568 my $self = shift;
91ae00cb 569 my $u = shift;
abd1ec54 570 my $vbl = $self->{variable};
91ae00cb 571 my $map = $self->{mapping};
0116f5dc 572 my $der = $self->{derivCode};
a7fbee98 573
91ae00cb 574 return if !defined $u;
abd1ec54 575 return map(_varCE($vbl, $_), @{ $map->{$u} })
91ae00cb 576 if $map->{$u};
a7fbee98 577
91ae00cb 578 # JCPS must not be a contraction, then it's a code point.
579 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1 580 my $hang = $self->{overrideHangul};
581 my @hangulCE;
582 if ($hang) {
583 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
584 }
585 elsif (!defined $hang) {
586 @hangulCE = $der->($u);
587 }
588 else {
589 my $max = $self->{maxlength};
590 my @decH = _decompHangul($u);
591
592 if (@decH == 2) {
593 my $contract = join(CODE_SEP, @decH);
91ae00cb 594 @decH = ($contract) if $map->{$contract};
1d2654e1 595 } else { # must be <@decH == 3>
596 if ($max->{$decH[0]}) {
597 my $contract = join(CODE_SEP, @decH);
91ae00cb 598 if ($map->{$contract}) {
1d2654e1 599 @decH = ($contract);
600 } else {
601 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 602 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1 603 }
604 # even if V's ignorable, LT contraction is not supported.
605 # If such a situatution were required, NFD should be used.
606 }
607 if (@decH == 3 && $max->{$decH[1]}) {
608 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 609 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1 610 }
611 }
612
613 @hangulCE = map({
91ae00cb 614 $map->{$_} ? @{ $map->{$_} } : $der->($_);
1d2654e1 615 } @decH);
616 }
abd1ec54 617 return map _varCE($vbl, $_), @hangulCE;
a7fbee98 618 }
91ae00cb 619 elsif (CJK_UidIni <= $u && $u <= CJK_UidFin ||
620 CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
621 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
1d2654e1 622 my $cjk = $self->{overrideCJK};
abd1ec54 623 return map _varCE($vbl, $_),
0116f5dc 624 $cjk
4d36a948 625 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
91ae00cb 626 : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
9f1f04a1 627 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
0116f5dc 628 : $der->($u);
a7fbee98 629 }
630 else {
abd1ec54 631 return map _varCE($vbl, $_), $der->($u);
a7fbee98 632 }
d16e9e3d 633}
634
d16e9e3d 635
636##
637## string sortkey = getSortKey(string arg)
638##
639sub getSortKey
640{
a7fbee98 641 my $self = shift;
642 my $lev = $self->{level};
91ae00cb 643 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
644 my $ver9 = $self->{UCA_Version} >= 9;
abd1ec54 645 my $v2i = $ver9 && $self->{variable} ne 'non-ignorable';
91ae00cb 646
abd1ec54 647 my @buf; # weight arrays
91ae00cb 648 if ($self->{hangul_terminator}) {
649 my $preHST = '';
650 foreach my $jcps (@$rEnt) {
651 # weird things like VL, TL-contraction are not considered!
652 my $curHST = '';
653 foreach my $u (split /;/, $jcps) {
654 $curHST .= getHST($u);
655 }
656 if ($preHST && !$curHST || # hangul before non-hangul
657 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
658 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
659 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
0116f5dc 660
abd1ec54 661 push @buf, $self->getWtHangulTerm();
91ae00cb 662 }
663 $preHST = $curHST;
664
abd1ec54 665 push @buf, $self->getWt($jcps);
91ae00cb 666 }
667 $preHST # end at hangul
abd1ec54 668 and push @buf, $self->getWtHangulTerm();
91ae00cb 669 }
670 else {
671 foreach my $jcps (@$rEnt) {
abd1ec54 672 push @buf, $self->getWt($jcps);
91ae00cb 673 }
674 }
675
abd1ec54 676 # make sort key
677 my @ret = ([],[],[],[]);
678 my $last_is_variable;
679
680 foreach my $vwt (@buf) {
681 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
682 if ($v2i) {
683 if ($var) {
684 $last_is_variable = TRUE;
685 }
686 elsif (!$wt[0]) { # ignorable
4d36a948 687 next if $last_is_variable;
abd1ec54 688 }
689 else {
690 $last_is_variable = FALSE;
0116f5dc 691 }
692 }
abd1ec54 693 foreach my $v (0..$lev-1) {
694 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
a7fbee98 695 }
696 }
45394607 697
a7fbee98 698 # modification of tertiary weights
699 if ($self->{upper_before_lower}) {
700 foreach (@{ $ret[2] }) {
701 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
702 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
703 elsif ($_ == 0x1C) { $_ += 1 } # square upper
704 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
705 }
45394607 706 }
a7fbee98 707 if ($self->{katakana_before_hiragana}) {
708 foreach (@{ $ret[2] }) {
709 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
710 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
711 }
45394607 712 }
9f1f04a1 713
714 if ($self->{backwardsFlag}) {
715 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
716 if ($self->{backwardsFlag} & (1 << $v)) {
717 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
718 }
719 }
720 }
721
4d36a948 722 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 723}
724
725
726##
d16e9e3d 727## int compare = cmp(string a, string b)
45394607 728##
5398038e 729sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
730sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
731sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
732sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
733sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
734sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
735sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 736
737##
d16e9e3d 738## list[strings] sorted = sort(list[strings] arg)
45394607 739##
a7fbee98 740sub sort {
741 my $obj = shift;
742 return
743 map { $_->[1] }
744 sort{ $a->[0] cmp $b->[0] }
745 map [ $obj->getSortKey($_), $_ ], @_;
45394607 746}
747
0116f5dc 748
4d36a948 749sub _derivCE_9 {
0116f5dc 750 my $u = shift;
751 my $base =
91ae00cb 752 (CJK_UidIni <= $u && $u <= CJK_UidFin)
4d36a948 753 ? 0xFB40 : # CJK
91ae00cb 754 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
755 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
4d36a948 756 ? 0xFB80 # CJK ext.
757 : 0xFBC0; # others
0116f5dc 758
759 my $aaaa = $base + ($u >> 15);
760 my $bbbb = ($u & 0x7FFF) | 0x8000;
761 return
9f1f04a1 762 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
763 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc 764}
765
4d36a948 766sub _derivCE_8 {
0116f5dc 767 my $code = shift;
768 my $aaaa = 0xFF80 + ($code >> 15);
769 my $bbbb = ($code & 0x7FFF) | 0x8000;
770 return
4d36a948 771 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
772 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607 773}
774
91ae00cb 775
abd1ec54 776sub getWtHangulTerm {
91ae00cb 777 my $self = shift;
abd1ec54 778 return _varCE($self->{variable},
779 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
91ae00cb 780}
781
782
45394607 783##
784## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
785##
a7fbee98 786sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 787
a7fbee98 788#
4d36a948 789# $code *must* be in Hangul syllable.
a7fbee98 790# Check it before you enter here.
791#
5398038e 792sub _decompHangul {
793 my $code = shift;
91ae00cb 794 my $SIndex = $code - Hangul_SBase;
795 my $LIndex = int( $SIndex / Hangul_NCount);
796 my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
797 my $TIndex = $SIndex % Hangul_TCount;
5398038e 798 return (
91ae00cb 799 Hangul_LBase + $LIndex,
800 Hangul_VBase + $VIndex,
801 $TIndex ? (Hangul_TBase + $TIndex) : (),
5398038e 802 );
45394607 803}
804
10d7ec48 805sub _isIllegal {
4d36a948 806 my $code = shift;
807 return ! defined $code # removed
808 || ($code < 0 || 0x10FFFF < $code) # out of range
809 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
810 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
811 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
812 ;
813}
814
91ae00cb 815# Hangul Syllable Type
816sub getHST {
817 my $u = shift;
818 return
819 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
820 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
821 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
822 Hangul_SIni <= $u && $u <= Hangul_SFin ?
823 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
824}
825
4d36a948 826
827##
828## bool _nonIgnorAtLevel(arrayref weights, int level)
829##
830sub _nonIgnorAtLevel($$)
831{
832 my $wt = shift;
833 return if ! defined $wt;
834 my $lv = shift;
9f1f04a1 835 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948 836}
837
838##
839## bool _eqArray(
840## arrayref of arrayref[weights] source,
841## arrayref of arrayref[weights] substr,
842## int level)
843## * comparison of graphemes vs graphemes.
844## @$source >= @$substr must be true (check it before call this);
845##
846sub _eqArray($$$)
847{
848 my $source = shift;
849 my $substr = shift;
850 my $lev = shift;
851
852 for my $g (0..@$substr-1){
853 # Do the $g'th graphemes have the same number of AV weigths?
854 return if @{ $source->[$g] } != @{ $substr->[$g] };
855
856 for my $w (0..@{ $substr->[$g] }-1) {
857 for my $v (0..$lev-1) {
858 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
859 }
860 }
861 }
862 return 1;
863}
864
865##
866## (int position, int length)
867## int position = index(string, substring, position, [undoc'ed grobal])
868##
869## With "grobal" (only for the list context),
870## returns list of arrayref[position, length].
871##
872sub index
873{
91ae00cb 874 my $self = shift;
875 my $str = shift;
876 my $len = length($str);
877 my $subE = $self->splitEnt(shift);
878 my $pos = @_ ? shift : 0;
879 $pos = 0 if $pos < 0;
880 my $grob = shift;
881
882 my $lev = $self->{level};
883 my $ver9 = $self->{UCA_Version} >= 9;
884 my $v2i = $self->{variable} ne 'non-ignorable';
885
886 if (! @$subE) {
4d36a948 887 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
888 return $grob
889 ? map([$_, 0], $temp..$len)
890 : wantarray ? ($temp,0) : $temp;
891 }
abd1ec54 892 $len < $pos
893 and return wantarray ? () : NOMATCHPOS;
91ae00cb 894 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54 895 @$strE
896 or return wantarray ? () : NOMATCHPOS;
897
4d36a948 898 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
899
abd1ec54 900 my $last_is_variable;
901 for my $vwt (map $self->getWt($_), @$subE) {
902 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
903 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 904
905 if ($v2i && $ver9) {
abd1ec54 906 if ($var) {
907 $last_is_variable = TRUE;
908 }
909 elsif (!$wt[0]) { # ignorable
4d36a948 910 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54 911 }
912 else {
913 $last_is_variable = FALSE;
4d36a948 914 }
915 }
916
abd1ec54 917 if (@subWt && !$var && !$wt[0]) {
918 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
4d36a948 919 } else {
abd1ec54 920 push @subWt, [ \@wt ];
4d36a948 921 }
922 }
923
924 my $count = 0;
91ae00cb 925 my $end = @$strE - 1;
4d36a948 926
abd1ec54 927 $last_is_variable = FALSE; # reuse
4d36a948 928 for (my $i = 0; $i <= $end; ) { # no $i++
929 my $found_base = 0;
930
931 # fetch a grapheme
932 while ($i <= $end && $found_base == 0) {
abd1ec54 933 for my $vwt ($self->getWt($strE->[$i][0])) {
934 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
935 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 936
937 if ($v2i && $ver9) {
abd1ec54 938 if ($var) {
939 $last_is_variable = TRUE;
940 }
941 elsif (!$wt[0]) { # ignorable
4d36a948 942 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54 943 }
944 else {
945 $last_is_variable = FALSE;
4d36a948 946 }
947 }
948
abd1ec54 949 if (@strWt && !$var && !$wt[0]) {
950 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 951 $finPos[-1] = $strE->[$i][2];
4d36a948 952 } elsif ($to_be_pushed) {
abd1ec54 953 push @strWt, [ \@wt ];
91ae00cb 954 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 955 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 956 push @finPos, $strE->[$i][2];
4d36a948 957 $found_base++;
958 }
959 # else ===> no-op
960 }
961 $i++;
962 }
963
964 # try to match
965 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
966 if ($iniPos[0] != NOMATCHPOS &&
967 $finPos[$#subWt] != NOMATCHPOS &&
968 _eqArray(\@strWt, \@subWt, $lev)) {
969 my $temp = $iniPos[0] + $pos;
970
971 if ($grob) {
972 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
973 splice @strWt, 0, $#subWt;
974 splice @iniPos, 0, $#subWt;
975 splice @finPos, 0, $#subWt;
976 }
977 else {
978 return wantarray
979 ? ($temp, $finPos[$#subWt] - $iniPos[0])
980 : $temp;
981 }
982 }
983 shift @strWt;
984 shift @iniPos;
985 shift @finPos;
986 }
987 }
988
989 return $grob
990 ? @g_ret
991 : wantarray ? () : NOMATCHPOS;
992}
993
994##
995## scalarref to matching part = match(string, substring)
996##
997sub match
998{
999 my $self = shift;
1000 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1001 my $temp = substr($_[0], $pos, $len);
1002 return wantarray ? $temp : \$temp;
1003 # An lvalue ref \substr should be avoided,
1004 # since its value is affected by modification of its referent.
1005 }
1006 else {
1007 return;
1008 }
1009}
1010
1011##
1012## arrayref matching parts = gmatch(string, substring)
1013##
1014sub gmatch
1015{
1016 my $self = shift;
1017 my $str = shift;
1018 my $sub = shift;
1019 return map substr($str, $_->[0], $_->[1]),
1020 $self->index($str, $sub, 0, 'g');
1021}
1022
1023##
1024## bool subst'ed = subst(string, substring, replace)
1025##
1026sub subst
1027{
1028 my $self = shift;
1029 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1030
1031 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1032 if ($code) {
1033 my $mat = substr($_[0], $pos, $len);
1034 substr($_[0], $pos, $len, $code->($mat));
1035 } else {
1036 substr($_[0], $pos, $len, $_[2]);
1037 }
1038 return TRUE;
1039 }
1040 else {
1041 return FALSE;
1042 }
1043}
1044
1045##
1046## int count = gsubst(string, substring, replace)
1047##
1048sub gsubst
1049{
1050 my $self = shift;
1051 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1052 my $cnt = 0;
1053
1054 # Replacement is carried out from the end, then use reverse.
1055 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1056 if ($code) {
1057 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1058 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1059 } else {
1060 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1061 }
1062 $cnt++;
1063 }
1064 return $cnt;
1065}
1066
45394607 10671;
1068__END__
1069
1070=head1 NAME
1071
a7fbee98 1072Unicode::Collate - Unicode Collation Algorithm
45394607 1073
1074=head1 SYNOPSIS
1075
1076 use Unicode::Collate;
1077
1078 #construct
5398038e 1079 $Collator = Unicode::Collate->new(%tailoring);
45394607 1080
1081 #sort
5398038e 1082 @sorted = $Collator->sort(@not_sorted);
45394607 1083
1084 #compare
a7fbee98 1085 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1086
91ae00cb 1087 # If %tailoring is false (i.e. empty),
1088 # $Collator should do the default collation.
1089
45394607 1090=head1 DESCRIPTION
1091
4d36a948 1092This module is an implementation
1093of Unicode Technical Standard #10 (UTS #10)
1094"Unicode Collation Algorithm."
1095
45394607 1096=head2 Constructor and Tailoring
1097
d16e9e3d 1098The C<new> method returns a collator object.
1099
5398038e 1100 $Collator = Unicode::Collate->new(
0116f5dc 1101 UCA_Version => $UCA_Version,
91ae00cb 1102 alternate => $alternate, # deprecated: use of 'variable' is recommended.
45394607 1103 backwards => $levelNumber, # or \@levelNumbers
1104 entry => $element,
91ae00cb 1105 hangul_terminator => $term_primary_weight,
45394607 1106 ignoreName => qr/$ignoreName/,
1107 ignoreChar => qr/$ignoreChar/,
1108 katakana_before_hiragana => $bool,
1109 level => $collationLevel,
91ae00cb 1110 normalization => $normalization_form,
45394607 1111 overrideCJK => \&overrideCJK,
1112 overrideHangul => \&overrideHangul,
1113 preprocess => \&preprocess,
1114 rearrange => \@charList,
1115 table => $filename,
1116 undefName => qr/$undefName/,
1117 undefChar => qr/$undefChar/,
1118 upper_before_lower => $bool,
91ae00cb 1119 variable => $variable,
45394607 1120 );
45394607 1121
1122=over 4
1123
0116f5dc 1124=item UCA_Version
1125
91ae00cb 1126If the tracking version number of the older UCA is given,
1127the older behavior of that tracking version is emulated on collating.
0116f5dc 1128If omitted, the return value of C<UCA_Version()> is used.
1129
91ae00cb 1130The supported tracking version: 8, 9, or 11.
0116f5dc 1131
1132B<This parameter may be removed in the future version,
1133as switching the algorithm would affect the performance.>
1134
45394607 1135=item backwards
1136
4d36a948 1137-- see 3.1.2 French Accents, UTS #10.
45394607 1138
1139 backwards => $levelNumber or \@levelNumbers
1140
1141Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1142If omitted, forwards at all the levels.
1143
1144=item entry
1145
4d36a948 1146-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1147
91ae00cb 1148If the same character (or a sequence of characters) exists
1149in the collation element table through C<table>,
1150mapping to collation elements is overrided.
1151If it does not exist, the mapping is defined additionally.
45394607 1152
abd1ec54 1153 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11540063 0068 ; [.0E6A.0020.0002.0063] # ch
11550043 0068 ; [.0E6A.0020.0007.0043] # Ch
11560043 0048 ; [.0E6A.0020.0008.0043] # CH
1157006C 006C ; [.0F4C.0020.0002.006C] # ll
1158004C 006C ; [.0F4C.0020.0007.004C] # Ll
1159004C 004C ; [.0F4C.0020.0008.004C] # LL
1160006E 0303 ; [.0F7B.0020.0002.006E] # n-tilde
1161004E 0303 ; [.0F7B.0020.0008.004E] # N-tilde
1162ENTRY
1163
1164 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
116500E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
116600C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1167ENTRY
45394607 1168
4d36a948 1169B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54 1170B<must> be a Unicode code point (defined as hexadecimal),
1171but not a native code point.
4d36a948 1172So C<0063> must always denote C<U+0063>,
1173but not a character of C<"\x63">.
1174
abd1ec54 1175Weighting may vary depending on collation element table.
1176So ensure the weights defined in C<entry> will be consistent with
1177those in the collation element table loaded via C<table>.
1178
1179In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1180and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1181(as a value between C<0E60> and C<0E6D>)
1182makes ordering as C<C E<lt> CH E<lt> D>.
1183Exactly speaking DUCET already has some characters between C<C> and C<D>:
1184C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1185C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1186and C<c-curl> (C<U+0255>) with C<0E69>.
1187Then primary weight C<0E6A> for C<CH> makes C<CH>
1188ordered between C<c-curl> and C<D>.
1189
91ae00cb 1190=item hangul_terminator
1191
1192-- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
1193
1194If a true value is given (non-zero but should be positive),
1195it will be added as a terminator primary weight to the end of
1196every standard Hangul syllable. Secondary and any higher weights
1197for terminator are set to zero.
1198If the value is false or C<hangul_terminator> key does not exist,
1199insertion of terminator weights will not be performed.
1200
1201Boundaries of Hangul syllables are determined
1202according to conjoining Jamo behavior in F<the Unicode Standard>
1203and F<HangulSyllableType.txt>.
1204
1205B<Implementation Note:>
1206(1) For expansion mapping (Unicode character mapped
1207to a sequence of collation elements), a terminator will not be added
1208between collation elements, even if Hangul syllable boundary exists there.
1209Addition of terminator is restricted to the next position
1210to the last collation element.
1211
1212(2) Non-conjoining Hangul letters
1213(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1214automatically terminated with a terminator primary weight.
1215These characters may need terminator included in a collation element
1216table beforehand.
1217
45394607 1218=item ignoreName
1219
1220=item ignoreChar
1221
4d36a948 1222-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
45394607 1223
caffd4cf 1224Makes the entry in the table completely ignorable;
1225i.e. as if the weights were zero at all level.
45394607 1226
a7fbee98 1227E.g. when 'a' and 'e' are ignorable,
45394607 1228'element' is equal to 'lament' (or 'lmnt').
1229
1230=item level
1231
4d36a948 1232-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1233
1234Set the maximum level.
1235Any higher levels than the specified one are ignored.
1236
1237 Level 1: alphabetic ordering
1238 Level 2: diacritic ordering
1239 Level 3: case ordering
91ae00cb 1240 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607 1241
1242 ex.level => 2,
1243
a7fbee98 1244If omitted, the maximum is the 4th.
1245
45394607 1246=item normalization
1247
4d36a948 1248-- see 4.1 Normalize each input string, UTS #10.
45394607 1249
905aa9f0 1250If specified, strings are normalized before preparation of sort keys
45394607 1251(the normalization is executed after preprocess).
1252
1d2654e1 1253A form name C<Unicode::Normalize::normalize()> accepts will be applied
1254as C<$normalization_form>.
06c8fc8f 1255Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1 1256See C<Unicode::Normalize::normalize()> for detail.
1257If omitted, C<'NFD'> is used.
45394607 1258
91ae00cb 1259C<normalization> is performed after C<preprocess> (if defined).
45394607 1260
06c8fc8f 1261Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1262though they are not concerned with C<Unicode::Normalize::normalize()>.
1263
1264If C<undef> (not a string C<"undef">) is passed explicitly
1265as the value for this key,
45394607 1266any normalization is not carried out (this may make tailoring easier
abd1ec54 1267if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1268only contiguous contractions are resolved;
1269e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1270C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f 1271In this point,
1272C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1273B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1274
1275In the case of C<(normalization =E<gt> "prenormalized")>,
1276any normalization is not performed, but
1277non-contiguous contractions with combining characters are performed.
1278Therefore
1279C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1280B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1281If source strings are finely prenormalized,
1282C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1283
1284Except C<(normalization =E<gt> undef)>,
1285B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607 1286
1287=item overrideCJK
1288
4d36a948 1289-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1290
91ae00cb 1291By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1292(but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than
1293C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
1294C<U+20000> to C<U+2A6D6>].
1295
1296Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
45394607 1297
a7fbee98 1298ex. CJK Unified Ideographs in the JIS code point order.
45394607 1299
1300 overrideCJK => sub {
a7fbee98 1301 my $u = shift; # get a Unicode codepoint
1302 my $b = pack('n', $u); # to UTF-16BE
1303 my $s = your_unicode_to_sjis_converter($b); # convert
1304 my $n = unpack('n', $s); # convert sjis to short
1305 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1306 },
1307
a7fbee98 1308ex. ignores all CJK Unified Ideographs.
1309
1310 overrideCJK => sub {()}, # CODEREF returning empty list
1311
1312 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1313 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1314
1315If C<undef> is passed explicitly as the value for this key,
1316weights for CJK Unified Ideographs are treated as undefined.
1317But assignment of weight for CJK Unified Ideographs
91ae00cb 1318in table or C<entry> is still valid.
a7fbee98 1319
1320=item overrideHangul
1321
4d36a948 1322-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1323
abd1ec54 1324By default, Hangul Syllables are decomposed into Hangul Jamo,
1325even if C<(normalization =E<gt> undef)>.
a7fbee98 1326But the mapping of Hangul Syllables may be overrided.
1327
91ae00cb 1328This tag works like C<overrideCJK>, so see there for examples.
a7fbee98 1329
45394607 1330If you want to override the mapping of Hangul Syllables,
abd1ec54 1331NFD, NFKD, and FCD are not appropriate,
1332since they will decompose Hangul Syllables before overriding.
45394607 1333
a7fbee98 1334If C<undef> is passed explicitly as the value for this key,
1335weight for Hangul Syllables is treated as undefined
1336without decomposition into Hangul Jamo.
1337But definition of weight for Hangul Syllables
91ae00cb 1338in table or C<entry> is still valid.
a7fbee98 1339
45394607 1340=item preprocess
1341
4d36a948 1342-- see 5.1 Preprocessing, UTS #10.
45394607 1343
1344If specified, the coderef is used to preprocess
1345before the formation of sort keys.
1346
a7fbee98 1347ex. dropping English articles, such as "a" or "the".
45394607 1348Then, "the pen" is before "a pencil".
1349
1350 preprocess => sub {
1351 my $str = shift;
a7fbee98 1352 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1353 return $str;
45394607 1354 },
1355
91ae00cb 1356C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1357
45394607 1358=item rearrange
1359
4d36a948 1360-- see 3.1.3 Rearrangement, UTS #10.
45394607 1361
1362Characters that are not coded in logical order and to be rearranged.
a7fbee98 1363By default,
45394607 1364
1365 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1366
a7fbee98 1367If you want to disallow any rearrangement,
1368pass C<undef> or C<[]> (a reference to an empty list)
1369as the value for this key.
1370
0116f5dc 1371B<According to the version 9 of UCA, this parameter shall not be used;
1372but it is not warned at present.>
1373
45394607 1374=item table
1375
4d36a948 1376-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1377
91ae00cb 1378You can use another collation element table if desired.
ae6aa562 1379The table file must be put into a directory
10d7ec48 1380where F<Unicode/Collate.pm> is installed; e.g. into
1381F<perl/lib/Unicode/Collate/> if you have F<perl/lib/Unicode/Collate.pm>.
45394607 1382
10d7ec48 1383By default, the filename F<allkeys.txt> is used.
45394607 1384
a7fbee98 1385If C<undef> is passed explicitly as the value for this key,
91ae00cb 1386no file is read (but you can define collation elements via C<entry>).
a7fbee98 1387
1388A typical way to define a collation element table
1389without any file of table:
1390
1391 $onlyABC = Unicode::Collate->new(
1392 table => undef,
1393 entry => << 'ENTRIES',
13940061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
13950041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
13960062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
13970042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
13980063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
13990043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1400ENTRIES
1401 );
905aa9f0 1402
45394607 1403=item undefName
1404
1405=item undefChar
1406
4d36a948 1407-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1408
1409Undefines the collation element as if it were unassigned in the table.
1410This reduces the size of the table.
1411If an unassigned character appears in the string to be collated,
1412the sort key is made from its codepoint
1413as a single-character collation element,
1414as it is greater than any other assigned collation elements
1415(in the codepoint order among the unassigned characters).
1416But, it'd be better to ignore characters
1417unfamiliar to you and maybe never used.
1418
1419=item katakana_before_hiragana
1420
1421=item upper_before_lower
1422
4d36a948 1423-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
45394607 1424
1425By default, lowercase is before uppercase
1426and hiragana is before katakana.
1427
a7fbee98 1428If the tag is made true, this is reversed.
1429
1430B<NOTE>: These tags simplemindedly assume
1431any lowercase/uppercase or hiragana/katakana distinctions
9f1f04a1 1432must occur in level 3, and their weights at level 3
1433must be same as those mentioned in 7.3.1, UTS #10.
1434If you define your collation elements which violate this requirement,
4d36a948 1435these tags don't work validly.
45394607 1436
91ae00cb 1437=item variable
1438
1439=item alternate
1440
1441-- see 3.2.2 Variable Weighting, UTS #10.
1442
1443(the title in UCA version 8: Alternate Weighting)
1444
1445This key allows to variable weighting for variable collation elements,
1446which are marked with an ASTERISK in the table
1447(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1448
1449 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1450
1451These names are case-insensitive.
1452By default (if specification is omitted), 'shifted' is adopted.
1453
1454 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1455 considered at the 4th level.
1456
abd1ec54 1457 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb 1458
1459 'Shifted' Variable elements are made ignorable at levels 1 through 3
1460 their level 4 weight is replaced by the old level 1 weight.
1461 Level 4 weight for Non-Variable elements is 0xFFFF.
1462
1463 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1464 are trimmed.
1465
1466For backward compatibility, C<alternate> can be used as an alias
1467for C<variable>.
1468
45394607 1469=back
1470
3164dd77 1471=head2 Methods for Collation
45394607 1472
1473=over 4
1474
5398038e 1475=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1476
1477Sorts a list of strings.
1478
5398038e 1479=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1480
1481Returns 1 (when C<$a> is greater than C<$b>)
1482or 0 (when C<$a> is equal to C<$b>)
1483or -1 (when C<$a> is lesser than C<$b>).
1484
5398038e 1485=item C<$result = $Collator-E<gt>eq($a, $b)>
1486
1487=item C<$result = $Collator-E<gt>ne($a, $b)>
1488
1489=item C<$result = $Collator-E<gt>lt($a, $b)>
1490
1491=item C<$result = $Collator-E<gt>le($a, $b)>
1492
1493=item C<$result = $Collator-E<gt>gt($a, $b)>
1494
1495=item C<$result = $Collator-E<gt>ge($a, $b)>
1496
a7fbee98 1497They works like the same name operators as theirs.
5398038e 1498
1499 eq : whether $a is equal to $b.
1500 ne : whether $a is not equal to $b.
1501 lt : whether $a is lesser than $b.
1502 le : whether $a is lesser than $b or equal to $b.
1503 gt : whether $a is greater than $b.
1504 ge : whether $a is greater than $b or equal to $b.
1505
1506=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1507
4d36a948 1508-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1509
1510Returns a sort key.
1511
1512You compare the sort keys using a binary comparison
1513and get the result of the comparison of the strings using UCA.
1514
5398038e 1515 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1516
1517 is equivalent to
1518
5398038e 1519 $Collator->cmp($a, $b)
45394607 1520
a7fbee98 1521=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1522
a7fbee98 1523 use Unicode::Collate;
1524 my $c = Unicode::Collate->new();
1525 print $c->viewSortKey("Perl"),"\n";
1526
0116f5dc 1527 # output:
1528 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1529 # Level 1 Level 2 Level 3 Level 4
1530
1531 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1532
4d36a948 1533=back
1534
1535=head2 Methods for Searching
d16e9e3d 1536
4d36a948 1537B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1538for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1539C<subst>, C<gsubst>) is croaked,
1540as the position and the length might differ
1541from those on the specified string.
91ae00cb 1542(And C<rearrange> and C<hangul_terminator> tags are neglected.)
d16e9e3d 1543
4d36a948 1544The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1545like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1546but they are not aware of any pattern, but only a literal substring.
1547
1548=over 4
1549
1550=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1551
1552=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1553
1554If C<$substring> matches a part of C<$string>, returns
1555the position of the first occurrence of the matching part in scalar context;
1556in list context, returns a two-element list of
1557the position and the length of the matching part.
1558
d16e9e3d 1559If C<$substring> does not match any part of C<$string>,
1560returns C<-1> in scalar context and
1561an empty list in list context.
1562
1563e.g. you say
1564
5398038e 1565 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1566 # (normalization => undef) is REQUIRED.
1567 my $str = "Ich muß studieren Perl.";
1568 my $sub = "MÜSS";
d16e9e3d 1569 my $match;
a7fbee98 1570 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1571 $match = substr($str, $pos, $len);
d16e9e3d 1572 }
1573
4d36a948 1574and get C<"muß"> in C<$match> since C<"muß">
1575is primary equal to C<"MÜSS">.
1576
1577=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1578
1579=item C<($match) = $Collator-E<gt>match($string, $substring)>
1580
1581If C<$substring> matches a part of C<$string>, in scalar context, returns
1582B<a reference to> the first occurrence of the matching part
1583(C<$match_ref> is always true if matches,
1584since every reference is B<true>);
1585in list context, returns the first occurrence of the matching part.
1586
1587If C<$substring> does not match any part of C<$string>,
1588returns C<undef> in scalar context and
1589an empty list in list context.
1590
1591e.g.
1592
1593 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1594 print "matches [$$match_ref].\n";
1595 } else {
1596 print "doesn't match.\n";
1597 }
1598
1599 or
1600
1601 if (($match) = $Collator->match($str, $sub)) { # list context
1602 print "matches [$match].\n";
1603 } else {
1604 print "doesn't match.\n";
1605 }
1606
1607=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1608
1609If C<$substring> matches a part of C<$string>, returns
1610all the matching parts (or matching count in scalar context).
1611
1612If C<$substring> does not match any part of C<$string>,
1613returns an empty list.
1614
1615=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1616
1617If C<$substring> matches a part of C<$string>,
1618the first occurrence of the matching part is replaced by C<$replacement>
1619(C<$string> is modified) and return C<$count> (always equals to C<1>).
1620
1621C<$replacement> can be a C<CODEREF>,
1622taking the matching part as an argument,
1623and returning a string to replace the matching part
1624(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1625
1626=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1627
1628If C<$substring> matches a part of C<$string>,
1629all the occurrences of the matching part is replaced by C<$replacement>
1630(C<$string> is modified) and return C<$count>.
1631
1632C<$replacement> can be a C<CODEREF>,
1633taking the matching part as an argument,
1634and returning a string to replace the matching part
1635(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1636
1637e.g.
1638
1639 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1640 # (normalization => undef) is REQUIRED.
1641 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1642 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1643
1644 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1645 # i.e., all the camels are made bold-faced.
d16e9e3d 1646
45394607 1647=back
1648
3164dd77 1649=head2 Other Methods
1650
1651=over 4
1652
0116f5dc 1653=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1654
1655Change the value of specified keys and returns the changed part.
1656
1657 $Collator = Unicode::Collate->new(level => 4);
1658
1659 $Collator->eq("perl", "PERL"); # false
1660
1661 %old = $Collator->change(level => 2); # returns (level => 4).
1662
1663 $Collator->eq("perl", "PERL"); # true
1664
1665 $Collator->change(%old); # returns (level => 2).
1666
1667 $Collator->eq("perl", "PERL"); # false
1668
1669Not all C<(key,value)>s are allowed to be changed.
1670See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1671
1672In the scalar context, returns the modified collator
1673(but it is B<not> a clone from the original).
1674
1675 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1676
1677 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1678
1679 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1680
91ae00cb 1681=item C<$version = $Collator-E<gt>version()>
3164dd77 1682
91ae00cb 1683Returns the version number (a string) of the Unicode Standard
1684which the C<table> file used by the collator object is based on.
1685If the table does not include a version line (starting with C<@version>),
1686returns C<"unknown">.
1687
1688=item C<UCA_Version()>
3164dd77 1689
91ae00cb 1690Returns the tracking version number of UTS #10 this module consults.
3164dd77 1691
91ae00cb 1692=item C<Base_Unicode_Version()>
1693
1694Returns the version number of UTS #10 this module consults.
3164dd77 1695
1696=back
1697
45394607 1698=head2 EXPORT
1699
1700None by default.
1701
1702=head2 CAVEAT
1703
1704Use of the C<normalization> parameter requires
1705the B<Unicode::Normalize> module.
1706
5398038e 1707If you need not it (say, in the case when you need not
45394607 1708handle any combining characters),
1709assign C<normalization =E<gt> undef> explicitly.
1710
4d36a948 1711-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1712
0116f5dc 1713=head2 Conformance Test
1714
10d7ec48 1715The Conformance Test for the UCA is available
1716under L<http://www.unicode.org/Public/UCA/>.
0116f5dc 1717
1718For F<CollationTest_SHIFTED.txt>,
1719a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1720for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 1721C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 1722
4d36a948 1723B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1724
45394607 1725=head1 AUTHOR
1726
10d7ec48 1727SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
45394607 1728
1729 http://homepage1.nifty.com/nomenclator/perl/
1730
ae6aa562 1731 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1732
a7fbee98 1733 This library is free software; you can redistribute it
1734 and/or modify it under the same terms as Perl itself.
45394607 1735
1736=head1 SEE ALSO
1737
1738=over 4
1739
91ae00cb 1740=item Unicode Collation Algorithm - UTS #10
1741
1742L<http://www.unicode.org/reports/tr10/>
1743
1744=item The Default Unicode Collation Element Table (DUCET)
1745
10d7ec48 1746L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 1747
91ae00cb 1748=item The conformance test for the UCA
45394607 1749
10d7ec48 1750L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 1751
10d7ec48 1752L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 1753
91ae00cb 1754=item Hangul Syllable Type
0116f5dc 1755
10d7ec48 1756L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 1757
91ae00cb 1758=item Unicode Normalization Forms - UAX #15
a7fbee98 1759
91ae00cb 1760L<http://www.unicode.org/reports/tr15/>
a7fbee98 1761
a7fbee98 1762=item L<Unicode::Normalize>
45394607 1763
45394607 1764=back
1765
1766=cut