Change 24413 should have updated makedef.pl with the knowledge that 2
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
10d7ec48 15no warnings 'utf8';
16
e7f779c8 17our $VERSION = '0.40';
45394607 18our $PACKAGE = __PACKAGE__;
19
e7f779c8 20my @Path = qw(Unicode Collate);
21my $KeyFile = "allkeys.txt";
45394607 22
4d36a948 23# Perl's boolean
24use constant TRUE => 1;
25use constant FALSE => "";
26use constant NOMATCHPOS => -1;
27
28# A coderef to get combining class imported from Unicode::Normalize
29# (i.e. \&Unicode::Normalize::getCombinClass).
30# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 31my $CVgetCombinClass;
4d36a948 32
9f1f04a1 33# Supported Levels
34use constant MinLevel => 1;
35use constant MaxLevel => 4;
36
4d36a948 37# Minimum weights at level 2 and 3, respectively
9f1f04a1 38use constant Min2Wt => 0x20;
39use constant Min3Wt => 0x02;
4d36a948 40
41# Shifted weight at 4th level
9f1f04a1 42use constant Shift4Wt => 0xFFFF;
4d36a948 43
4d36a948 44# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45# PROBLEM: The Default Unicode Collation Element Table
46# has weights over 0xFFFF at the 4th level.
47# The tie-breaking in the variable weights
48# other than "shift" (as well as "shift-trimmed") is unreliable.
49use constant VCE_TEMPLATE => 'Cn4';
50
4d36a948 51# A sort key: 16-bit weights
52# See also the PROBLEM on VCE_TEMPLATE above.
53use constant KEY_TEMPLATE => 'n*';
54
55# Level separator in a sort key:
56# i.e. pack(KEY_TEMPLATE, 0)
57use constant LEVEL_SEP => "\0\0";
58
59# As Unicode code point separator for hash keys.
60# A joined code point string (denoted by JCPS below)
61# like "65;768" is used for internal processing
62# instead of Perl's Unicode string like "\x41\x{300}",
63# as the native code point is different from the Unicode code point
64# on EBCDIC platform.
65# This character must not be included in any stringified
66# representation of an integer.
67use constant CODE_SEP => ';';
68
69# boolean values of variable weights
0116f5dc 70use constant NON_VAR => 0; # Non-Variable character
71use constant VAR => 1; # Variable character
3164dd77 72
91ae00cb 73# specific code points
74use constant Hangul_LBase => 0x1100;
75use constant Hangul_LIni => 0x1100;
76use constant Hangul_LFin => 0x1159;
77use constant Hangul_LFill => 0x115F;
78use constant Hangul_VBase => 0x1161;
79use constant Hangul_VIni => 0x1160;
80use constant Hangul_VFin => 0x11A2;
81use constant Hangul_TBase => 0x11A7;
82use constant Hangul_TIni => 0x11A8;
83use constant Hangul_TFin => 0x11F9;
84use constant Hangul_TCount => 28;
85use constant Hangul_NCount => 588;
86use constant Hangul_SBase => 0xAC00;
87use constant Hangul_SIni => 0xAC00;
88use constant Hangul_SFin => 0xD7A3;
89use constant CJK_UidIni => 0x4E00;
90use constant CJK_UidFin => 0x9FA5;
91use constant CJK_ExtAIni => 0x3400;
92use constant CJK_ExtAFin => 0x4DB5;
93use constant CJK_ExtBIni => 0x20000;
94use constant CJK_ExtBFin => 0x2A6D6;
95use constant BMP_Max => 0xFFFF;
96
4d36a948 97# Logical_Order_Exception in PropList.txt
98# TODO: synchronization with change of PropList.txt.
e7f779c8 99my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 100
91ae00cb 101sub UCA_Version { "11" }
a7fbee98 102
91ae00cb 103sub Base_Unicode_Version { "4.0" }
a7fbee98 104
9f1f04a1 105######
106
9f1f04a1 107sub pack_U {
ae6aa562 108 return pack('U*', @_);
9f1f04a1 109}
110
111sub unpack_U {
ae6aa562 112 return unpack('U*', pack('U*').shift);
9f1f04a1 113}
114
115######
116
91ae00cb 117my (%VariableOK);
118@VariableOK{ qw/
0116f5dc 119 blanked non-ignorable shifted shift-trimmed
91ae00cb 120 / } = (); # keys lowercased
0116f5dc 121
122our @ChangeOK = qw/
123 alternate backwards level normalization rearrange
124 katakana_before_hiragana upper_before_lower
125 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 126 hangul_terminator variable
0116f5dc 127 /;
128
129our @ChangeNG = qw/
91ae00cb 130 entry mapping table maxlength
131 ignoreChar ignoreName undefChar undefName variableTable
0116f5dc 132 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
e7f779c8 133 derivCode normCode rearrangeHash
9f1f04a1 134 backwardsFlag
0116f5dc 135 /;
9f1f04a1 136# The hash key 'ignored' is deleted at v 0.21.
137# The hash key 'isShift' is deleted at v 0.23.
138# The hash key 'combining' is deleted at v 0.24.
91ae00cb 139# The hash key 'entries' is deleted at v 0.30.
e7f779c8 140# The hash key 'L3_ignorable' is deleted at v 0.40.
91ae00cb 141
142sub version {
143 my $self = shift;
144 return $self->{versionTable} || 'unknown';
145}
0116f5dc 146
147my (%ChangeOK, %ChangeNG);
148@ChangeOK{ @ChangeOK } = ();
149@ChangeNG{ @ChangeNG } = ();
150
151sub change {
152 my $self = shift;
153 my %hash = @_;
154 my %old;
91ae00cb 155 if (exists $hash{variable} && exists $hash{alternate}) {
156 delete $hash{alternate};
157 }
158 elsif (!exists $hash{variable} && exists $hash{alternate}) {
159 $hash{variable} = $hash{alternate};
160 }
0116f5dc 161 foreach my $k (keys %hash) {
162 if (exists $ChangeOK{$k}) {
163 $old{$k} = $self->{$k};
164 $self->{$k} = $hash{$k};
165 }
166 elsif (exists $ChangeNG{$k}) {
167 croak "change of $k via change() is not allowed!";
168 }
169 # else => ignored
170 }
171 $self->checkCollator;
172 return wantarray ? %old : $self;
173}
a7fbee98 174
9f1f04a1 175sub _checkLevel {
176 my $level = shift;
abd1ec54 177 my $key = shift; # 'level' or 'backwards'
178 MinLevel <= $level or croak sprintf
179 "Illegal level %d (in value for key '%s') lower than %d.",
180 $level, $key, MinLevel;
181 $level <= MaxLevel or croak sprintf
182 "Unsupported level %d (in value for key '%s') higher than %d.",
183 $level, $key, MaxLevel;
9f1f04a1 184}
185
91ae00cb 186my %DerivCode = (
187 8 => \&_derivCE_8,
188 9 => \&_derivCE_9,
189 11 => \&_derivCE_9, # 11 == 9
190);
191
0116f5dc 192sub checkCollator {
193 my $self = shift;
9f1f04a1 194 _checkLevel($self->{level}, "level");
a7fbee98 195
91ae00cb 196 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
197 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 198
91ae00cb 199 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 200 $self->{alternateTable} || 'shifted';
91ae00cb 201 $self->{variable} = $self->{alternate} = lc($self->{variable});
202 exists $VariableOK{ $self->{variable} }
203 or croak "$PACKAGE unknown variable tag name: $self->{variable}";
0116f5dc 204
9f1f04a1 205 if (! defined $self->{backwards}) {
206 $self->{backwardsFlag} = 0;
207 }
208 elsif (! ref $self->{backwards}) {
209 _checkLevel($self->{backwards}, "backwards");
210 $self->{backwardsFlag} = 1 << $self->{backwards};
211 }
212 else {
213 my %level;
214 $self->{backwardsFlag} = 0;
215 for my $b (@{ $self->{backwards} }) {
216 _checkLevel($b, "backwards");
217 $level{$b} = 1;
218 }
219 for my $v (sort keys %level) {
220 $self->{backwardsFlag} += 1 << $v;
221 }
222 }
0116f5dc 223
91ae00cb 224 defined $self->{rearrange} or $self->{rearrange} = [];
225 ref $self->{rearrange}
226 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc 227
228 # keys of $self->{rearrangeHash} are $self->{rearrange}.
229 $self->{rearrangeHash} = undef;
230
231 if (@{ $self->{rearrange} }) {
232 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
233 }
234
235 $self->{normCode} = undef;
a7fbee98 236
237 if (defined $self->{normalization}) {
238 eval { require Unicode::Normalize };
91ae00cb 239 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 240
91ae00cb 241 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 242
91ae00cb 243 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
244 $self->{normCode} = \&Unicode::Normalize::NFD;
245 }
246 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f 247 my $norm = $self->{normalization};
248 $self->{normCode} = sub {
1d2654e1 249 Unicode::Normalize::normalize($norm, shift);
250 };
06c8fc8f 251 eval { $self->{normCode}->("") }; # try
252 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 253 }
a7fbee98 254 }
0116f5dc 255 return;
256}
257
258sub new
259{
260 my $class = shift;
261 my $self = bless { @_ }, $class;
45394607 262
a7fbee98 263 # If undef is passed explicitly, no file is read.
0116f5dc 264 $self->{table} = $KeyFile if ! exists $self->{table};
265 $self->read_table if defined $self->{table};
905aa9f0 266
a7fbee98 267 if ($self->{entry}) {
e7f779c8 268 while ($self->{entry} =~ /([^\n]+)/g) {
269 $self->parseEntry($1);
270 }
a7fbee98 271 }
905aa9f0 272
9f1f04a1 273 $self->{level} ||= MaxLevel;
0116f5dc 274 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 275
abd1ec54 276 $self->{overrideHangul} = FALSE
0116f5dc 277 if ! exists $self->{overrideHangul};
abd1ec54 278 $self->{overrideCJK} = FALSE
0116f5dc 279 if ! exists $self->{overrideCJK};
06c8fc8f 280 $self->{normalization} = 'NFD'
0116f5dc 281 if ! exists $self->{normalization};
0116f5dc 282 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 283 if ! exists $self->{rearrange};
0116f5dc 284 $self->{backwards} = $self->{backwardsTable}
285 if ! exists $self->{backwards};
a7fbee98 286
0116f5dc 287 $self->checkCollator;
a7fbee98 288
289 return $self;
290}
905aa9f0 291
292sub read_table {
a7fbee98 293 my $self = shift;
a7fbee98 294
e7f779c8 295 my($f, $fh);
296 foreach my $d (@INC) {
297 $f = File::Spec->catfile($d, @Path, $self->{table});
298 last if open($fh, $f);
299 $f = undef;
300 }
301 defined $f
302 or croak "$PACKAGE: $self->{table} is not found in @INC";
a7fbee98 303
e7f779c8 304 while (<$fh>) {
a7fbee98 305 next if /^\s*#/;
abd1ec54 306 unless (s/^\s*\@//) {
307 $self->parseEntry($_);
a7fbee98 308 next;
309 }
abd1ec54 310
311 if (/^version\s*(\S*)/) {
312 $self->{versionTable} ||= $1;
313 }
314 elsif (/^variable\s+(\S*)/) { # since UTS #10-9
315 $self->{variableTable} ||= $1;
316 }
317 elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
318 $self->{alternateTable} ||= $1;
319 }
320 elsif (/^backwards\s+(\S*)/) {
321 push @{ $self->{backwardsTable} }, $1;
322 }
323 elsif (/^forwards\s+(\S*)/) { # parhaps no use
324 push @{ $self->{forwardsTable} }, $1;
325 }
326 elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
327 push @{ $self->{rearrangeTable} }, _getHexArray($1);
328 }
45394607 329 }
e7f779c8 330 close $fh;
45394607 331}
332
905aa9f0 333
45394607 334##
335## get $line, parse it, and write an entry in $self
336##
337sub parseEntry
338{
a7fbee98 339 my $self = shift;
340 my $line = shift;
4d36a948 341 my($name, $entry, @uv, @key);
a7fbee98 342
343 return if $line !~ /^\s*[0-9A-Fa-f]/;
344
345 # removes comment and gets name
346 $name = $1
347 if $line =~ s/[#%]\s*(.*)//;
348 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
349
350 # gets element
351 my($e, $k) = split /;/, $line;
352 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
353 if ! $k;
354
4d36a948 355 @uv = _getHexArray($e);
356 return if !@uv;
357
358 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 359
4d36a948 360 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 361 my $ele = pack_U(@uv);
a7fbee98 362
4d36a948 363 # regarded as if it were not entried in the table
364 return
365 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 366
4d36a948 367 # replaced as completely ignorable
368 $k = '[.0000.0000.0000.0000]'
369 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 370 }
0116f5dc 371
4d36a948 372 # replaced as completely ignorable
373 $k = '[.0000.0000.0000.0000]'
374 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
375
4c843366 376 my $is_L3_ignorable = TRUE;
4d36a948 377
caffd4cf 378 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
379 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948 380 my @wt = _getHexArray($arr);
381 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 382 $is_L3_ignorable = FALSE
383 if $wt[0] + $wt[1] + $wt[2] != 0;
4d36a948 384 # if $arr !~ /[1-9A-Fa-f]/; NG
385 # Conformance Test shows L3-ignorable is completely ignorable.
4c843366 386 # For expansion, an entry $is_L3_ignorable
387 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 388 }
caffd4cf 389
e7f779c8 390 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
caffd4cf 391
91ae00cb 392 if (@uv > 1) {
393 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
394 and $self->{maxlength}{$uv[0]} = @uv;
395 }
45394607 396}
397
9f1f04a1 398
45394607 399##
abd1ec54 400## VCE = _varCE(variable term, VCE)
45394607 401##
abd1ec54 402sub _varCE
45394607 403{
abd1ec54 404 my $vbl = shift;
405 my $vce = shift;
406 if ($vbl eq 'non-ignorable') {
407 return $vce;
408 }
409 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
410
411 if ($var) {
412 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
413 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
414 }
415 elsif ($vbl eq 'blanked') {
416 return $vce;
417 }
418 else {
419 return pack(VCE_TEMPLATE, $var, @wt[0..2],
420 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
421 }
45394607 422}
423
45394607 424sub viewSortKey
425{
a7fbee98 426 my $self = shift;
9f1f04a1 427 $self->visualizeSortKey($self->getSortKey(@_));
428}
0116f5dc 429
9f1f04a1 430sub visualizeSortKey
431{
432 my $self = shift;
433 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 434
9f1f04a1 435 if ($self->{UCA_Version} <= 8) {
0116f5dc 436 $view =~ s/ ?0000 ?/|/g;
437 } else {
438 $view =~ s/\b0000\b/|/g;
439 }
a7fbee98 440 return "[$view]";
45394607 441}
442
d16e9e3d 443
45394607 444##
91ae00cb 445## arrayref of JCPS = splitEnt(string to be collated)
446## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 447##
91ae00cb 448sub splitEnt
45394607 449{
a7fbee98 450 my $self = shift;
4d36a948 451 my $wLen = $_[1];
452
a7fbee98 453 my $code = $self->{preprocess};
0116f5dc 454 my $norm = $self->{normCode};
91ae00cb 455 my $map = $self->{mapping};
a7fbee98 456 my $max = $self->{maxlength};
457 my $reH = $self->{rearrangeHash};
91ae00cb 458 my $ver9 = $self->{UCA_Version} >= 9;
a7fbee98 459
4d36a948 460 my ($str, @buf);
a7fbee98 461
4d36a948 462 if ($wLen) {
463 $code and croak "Preprocess breaks character positions. "
464 . "Don't use with index(), match(), etc.";
465 $norm and croak "Normalization breaks character positions. "
466 . "Don't use with index(), match(), etc.";
467 $str = $_[0];
468 }
469 else {
470 $str = $_[0];
471 $str = &$code($str) if ref $code;
472 $str = &$norm($str) if ref $norm;
473 }
a7fbee98 474
4d36a948 475 # get array of Unicode code point of string.
9f1f04a1 476 my @src = unpack_U($str);
4d36a948 477
478 # rearrangement:
479 # Character positions are not kept if rearranged,
480 # then neglected if $wLen is true.
481 if ($reH && ! $wLen) {
a7fbee98 482 for (my $i = 0; $i < @src; $i++) {
483 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
484 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
485 $i++;
486 }
487 }
45394607 488 }
45394607 489
abd1ec54 490 # To remove a character marked as a completely ignorable.
491 for (my $i = 0; $i < @src; $i++) {
492 $src[$i] = undef
e7f779c8 493 if _isIllegal($src[$i]) || ($ver9 &&
494 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
0116f5dc 495 }
496
a7fbee98 497 for (my $i = 0; $i < @src; $i++) {
91ae00cb 498 my $jcps = $src[$i];
abd1ec54 499 next if ! defined $jcps;
500 my $i_orig = $i;
4d36a948 501
91ae00cb 502 if ($max->{$jcps}) { # contract
503 my $temp_jcps = $jcps;
504 my $jcpsLen = 1;
505 my $maxLen = $max->{$jcps};
4d36a948 506
91ae00cb 507 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 508 next if ! defined $src[$p];
91ae00cb 509 $temp_jcps .= CODE_SEP . $src[$p];
510 $jcpsLen++;
511 if ($map->{$temp_jcps}) {
512 $jcps = $temp_jcps;
4d36a948 513 $i = $p;
514 }
515 }
4d36a948 516
06c8fc8f 517 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
518 # This process requires Unicode::Normalize.
91ae00cb 519 # If "normalization" is undef, here should be skipped *always*
06c8fc8f 520 # (in spite of bool value of $CVgetCombinClass),
521 # since canonical ordering cannot be expected.
522 # Blocked combining character should not be contracted.
523
524 if ($self->{normalization})
525 # $self->{normCode} is false in the case of "prenormalized".
526 {
527 my $preCC = 0;
528 my $curCC = 0;
529
530 for (my $p = $i + 1; $p < @src; $p++) {
531 next if ! defined $src[$p];
532 $curCC = $CVgetCombinClass->($src[$p]);
533 last unless $curCC;
534 my $tail = CODE_SEP . $src[$p];
91ae00cb 535 if ($preCC != $curCC && $map->{$jcps.$tail}) {
536 $jcps .= $tail;
06c8fc8f 537 $src[$p] = undef;
538 } else {
539 $preCC = $curCC;
540 }
4d36a948 541 }
a7fbee98 542 }
a7fbee98 543 }
544
4d36a948 545 if ($wLen) {
abd1ec54 546 for (; $i + 1 < @src; $i++) {
547 last if defined $src[$i + 1];
a7fbee98 548 }
549 }
4d36a948 550
91ae00cb 551 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 552 }
4d36a948 553 return \@buf;
d16e9e3d 554}
45394607 555
d16e9e3d 556
557##
abd1ec54 558## list of VCE = getWt(JCPS)
d16e9e3d 559##
560sub getWt
561{
a7fbee98 562 my $self = shift;
91ae00cb 563 my $u = shift;
abd1ec54 564 my $vbl = $self->{variable};
91ae00cb 565 my $map = $self->{mapping};
0116f5dc 566 my $der = $self->{derivCode};
a7fbee98 567
91ae00cb 568 return if !defined $u;
abd1ec54 569 return map(_varCE($vbl, $_), @{ $map->{$u} })
91ae00cb 570 if $map->{$u};
a7fbee98 571
91ae00cb 572 # JCPS must not be a contraction, then it's a code point.
573 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1 574 my $hang = $self->{overrideHangul};
575 my @hangulCE;
576 if ($hang) {
577 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
578 }
579 elsif (!defined $hang) {
580 @hangulCE = $der->($u);
581 }
582 else {
583 my $max = $self->{maxlength};
584 my @decH = _decompHangul($u);
585
586 if (@decH == 2) {
587 my $contract = join(CODE_SEP, @decH);
91ae00cb 588 @decH = ($contract) if $map->{$contract};
1d2654e1 589 } else { # must be <@decH == 3>
590 if ($max->{$decH[0]}) {
591 my $contract = join(CODE_SEP, @decH);
91ae00cb 592 if ($map->{$contract}) {
1d2654e1 593 @decH = ($contract);
594 } else {
595 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 596 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1 597 }
598 # even if V's ignorable, LT contraction is not supported.
599 # If such a situatution were required, NFD should be used.
600 }
601 if (@decH == 3 && $max->{$decH[1]}) {
602 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 603 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1 604 }
605 }
606
607 @hangulCE = map({
91ae00cb 608 $map->{$_} ? @{ $map->{$_} } : $der->($_);
1d2654e1 609 } @decH);
610 }
abd1ec54 611 return map _varCE($vbl, $_), @hangulCE;
a7fbee98 612 }
91ae00cb 613 elsif (CJK_UidIni <= $u && $u <= CJK_UidFin ||
614 CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
615 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
1d2654e1 616 my $cjk = $self->{overrideCJK};
abd1ec54 617 return map _varCE($vbl, $_),
0116f5dc 618 $cjk
4d36a948 619 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
91ae00cb 620 : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
9f1f04a1 621 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
0116f5dc 622 : $der->($u);
a7fbee98 623 }
624 else {
abd1ec54 625 return map _varCE($vbl, $_), $der->($u);
a7fbee98 626 }
d16e9e3d 627}
628
d16e9e3d 629
630##
631## string sortkey = getSortKey(string arg)
632##
633sub getSortKey
634{
a7fbee98 635 my $self = shift;
636 my $lev = $self->{level};
91ae00cb 637 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
638 my $ver9 = $self->{UCA_Version} >= 9;
abd1ec54 639 my $v2i = $ver9 && $self->{variable} ne 'non-ignorable';
91ae00cb 640
abd1ec54 641 my @buf; # weight arrays
91ae00cb 642 if ($self->{hangul_terminator}) {
643 my $preHST = '';
644 foreach my $jcps (@$rEnt) {
645 # weird things like VL, TL-contraction are not considered!
646 my $curHST = '';
647 foreach my $u (split /;/, $jcps) {
648 $curHST .= getHST($u);
649 }
650 if ($preHST && !$curHST || # hangul before non-hangul
651 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
652 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
653 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
0116f5dc 654
abd1ec54 655 push @buf, $self->getWtHangulTerm();
91ae00cb 656 }
657 $preHST = $curHST;
658
abd1ec54 659 push @buf, $self->getWt($jcps);
91ae00cb 660 }
661 $preHST # end at hangul
abd1ec54 662 and push @buf, $self->getWtHangulTerm();
91ae00cb 663 }
664 else {
665 foreach my $jcps (@$rEnt) {
abd1ec54 666 push @buf, $self->getWt($jcps);
91ae00cb 667 }
668 }
669
abd1ec54 670 # make sort key
671 my @ret = ([],[],[],[]);
672 my $last_is_variable;
673
674 foreach my $vwt (@buf) {
675 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
676 if ($v2i) {
677 if ($var) {
678 $last_is_variable = TRUE;
679 }
680 elsif (!$wt[0]) { # ignorable
4d36a948 681 next if $last_is_variable;
abd1ec54 682 }
683 else {
684 $last_is_variable = FALSE;
0116f5dc 685 }
686 }
abd1ec54 687 foreach my $v (0..$lev-1) {
688 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
a7fbee98 689 }
690 }
45394607 691
a7fbee98 692 # modification of tertiary weights
693 if ($self->{upper_before_lower}) {
694 foreach (@{ $ret[2] }) {
695 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
696 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
697 elsif ($_ == 0x1C) { $_ += 1 } # square upper
698 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
699 }
45394607 700 }
a7fbee98 701 if ($self->{katakana_before_hiragana}) {
702 foreach (@{ $ret[2] }) {
703 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
704 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
705 }
45394607 706 }
9f1f04a1 707
708 if ($self->{backwardsFlag}) {
709 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
710 if ($self->{backwardsFlag} & (1 << $v)) {
711 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
712 }
713 }
714 }
715
4d36a948 716 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 717}
718
719
720##
d16e9e3d 721## int compare = cmp(string a, string b)
45394607 722##
5398038e 723sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
724sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
725sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
726sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
727sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
728sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
729sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 730
731##
d16e9e3d 732## list[strings] sorted = sort(list[strings] arg)
45394607 733##
a7fbee98 734sub sort {
735 my $obj = shift;
736 return
737 map { $_->[1] }
738 sort{ $a->[0] cmp $b->[0] }
739 map [ $obj->getSortKey($_), $_ ], @_;
45394607 740}
741
0116f5dc 742
4d36a948 743sub _derivCE_9 {
0116f5dc 744 my $u = shift;
745 my $base =
91ae00cb 746 (CJK_UidIni <= $u && $u <= CJK_UidFin)
4d36a948 747 ? 0xFB40 : # CJK
91ae00cb 748 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
749 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
4d36a948 750 ? 0xFB80 # CJK ext.
751 : 0xFBC0; # others
0116f5dc 752
753 my $aaaa = $base + ($u >> 15);
754 my $bbbb = ($u & 0x7FFF) | 0x8000;
755 return
9f1f04a1 756 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
757 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc 758}
759
4d36a948 760sub _derivCE_8 {
0116f5dc 761 my $code = shift;
762 my $aaaa = 0xFF80 + ($code >> 15);
763 my $bbbb = ($code & 0x7FFF) | 0x8000;
764 return
4d36a948 765 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
766 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607 767}
768
91ae00cb 769
abd1ec54 770sub getWtHangulTerm {
91ae00cb 771 my $self = shift;
abd1ec54 772 return _varCE($self->{variable},
773 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
91ae00cb 774}
775
776
45394607 777##
778## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
779##
a7fbee98 780sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 781
a7fbee98 782#
4d36a948 783# $code *must* be in Hangul syllable.
a7fbee98 784# Check it before you enter here.
785#
5398038e 786sub _decompHangul {
787 my $code = shift;
91ae00cb 788 my $SIndex = $code - Hangul_SBase;
789 my $LIndex = int( $SIndex / Hangul_NCount);
790 my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
791 my $TIndex = $SIndex % Hangul_TCount;
5398038e 792 return (
91ae00cb 793 Hangul_LBase + $LIndex,
794 Hangul_VBase + $VIndex,
795 $TIndex ? (Hangul_TBase + $TIndex) : (),
5398038e 796 );
45394607 797}
798
10d7ec48 799sub _isIllegal {
4d36a948 800 my $code = shift;
801 return ! defined $code # removed
802 || ($code < 0 || 0x10FFFF < $code) # out of range
803 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
804 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
805 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
806 ;
807}
808
91ae00cb 809# Hangul Syllable Type
810sub getHST {
811 my $u = shift;
812 return
813 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
814 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
815 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
816 Hangul_SIni <= $u && $u <= Hangul_SFin ?
817 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
818}
819
4d36a948 820
821##
822## bool _nonIgnorAtLevel(arrayref weights, int level)
823##
824sub _nonIgnorAtLevel($$)
825{
826 my $wt = shift;
827 return if ! defined $wt;
828 my $lv = shift;
9f1f04a1 829 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948 830}
831
832##
833## bool _eqArray(
834## arrayref of arrayref[weights] source,
835## arrayref of arrayref[weights] substr,
836## int level)
837## * comparison of graphemes vs graphemes.
838## @$source >= @$substr must be true (check it before call this);
839##
840sub _eqArray($$$)
841{
842 my $source = shift;
843 my $substr = shift;
844 my $lev = shift;
845
846 for my $g (0..@$substr-1){
847 # Do the $g'th graphemes have the same number of AV weigths?
848 return if @{ $source->[$g] } != @{ $substr->[$g] };
849
850 for my $w (0..@{ $substr->[$g] }-1) {
851 for my $v (0..$lev-1) {
852 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
853 }
854 }
855 }
856 return 1;
857}
858
859##
860## (int position, int length)
861## int position = index(string, substring, position, [undoc'ed grobal])
862##
863## With "grobal" (only for the list context),
864## returns list of arrayref[position, length].
865##
866sub index
867{
91ae00cb 868 my $self = shift;
869 my $str = shift;
870 my $len = length($str);
871 my $subE = $self->splitEnt(shift);
872 my $pos = @_ ? shift : 0;
873 $pos = 0 if $pos < 0;
874 my $grob = shift;
875
876 my $lev = $self->{level};
877 my $ver9 = $self->{UCA_Version} >= 9;
878 my $v2i = $self->{variable} ne 'non-ignorable';
879
880 if (! @$subE) {
4d36a948 881 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
882 return $grob
883 ? map([$_, 0], $temp..$len)
884 : wantarray ? ($temp,0) : $temp;
885 }
abd1ec54 886 $len < $pos
887 and return wantarray ? () : NOMATCHPOS;
91ae00cb 888 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54 889 @$strE
890 or return wantarray ? () : NOMATCHPOS;
891
4d36a948 892 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
893
abd1ec54 894 my $last_is_variable;
895 for my $vwt (map $self->getWt($_), @$subE) {
896 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
897 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 898
899 if ($v2i && $ver9) {
abd1ec54 900 if ($var) {
901 $last_is_variable = TRUE;
902 }
903 elsif (!$wt[0]) { # ignorable
4d36a948 904 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54 905 }
906 else {
907 $last_is_variable = FALSE;
4d36a948 908 }
909 }
910
abd1ec54 911 if (@subWt && !$var && !$wt[0]) {
912 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
4d36a948 913 } else {
abd1ec54 914 push @subWt, [ \@wt ];
4d36a948 915 }
916 }
917
918 my $count = 0;
91ae00cb 919 my $end = @$strE - 1;
4d36a948 920
abd1ec54 921 $last_is_variable = FALSE; # reuse
4d36a948 922 for (my $i = 0; $i <= $end; ) { # no $i++
923 my $found_base = 0;
924
925 # fetch a grapheme
926 while ($i <= $end && $found_base == 0) {
abd1ec54 927 for my $vwt ($self->getWt($strE->[$i][0])) {
928 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
929 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 930
931 if ($v2i && $ver9) {
abd1ec54 932 if ($var) {
933 $last_is_variable = TRUE;
934 }
935 elsif (!$wt[0]) { # ignorable
4d36a948 936 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54 937 }
938 else {
939 $last_is_variable = FALSE;
4d36a948 940 }
941 }
942
abd1ec54 943 if (@strWt && !$var && !$wt[0]) {
944 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 945 $finPos[-1] = $strE->[$i][2];
4d36a948 946 } elsif ($to_be_pushed) {
abd1ec54 947 push @strWt, [ \@wt ];
91ae00cb 948 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 949 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 950 push @finPos, $strE->[$i][2];
4d36a948 951 $found_base++;
952 }
953 # else ===> no-op
954 }
955 $i++;
956 }
957
958 # try to match
959 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
960 if ($iniPos[0] != NOMATCHPOS &&
961 $finPos[$#subWt] != NOMATCHPOS &&
962 _eqArray(\@strWt, \@subWt, $lev)) {
963 my $temp = $iniPos[0] + $pos;
964
965 if ($grob) {
966 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
967 splice @strWt, 0, $#subWt;
968 splice @iniPos, 0, $#subWt;
969 splice @finPos, 0, $#subWt;
970 }
971 else {
972 return wantarray
973 ? ($temp, $finPos[$#subWt] - $iniPos[0])
974 : $temp;
975 }
976 }
977 shift @strWt;
978 shift @iniPos;
979 shift @finPos;
980 }
981 }
982
983 return $grob
984 ? @g_ret
985 : wantarray ? () : NOMATCHPOS;
986}
987
988##
989## scalarref to matching part = match(string, substring)
990##
991sub match
992{
993 my $self = shift;
994 if (my($pos,$len) = $self->index($_[0], $_[1])) {
995 my $temp = substr($_[0], $pos, $len);
996 return wantarray ? $temp : \$temp;
997 # An lvalue ref \substr should be avoided,
998 # since its value is affected by modification of its referent.
999 }
1000 else {
1001 return;
1002 }
1003}
1004
1005##
1006## arrayref matching parts = gmatch(string, substring)
1007##
1008sub gmatch
1009{
1010 my $self = shift;
1011 my $str = shift;
1012 my $sub = shift;
1013 return map substr($str, $_->[0], $_->[1]),
1014 $self->index($str, $sub, 0, 'g');
1015}
1016
1017##
1018## bool subst'ed = subst(string, substring, replace)
1019##
1020sub subst
1021{
1022 my $self = shift;
1023 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1024
1025 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1026 if ($code) {
1027 my $mat = substr($_[0], $pos, $len);
1028 substr($_[0], $pos, $len, $code->($mat));
1029 } else {
1030 substr($_[0], $pos, $len, $_[2]);
1031 }
1032 return TRUE;
1033 }
1034 else {
1035 return FALSE;
1036 }
1037}
1038
1039##
1040## int count = gsubst(string, substring, replace)
1041##
1042sub gsubst
1043{
1044 my $self = shift;
1045 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1046 my $cnt = 0;
1047
1048 # Replacement is carried out from the end, then use reverse.
1049 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1050 if ($code) {
1051 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1052 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1053 } else {
1054 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1055 }
1056 $cnt++;
1057 }
1058 return $cnt;
1059}
1060
45394607 10611;
1062__END__
1063
1064=head1 NAME
1065
a7fbee98 1066Unicode::Collate - Unicode Collation Algorithm
45394607 1067
1068=head1 SYNOPSIS
1069
1070 use Unicode::Collate;
1071
1072 #construct
5398038e 1073 $Collator = Unicode::Collate->new(%tailoring);
45394607 1074
1075 #sort
5398038e 1076 @sorted = $Collator->sort(@not_sorted);
45394607 1077
1078 #compare
a7fbee98 1079 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1080
91ae00cb 1081 # If %tailoring is false (i.e. empty),
1082 # $Collator should do the default collation.
1083
45394607 1084=head1 DESCRIPTION
1085
4d36a948 1086This module is an implementation
1087of Unicode Technical Standard #10 (UTS #10)
1088"Unicode Collation Algorithm."
1089
45394607 1090=head2 Constructor and Tailoring
1091
d16e9e3d 1092The C<new> method returns a collator object.
1093
5398038e 1094 $Collator = Unicode::Collate->new(
0116f5dc 1095 UCA_Version => $UCA_Version,
91ae00cb 1096 alternate => $alternate, # deprecated: use of 'variable' is recommended.
45394607 1097 backwards => $levelNumber, # or \@levelNumbers
1098 entry => $element,
91ae00cb 1099 hangul_terminator => $term_primary_weight,
45394607 1100 ignoreName => qr/$ignoreName/,
1101 ignoreChar => qr/$ignoreChar/,
1102 katakana_before_hiragana => $bool,
1103 level => $collationLevel,
91ae00cb 1104 normalization => $normalization_form,
45394607 1105 overrideCJK => \&overrideCJK,
1106 overrideHangul => \&overrideHangul,
1107 preprocess => \&preprocess,
1108 rearrange => \@charList,
1109 table => $filename,
1110 undefName => qr/$undefName/,
1111 undefChar => qr/$undefChar/,
1112 upper_before_lower => $bool,
91ae00cb 1113 variable => $variable,
45394607 1114 );
45394607 1115
1116=over 4
1117
0116f5dc 1118=item UCA_Version
1119
91ae00cb 1120If the tracking version number of the older UCA is given,
1121the older behavior of that tracking version is emulated on collating.
0116f5dc 1122If omitted, the return value of C<UCA_Version()> is used.
1123
91ae00cb 1124The supported tracking version: 8, 9, or 11.
0116f5dc 1125
1126B<This parameter may be removed in the future version,
1127as switching the algorithm would affect the performance.>
1128
45394607 1129=item backwards
1130
4d36a948 1131-- see 3.1.2 French Accents, UTS #10.
45394607 1132
1133 backwards => $levelNumber or \@levelNumbers
1134
1135Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1136If omitted, forwards at all the levels.
1137
1138=item entry
1139
4d36a948 1140-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1141
91ae00cb 1142If the same character (or a sequence of characters) exists
1143in the collation element table through C<table>,
1144mapping to collation elements is overrided.
1145If it does not exist, the mapping is defined additionally.
45394607 1146
abd1ec54 1147 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11480063 0068 ; [.0E6A.0020.0002.0063] # ch
11490043 0068 ; [.0E6A.0020.0007.0043] # Ch
11500043 0048 ; [.0E6A.0020.0008.0043] # CH
1151006C 006C ; [.0F4C.0020.0002.006C] # ll
1152004C 006C ; [.0F4C.0020.0007.004C] # Ll
1153004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8 115400F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1155006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
115600D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1157004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54 1158ENTRY
1159
1160 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
116100E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
116200C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1163ENTRY
45394607 1164
4d36a948 1165B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54 1166B<must> be a Unicode code point (defined as hexadecimal),
1167but not a native code point.
4d36a948 1168So C<0063> must always denote C<U+0063>,
1169but not a character of C<"\x63">.
1170
abd1ec54 1171Weighting may vary depending on collation element table.
1172So ensure the weights defined in C<entry> will be consistent with
1173those in the collation element table loaded via C<table>.
1174
1175In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1176and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1177(as a value between C<0E60> and C<0E6D>)
1178makes ordering as C<C E<lt> CH E<lt> D>.
1179Exactly speaking DUCET already has some characters between C<C> and C<D>:
1180C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1181C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1182and C<c-curl> (C<U+0255>) with C<0E69>.
1183Then primary weight C<0E6A> for C<CH> makes C<CH>
1184ordered between C<c-curl> and C<D>.
1185
91ae00cb 1186=item hangul_terminator
1187
1188-- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
1189
1190If a true value is given (non-zero but should be positive),
1191it will be added as a terminator primary weight to the end of
1192every standard Hangul syllable. Secondary and any higher weights
1193for terminator are set to zero.
1194If the value is false or C<hangul_terminator> key does not exist,
1195insertion of terminator weights will not be performed.
1196
1197Boundaries of Hangul syllables are determined
1198according to conjoining Jamo behavior in F<the Unicode Standard>
1199and F<HangulSyllableType.txt>.
1200
1201B<Implementation Note:>
1202(1) For expansion mapping (Unicode character mapped
1203to a sequence of collation elements), a terminator will not be added
1204between collation elements, even if Hangul syllable boundary exists there.
1205Addition of terminator is restricted to the next position
1206to the last collation element.
1207
1208(2) Non-conjoining Hangul letters
1209(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1210automatically terminated with a terminator primary weight.
1211These characters may need terminator included in a collation element
1212table beforehand.
1213
45394607 1214=item ignoreName
1215
1216=item ignoreChar
1217
4d36a948 1218-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
45394607 1219
caffd4cf 1220Makes the entry in the table completely ignorable;
1221i.e. as if the weights were zero at all level.
45394607 1222
a7fbee98 1223E.g. when 'a' and 'e' are ignorable,
45394607 1224'element' is equal to 'lament' (or 'lmnt').
1225
1226=item level
1227
4d36a948 1228-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1229
1230Set the maximum level.
1231Any higher levels than the specified one are ignored.
1232
1233 Level 1: alphabetic ordering
1234 Level 2: diacritic ordering
1235 Level 3: case ordering
91ae00cb 1236 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607 1237
1238 ex.level => 2,
1239
a7fbee98 1240If omitted, the maximum is the 4th.
1241
45394607 1242=item normalization
1243
4d36a948 1244-- see 4.1 Normalize each input string, UTS #10.
45394607 1245
905aa9f0 1246If specified, strings are normalized before preparation of sort keys
45394607 1247(the normalization is executed after preprocess).
1248
1d2654e1 1249A form name C<Unicode::Normalize::normalize()> accepts will be applied
1250as C<$normalization_form>.
06c8fc8f 1251Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1 1252See C<Unicode::Normalize::normalize()> for detail.
1253If omitted, C<'NFD'> is used.
45394607 1254
91ae00cb 1255C<normalization> is performed after C<preprocess> (if defined).
45394607 1256
06c8fc8f 1257Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1258though they are not concerned with C<Unicode::Normalize::normalize()>.
1259
1260If C<undef> (not a string C<"undef">) is passed explicitly
1261as the value for this key,
45394607 1262any normalization is not carried out (this may make tailoring easier
abd1ec54 1263if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1264only contiguous contractions are resolved;
1265e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1266C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f 1267In this point,
1268C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1269B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1270
1271In the case of C<(normalization =E<gt> "prenormalized")>,
1272any normalization is not performed, but
1273non-contiguous contractions with combining characters are performed.
1274Therefore
1275C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1276B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1277If source strings are finely prenormalized,
1278C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1279
1280Except C<(normalization =E<gt> undef)>,
1281B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607 1282
1283=item overrideCJK
1284
4d36a948 1285-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1286
91ae00cb 1287By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1288(but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than
1289C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
1290C<U+20000> to C<U+2A6D6>].
1291
1292Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
45394607 1293
a7fbee98 1294ex. CJK Unified Ideographs in the JIS code point order.
45394607 1295
1296 overrideCJK => sub {
a7fbee98 1297 my $u = shift; # get a Unicode codepoint
1298 my $b = pack('n', $u); # to UTF-16BE
1299 my $s = your_unicode_to_sjis_converter($b); # convert
1300 my $n = unpack('n', $s); # convert sjis to short
1301 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1302 },
1303
a7fbee98 1304ex. ignores all CJK Unified Ideographs.
1305
1306 overrideCJK => sub {()}, # CODEREF returning empty list
1307
1308 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1309 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1310
1311If C<undef> is passed explicitly as the value for this key,
1312weights for CJK Unified Ideographs are treated as undefined.
1313But assignment of weight for CJK Unified Ideographs
91ae00cb 1314in table or C<entry> is still valid.
a7fbee98 1315
1316=item overrideHangul
1317
4d36a948 1318-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1319
abd1ec54 1320By default, Hangul Syllables are decomposed into Hangul Jamo,
1321even if C<(normalization =E<gt> undef)>.
a7fbee98 1322But the mapping of Hangul Syllables may be overrided.
1323
91ae00cb 1324This tag works like C<overrideCJK>, so see there for examples.
a7fbee98 1325
45394607 1326If you want to override the mapping of Hangul Syllables,
abd1ec54 1327NFD, NFKD, and FCD are not appropriate,
1328since they will decompose Hangul Syllables before overriding.
45394607 1329
a7fbee98 1330If C<undef> is passed explicitly as the value for this key,
1331weight for Hangul Syllables is treated as undefined
1332without decomposition into Hangul Jamo.
1333But definition of weight for Hangul Syllables
91ae00cb 1334in table or C<entry> is still valid.
a7fbee98 1335
45394607 1336=item preprocess
1337
4d36a948 1338-- see 5.1 Preprocessing, UTS #10.
45394607 1339
1340If specified, the coderef is used to preprocess
1341before the formation of sort keys.
1342
a7fbee98 1343ex. dropping English articles, such as "a" or "the".
45394607 1344Then, "the pen" is before "a pencil".
1345
1346 preprocess => sub {
1347 my $str = shift;
a7fbee98 1348 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1349 return $str;
45394607 1350 },
1351
91ae00cb 1352C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1353
45394607 1354=item rearrange
1355
4d36a948 1356-- see 3.1.3 Rearrangement, UTS #10.
45394607 1357
1358Characters that are not coded in logical order and to be rearranged.
a7fbee98 1359By default,
45394607 1360
1361 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1362
a7fbee98 1363If you want to disallow any rearrangement,
1364pass C<undef> or C<[]> (a reference to an empty list)
1365as the value for this key.
1366
0116f5dc 1367B<According to the version 9 of UCA, this parameter shall not be used;
1368but it is not warned at present.>
1369
45394607 1370=item table
1371
4d36a948 1372-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1373
91ae00cb 1374You can use another collation element table if desired.
45394607 1375
e7f779c8 1376The table file should locate in the F<Unicode/Collate> directory
1377on C<@INC>. Say, if the filename is F<Foo.txt>
1378the table file is searched as F<Unicode/Collate/Foo.txt> in <@INC>.
1379
1380By default, F<allkeys.txt> (as the filename of DUCET) is used.
45394607 1381
a7fbee98 1382If C<undef> is passed explicitly as the value for this key,
91ae00cb 1383no file is read (but you can define collation elements via C<entry>).
a7fbee98 1384
1385A typical way to define a collation element table
1386without any file of table:
1387
1388 $onlyABC = Unicode::Collate->new(
1389 table => undef,
1390 entry => << 'ENTRIES',
13910061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
13920041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
13930062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
13940042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
13950063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
13960043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1397ENTRIES
1398 );
905aa9f0 1399
45394607 1400=item undefName
1401
1402=item undefChar
1403
4d36a948 1404-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1405
1406Undefines the collation element as if it were unassigned in the table.
1407This reduces the size of the table.
1408If an unassigned character appears in the string to be collated,
1409the sort key is made from its codepoint
1410as a single-character collation element,
1411as it is greater than any other assigned collation elements
1412(in the codepoint order among the unassigned characters).
1413But, it'd be better to ignore characters
1414unfamiliar to you and maybe never used.
1415
e7f779c8 1416ex. Collation weights for beyond-BMP characters are not stored in object:
1417
1418 undefChar => qr/[^\0-\x{fffd}]/,
1419
45394607 1420=item katakana_before_hiragana
1421
1422=item upper_before_lower
1423
4d36a948 1424-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
45394607 1425
1426By default, lowercase is before uppercase
1427and hiragana is before katakana.
1428
a7fbee98 1429If the tag is made true, this is reversed.
1430
1431B<NOTE>: These tags simplemindedly assume
1432any lowercase/uppercase or hiragana/katakana distinctions
9f1f04a1 1433must occur in level 3, and their weights at level 3
1434must be same as those mentioned in 7.3.1, UTS #10.
1435If you define your collation elements which violate this requirement,
4d36a948 1436these tags don't work validly.
45394607 1437
91ae00cb 1438=item variable
1439
1440=item alternate
1441
1442-- see 3.2.2 Variable Weighting, UTS #10.
1443
1444(the title in UCA version 8: Alternate Weighting)
1445
1446This key allows to variable weighting for variable collation elements,
1447which are marked with an ASTERISK in the table
1448(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1449
1450 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1451
1452These names are case-insensitive.
1453By default (if specification is omitted), 'shifted' is adopted.
1454
1455 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1456 considered at the 4th level.
1457
abd1ec54 1458 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb 1459
1460 'Shifted' Variable elements are made ignorable at levels 1 through 3
1461 their level 4 weight is replaced by the old level 1 weight.
1462 Level 4 weight for Non-Variable elements is 0xFFFF.
1463
1464 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1465 are trimmed.
1466
1467For backward compatibility, C<alternate> can be used as an alias
1468for C<variable>.
1469
45394607 1470=back
1471
3164dd77 1472=head2 Methods for Collation
45394607 1473
1474=over 4
1475
5398038e 1476=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1477
1478Sorts a list of strings.
1479
5398038e 1480=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1481
1482Returns 1 (when C<$a> is greater than C<$b>)
1483or 0 (when C<$a> is equal to C<$b>)
1484or -1 (when C<$a> is lesser than C<$b>).
1485
5398038e 1486=item C<$result = $Collator-E<gt>eq($a, $b)>
1487
1488=item C<$result = $Collator-E<gt>ne($a, $b)>
1489
1490=item C<$result = $Collator-E<gt>lt($a, $b)>
1491
1492=item C<$result = $Collator-E<gt>le($a, $b)>
1493
1494=item C<$result = $Collator-E<gt>gt($a, $b)>
1495
1496=item C<$result = $Collator-E<gt>ge($a, $b)>
1497
a7fbee98 1498They works like the same name operators as theirs.
5398038e 1499
1500 eq : whether $a is equal to $b.
1501 ne : whether $a is not equal to $b.
1502 lt : whether $a is lesser than $b.
1503 le : whether $a is lesser than $b or equal to $b.
1504 gt : whether $a is greater than $b.
1505 ge : whether $a is greater than $b or equal to $b.
1506
1507=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1508
4d36a948 1509-- see 4.3 Form a sort key for each string, UTS #10.
45394607 1510
1511Returns a sort key.
1512
1513You compare the sort keys using a binary comparison
1514and get the result of the comparison of the strings using UCA.
1515
5398038e 1516 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1517
1518 is equivalent to
1519
5398038e 1520 $Collator->cmp($a, $b)
45394607 1521
a7fbee98 1522=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1523
a7fbee98 1524 use Unicode::Collate;
1525 my $c = Unicode::Collate->new();
1526 print $c->viewSortKey("Perl"),"\n";
1527
0116f5dc 1528 # output:
1529 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1530 # Level 1 Level 2 Level 3 Level 4
1531
1532 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1533
4d36a948 1534=back
1535
1536=head2 Methods for Searching
d16e9e3d 1537
4d36a948 1538B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1539for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1540C<subst>, C<gsubst>) is croaked,
1541as the position and the length might differ
1542from those on the specified string.
91ae00cb 1543(And C<rearrange> and C<hangul_terminator> tags are neglected.)
d16e9e3d 1544
4d36a948 1545The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1546like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1547but they are not aware of any pattern, but only a literal substring.
1548
1549=over 4
1550
1551=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1552
1553=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1554
1555If C<$substring> matches a part of C<$string>, returns
1556the position of the first occurrence of the matching part in scalar context;
1557in list context, returns a two-element list of
1558the position and the length of the matching part.
1559
d16e9e3d 1560If C<$substring> does not match any part of C<$string>,
1561returns C<-1> in scalar context and
1562an empty list in list context.
1563
1564e.g. you say
1565
5398038e 1566 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1567 # (normalization => undef) is REQUIRED.
1568 my $str = "Ich muß studieren Perl.";
1569 my $sub = "MÜSS";
d16e9e3d 1570 my $match;
a7fbee98 1571 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1572 $match = substr($str, $pos, $len);
d16e9e3d 1573 }
1574
4d36a948 1575and get C<"muß"> in C<$match> since C<"muß">
1576is primary equal to C<"MÜSS">.
1577
1578=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1579
1580=item C<($match) = $Collator-E<gt>match($string, $substring)>
1581
1582If C<$substring> matches a part of C<$string>, in scalar context, returns
1583B<a reference to> the first occurrence of the matching part
1584(C<$match_ref> is always true if matches,
1585since every reference is B<true>);
1586in list context, returns the first occurrence of the matching part.
1587
1588If C<$substring> does not match any part of C<$string>,
1589returns C<undef> in scalar context and
1590an empty list in list context.
1591
1592e.g.
1593
1594 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1595 print "matches [$$match_ref].\n";
1596 } else {
1597 print "doesn't match.\n";
1598 }
1599
1600 or
1601
1602 if (($match) = $Collator->match($str, $sub)) { # list context
1603 print "matches [$match].\n";
1604 } else {
1605 print "doesn't match.\n";
1606 }
1607
1608=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1609
1610If C<$substring> matches a part of C<$string>, returns
1611all the matching parts (or matching count in scalar context).
1612
1613If C<$substring> does not match any part of C<$string>,
1614returns an empty list.
1615
1616=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1617
1618If C<$substring> matches a part of C<$string>,
1619the first occurrence of the matching part is replaced by C<$replacement>
1620(C<$string> is modified) and return C<$count> (always equals to C<1>).
1621
1622C<$replacement> can be a C<CODEREF>,
1623taking the matching part as an argument,
1624and returning a string to replace the matching part
1625(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1626
1627=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1628
1629If C<$substring> matches a part of C<$string>,
1630all the occurrences of the matching part is replaced by C<$replacement>
1631(C<$string> is modified) and return C<$count>.
1632
1633C<$replacement> can be a C<CODEREF>,
1634taking the matching part as an argument,
1635and returning a string to replace the matching part
1636(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1637
1638e.g.
1639
1640 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1641 # (normalization => undef) is REQUIRED.
1642 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1643 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1644
1645 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1646 # i.e., all the camels are made bold-faced.
d16e9e3d 1647
45394607 1648=back
1649
3164dd77 1650=head2 Other Methods
1651
1652=over 4
1653
0116f5dc 1654=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1655
1656Change the value of specified keys and returns the changed part.
1657
1658 $Collator = Unicode::Collate->new(level => 4);
1659
1660 $Collator->eq("perl", "PERL"); # false
1661
1662 %old = $Collator->change(level => 2); # returns (level => 4).
1663
1664 $Collator->eq("perl", "PERL"); # true
1665
1666 $Collator->change(%old); # returns (level => 2).
1667
1668 $Collator->eq("perl", "PERL"); # false
1669
1670Not all C<(key,value)>s are allowed to be changed.
1671See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1672
1673In the scalar context, returns the modified collator
1674(but it is B<not> a clone from the original).
1675
1676 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1677
1678 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1679
1680 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1681
91ae00cb 1682=item C<$version = $Collator-E<gt>version()>
3164dd77 1683
91ae00cb 1684Returns the version number (a string) of the Unicode Standard
1685which the C<table> file used by the collator object is based on.
1686If the table does not include a version line (starting with C<@version>),
1687returns C<"unknown">.
1688
1689=item C<UCA_Version()>
3164dd77 1690
91ae00cb 1691Returns the tracking version number of UTS #10 this module consults.
3164dd77 1692
91ae00cb 1693=item C<Base_Unicode_Version()>
1694
1695Returns the version number of UTS #10 this module consults.
3164dd77 1696
1697=back
1698
45394607 1699=head2 EXPORT
1700
1701None by default.
1702
1703=head2 CAVEAT
1704
1705Use of the C<normalization> parameter requires
1706the B<Unicode::Normalize> module.
1707
5398038e 1708If you need not it (say, in the case when you need not
45394607 1709handle any combining characters),
1710assign C<normalization =E<gt> undef> explicitly.
1711
4d36a948 1712-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1713
0116f5dc 1714=head2 Conformance Test
1715
10d7ec48 1716The Conformance Test for the UCA is available
1717under L<http://www.unicode.org/Public/UCA/>.
0116f5dc 1718
1719For F<CollationTest_SHIFTED.txt>,
1720a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1721for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 1722C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 1723
4d36a948 1724B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1725
45394607 1726=head1 AUTHOR
1727
10d7ec48 1728SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
45394607 1729
1730 http://homepage1.nifty.com/nomenclator/perl/
1731
e7f779c8 1732 Copyright(C) 2001-2004, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1733
a7fbee98 1734 This library is free software; you can redistribute it
1735 and/or modify it under the same terms as Perl itself.
45394607 1736
1737=head1 SEE ALSO
1738
1739=over 4
1740
91ae00cb 1741=item Unicode Collation Algorithm - UTS #10
1742
1743L<http://www.unicode.org/reports/tr10/>
1744
1745=item The Default Unicode Collation Element Table (DUCET)
1746
10d7ec48 1747L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 1748
91ae00cb 1749=item The conformance test for the UCA
45394607 1750
10d7ec48 1751L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 1752
10d7ec48 1753L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 1754
91ae00cb 1755=item Hangul Syllable Type
0116f5dc 1756
10d7ec48 1757L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 1758
91ae00cb 1759=item Unicode Normalization Forms - UAX #15
a7fbee98 1760
91ae00cb 1761L<http://www.unicode.org/reports/tr15/>
a7fbee98 1762
a7fbee98 1763=item L<Unicode::Normalize>
45394607 1764
45394607 1765=back
1766
1767=cut