Yitzchak points out that this function argument is NULLOK
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
10d7ec48 15no warnings 'utf8';
16
0d50d293 17our $VERSION = '0.51';
45394607 18our $PACKAGE = __PACKAGE__;
19
e7f779c8 20my @Path = qw(Unicode Collate);
21my $KeyFile = "allkeys.txt";
45394607 22
4d36a948 23# Perl's boolean
24use constant TRUE => 1;
25use constant FALSE => "";
26use constant NOMATCHPOS => -1;
27
28# A coderef to get combining class imported from Unicode::Normalize
29# (i.e. \&Unicode::Normalize::getCombinClass).
30# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 31my $CVgetCombinClass;
4d36a948 32
9f1f04a1 33# Supported Levels
34use constant MinLevel => 1;
35use constant MaxLevel => 4;
36
4d36a948 37# Minimum weights at level 2 and 3, respectively
9f1f04a1 38use constant Min2Wt => 0x20;
39use constant Min3Wt => 0x02;
4d36a948 40
41# Shifted weight at 4th level
9f1f04a1 42use constant Shift4Wt => 0xFFFF;
4d36a948 43
4d36a948 44# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45# PROBLEM: The Default Unicode Collation Element Table
46# has weights over 0xFFFF at the 4th level.
47# The tie-breaking in the variable weights
48# other than "shift" (as well as "shift-trimmed") is unreliable.
49use constant VCE_TEMPLATE => 'Cn4';
50
4d36a948 51# A sort key: 16-bit weights
52# See also the PROBLEM on VCE_TEMPLATE above.
53use constant KEY_TEMPLATE => 'n*';
54
55# Level separator in a sort key:
56# i.e. pack(KEY_TEMPLATE, 0)
57use constant LEVEL_SEP => "\0\0";
58
59# As Unicode code point separator for hash keys.
60# A joined code point string (denoted by JCPS below)
61# like "65;768" is used for internal processing
62# instead of Perl's Unicode string like "\x41\x{300}",
63# as the native code point is different from the Unicode code point
64# on EBCDIC platform.
65# This character must not be included in any stringified
66# representation of an integer.
67use constant CODE_SEP => ';';
68
69# boolean values of variable weights
0116f5dc 70use constant NON_VAR => 0; # Non-Variable character
71use constant VAR => 1; # Variable character
3164dd77 72
91ae00cb 73# specific code points
74use constant Hangul_LBase => 0x1100;
75use constant Hangul_LIni => 0x1100;
76use constant Hangul_LFin => 0x1159;
77use constant Hangul_LFill => 0x115F;
78use constant Hangul_VBase => 0x1161;
3756e7ca 79use constant Hangul_VIni => 0x1160; # from Vowel Filler
91ae00cb 80use constant Hangul_VFin => 0x11A2;
3756e7ca 81use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
91ae00cb 82use constant Hangul_TIni => 0x11A8;
83use constant Hangul_TFin => 0x11F9;
84use constant Hangul_TCount => 28;
85use constant Hangul_NCount => 588;
86use constant Hangul_SBase => 0xAC00;
87use constant Hangul_SIni => 0xAC00;
88use constant Hangul_SFin => 0xD7A3;
89use constant CJK_UidIni => 0x4E00;
90use constant CJK_UidFin => 0x9FA5;
3756e7ca 91use constant CJK_UidF41 => 0x9FBB;
91ae00cb 92use constant CJK_ExtAIni => 0x3400;
93use constant CJK_ExtAFin => 0x4DB5;
94use constant CJK_ExtBIni => 0x20000;
95use constant CJK_ExtBFin => 0x2A6D6;
96use constant BMP_Max => 0xFFFF;
97
4d36a948 98# Logical_Order_Exception in PropList.txt
e7f779c8 99my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 100
3756e7ca 101sub UCA_Version { "14" }
a7fbee98 102
3756e7ca 103sub Base_Unicode_Version { "4.1.0" }
a7fbee98 104
9f1f04a1 105######
106
9f1f04a1 107sub pack_U {
ae6aa562 108 return pack('U*', @_);
9f1f04a1 109}
110
111sub unpack_U {
3756e7ca 112 return unpack('U*', shift(@_).pack('U*'));
9f1f04a1 113}
114
115######
116
91ae00cb 117my (%VariableOK);
118@VariableOK{ qw/
0116f5dc 119 blanked non-ignorable shifted shift-trimmed
91ae00cb 120 / } = (); # keys lowercased
0116f5dc 121
122our @ChangeOK = qw/
123 alternate backwards level normalization rearrange
124 katakana_before_hiragana upper_before_lower
125 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 126 hangul_terminator variable
0116f5dc 127 /;
128
129our @ChangeNG = qw/
91ae00cb 130 entry mapping table maxlength
131 ignoreChar ignoreName undefChar undefName variableTable
0116f5dc 132 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
e7f779c8 133 derivCode normCode rearrangeHash
9f1f04a1 134 backwardsFlag
0116f5dc 135 /;
9f1f04a1 136# The hash key 'ignored' is deleted at v 0.21.
137# The hash key 'isShift' is deleted at v 0.23.
138# The hash key 'combining' is deleted at v 0.24.
91ae00cb 139# The hash key 'entries' is deleted at v 0.30.
e7f779c8 140# The hash key 'L3_ignorable' is deleted at v 0.40.
91ae00cb 141
142sub version {
143 my $self = shift;
144 return $self->{versionTable} || 'unknown';
145}
0116f5dc 146
147my (%ChangeOK, %ChangeNG);
148@ChangeOK{ @ChangeOK } = ();
149@ChangeNG{ @ChangeNG } = ();
150
151sub change {
152 my $self = shift;
153 my %hash = @_;
154 my %old;
91ae00cb 155 if (exists $hash{variable} && exists $hash{alternate}) {
156 delete $hash{alternate};
157 }
158 elsif (!exists $hash{variable} && exists $hash{alternate}) {
159 $hash{variable} = $hash{alternate};
160 }
0116f5dc 161 foreach my $k (keys %hash) {
162 if (exists $ChangeOK{$k}) {
163 $old{$k} = $self->{$k};
164 $self->{$k} = $hash{$k};
165 }
166 elsif (exists $ChangeNG{$k}) {
167 croak "change of $k via change() is not allowed!";
168 }
169 # else => ignored
170 }
3756e7ca 171 $self->checkCollator();
0116f5dc 172 return wantarray ? %old : $self;
173}
a7fbee98 174
9f1f04a1 175sub _checkLevel {
176 my $level = shift;
abd1ec54 177 my $key = shift; # 'level' or 'backwards'
178 MinLevel <= $level or croak sprintf
179 "Illegal level %d (in value for key '%s') lower than %d.",
180 $level, $key, MinLevel;
181 $level <= MaxLevel or croak sprintf
182 "Unsupported level %d (in value for key '%s') higher than %d.",
183 $level, $key, MaxLevel;
9f1f04a1 184}
185
91ae00cb 186my %DerivCode = (
187 8 => \&_derivCE_8,
188 9 => \&_derivCE_9,
189 11 => \&_derivCE_9, # 11 == 9
3756e7ca 190 14 => \&_derivCE_14,
91ae00cb 191);
192
0116f5dc 193sub checkCollator {
194 my $self = shift;
9f1f04a1 195 _checkLevel($self->{level}, "level");
a7fbee98 196
91ae00cb 197 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
198 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 199
91ae00cb 200 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 201 $self->{alternateTable} || 'shifted';
91ae00cb 202 $self->{variable} = $self->{alternate} = lc($self->{variable});
203 exists $VariableOK{ $self->{variable} }
3756e7ca 204 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
0116f5dc 205
9f1f04a1 206 if (! defined $self->{backwards}) {
207 $self->{backwardsFlag} = 0;
208 }
209 elsif (! ref $self->{backwards}) {
210 _checkLevel($self->{backwards}, "backwards");
211 $self->{backwardsFlag} = 1 << $self->{backwards};
212 }
213 else {
214 my %level;
215 $self->{backwardsFlag} = 0;
216 for my $b (@{ $self->{backwards} }) {
217 _checkLevel($b, "backwards");
218 $level{$b} = 1;
219 }
220 for my $v (sort keys %level) {
221 $self->{backwardsFlag} += 1 << $v;
222 }
223 }
0116f5dc 224
91ae00cb 225 defined $self->{rearrange} or $self->{rearrange} = [];
226 ref $self->{rearrange}
227 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc 228
229 # keys of $self->{rearrangeHash} are $self->{rearrange}.
230 $self->{rearrangeHash} = undef;
231
232 if (@{ $self->{rearrange} }) {
233 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
234 }
235
236 $self->{normCode} = undef;
a7fbee98 237
238 if (defined $self->{normalization}) {
239 eval { require Unicode::Normalize };
91ae00cb 240 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 241
91ae00cb 242 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 243
91ae00cb 244 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
245 $self->{normCode} = \&Unicode::Normalize::NFD;
246 }
247 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f 248 my $norm = $self->{normalization};
249 $self->{normCode} = sub {
1d2654e1 250 Unicode::Normalize::normalize($norm, shift);
251 };
06c8fc8f 252 eval { $self->{normCode}->("") }; # try
253 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 254 }
a7fbee98 255 }
0116f5dc 256 return;
257}
258
259sub new
260{
261 my $class = shift;
262 my $self = bless { @_ }, $class;
45394607 263
a7fbee98 264 # If undef is passed explicitly, no file is read.
0116f5dc 265 $self->{table} = $KeyFile if ! exists $self->{table};
3756e7ca 266 $self->read_table() if defined $self->{table};
905aa9f0 267
a7fbee98 268 if ($self->{entry}) {
e7f779c8 269 while ($self->{entry} =~ /([^\n]+)/g) {
270 $self->parseEntry($1);
271 }
a7fbee98 272 }
905aa9f0 273
9f1f04a1 274 $self->{level} ||= MaxLevel;
0116f5dc 275 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 276
abd1ec54 277 $self->{overrideHangul} = FALSE
0116f5dc 278 if ! exists $self->{overrideHangul};
abd1ec54 279 $self->{overrideCJK} = FALSE
0116f5dc 280 if ! exists $self->{overrideCJK};
06c8fc8f 281 $self->{normalization} = 'NFD'
0116f5dc 282 if ! exists $self->{normalization};
3756e7ca 283 $self->{rearrange} = $self->{rearrangeTable} ||
284 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
a7fbee98 285 if ! exists $self->{rearrange};
0116f5dc 286 $self->{backwards} = $self->{backwardsTable}
287 if ! exists $self->{backwards};
a7fbee98 288
3756e7ca 289 $self->checkCollator();
a7fbee98 290
291 return $self;
292}
905aa9f0 293
294sub read_table {
a7fbee98 295 my $self = shift;
a7fbee98 296
e7f779c8 297 my($f, $fh);
298 foreach my $d (@INC) {
299 $f = File::Spec->catfile($d, @Path, $self->{table});
300 last if open($fh, $f);
301 $f = undef;
302 }
0d50d293 303 if (!defined $f) {
304 $f = File::Spec->catfile(@Path, $self->{table});
305 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
306 }
a7fbee98 307
e7f779c8 308 while (<$fh>) {
a7fbee98 309 next if /^\s*#/;
abd1ec54 310 unless (s/^\s*\@//) {
311 $self->parseEntry($_);
a7fbee98 312 next;
313 }
abd1ec54 314
315 if (/^version\s*(\S*)/) {
316 $self->{versionTable} ||= $1;
317 }
318 elsif (/^variable\s+(\S*)/) { # since UTS #10-9
319 $self->{variableTable} ||= $1;
320 }
321 elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
322 $self->{alternateTable} ||= $1;
323 }
324 elsif (/^backwards\s+(\S*)/) {
325 push @{ $self->{backwardsTable} }, $1;
326 }
327 elsif (/^forwards\s+(\S*)/) { # parhaps no use
328 push @{ $self->{forwardsTable} }, $1;
329 }
330 elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
331 push @{ $self->{rearrangeTable} }, _getHexArray($1);
332 }
45394607 333 }
e7f779c8 334 close $fh;
45394607 335}
336
905aa9f0 337
45394607 338##
339## get $line, parse it, and write an entry in $self
340##
341sub parseEntry
342{
a7fbee98 343 my $self = shift;
344 my $line = shift;
4d36a948 345 my($name, $entry, @uv, @key);
a7fbee98 346
347 return if $line !~ /^\s*[0-9A-Fa-f]/;
348
349 # removes comment and gets name
350 $name = $1
351 if $line =~ s/[#%]\s*(.*)//;
352 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
353
354 # gets element
355 my($e, $k) = split /;/, $line;
356 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
357 if ! $k;
358
4d36a948 359 @uv = _getHexArray($e);
360 return if !@uv;
361
362 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 363
4d36a948 364 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 365 my $ele = pack_U(@uv);
a7fbee98 366
4d36a948 367 # regarded as if it were not entried in the table
368 return
369 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 370
4d36a948 371 # replaced as completely ignorable
372 $k = '[.0000.0000.0000.0000]'
373 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 374 }
0116f5dc 375
4d36a948 376 # replaced as completely ignorable
377 $k = '[.0000.0000.0000.0000]'
378 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
379
4c843366 380 my $is_L3_ignorable = TRUE;
4d36a948 381
caffd4cf 382 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
383 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948 384 my @wt = _getHexArray($arr);
385 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 386 $is_L3_ignorable = FALSE
3756e7ca 387 if $wt[0] || $wt[1] || $wt[2];
388 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
389 # is completely ignorable.
4c843366 390 # For expansion, an entry $is_L3_ignorable
391 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 392 }
caffd4cf 393
e7f779c8 394 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
caffd4cf 395
91ae00cb 396 if (@uv > 1) {
397 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
398 and $self->{maxlength}{$uv[0]} = @uv;
399 }
45394607 400}
401
9f1f04a1 402
45394607 403##
abd1ec54 404## VCE = _varCE(variable term, VCE)
45394607 405##
abd1ec54 406sub _varCE
45394607 407{
abd1ec54 408 my $vbl = shift;
409 my $vce = shift;
410 if ($vbl eq 'non-ignorable') {
411 return $vce;
412 }
413 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
414
415 if ($var) {
416 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
417 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
418 }
419 elsif ($vbl eq 'blanked') {
420 return $vce;
421 }
422 else {
423 return pack(VCE_TEMPLATE, $var, @wt[0..2],
424 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
425 }
45394607 426}
427
45394607 428sub viewSortKey
429{
a7fbee98 430 my $self = shift;
9f1f04a1 431 $self->visualizeSortKey($self->getSortKey(@_));
432}
0116f5dc 433
9f1f04a1 434sub visualizeSortKey
435{
436 my $self = shift;
437 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 438
9f1f04a1 439 if ($self->{UCA_Version} <= 8) {
0116f5dc 440 $view =~ s/ ?0000 ?/|/g;
441 } else {
442 $view =~ s/\b0000\b/|/g;
443 }
a7fbee98 444 return "[$view]";
45394607 445}
446
d16e9e3d 447
45394607 448##
91ae00cb 449## arrayref of JCPS = splitEnt(string to be collated)
450## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 451##
91ae00cb 452sub splitEnt
45394607 453{
a7fbee98 454 my $self = shift;
4d36a948 455 my $wLen = $_[1];
456
a7fbee98 457 my $code = $self->{preprocess};
0116f5dc 458 my $norm = $self->{normCode};
91ae00cb 459 my $map = $self->{mapping};
a7fbee98 460 my $max = $self->{maxlength};
461 my $reH = $self->{rearrangeHash};
3756e7ca 462 my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
a7fbee98 463
4d36a948 464 my ($str, @buf);
a7fbee98 465
4d36a948 466 if ($wLen) {
467 $code and croak "Preprocess breaks character positions. "
468 . "Don't use with index(), match(), etc.";
469 $norm and croak "Normalization breaks character positions. "
470 . "Don't use with index(), match(), etc.";
471 $str = $_[0];
472 }
473 else {
474 $str = $_[0];
475 $str = &$code($str) if ref $code;
476 $str = &$norm($str) if ref $norm;
477 }
a7fbee98 478
4d36a948 479 # get array of Unicode code point of string.
9f1f04a1 480 my @src = unpack_U($str);
4d36a948 481
482 # rearrangement:
483 # Character positions are not kept if rearranged,
484 # then neglected if $wLen is true.
485 if ($reH && ! $wLen) {
a7fbee98 486 for (my $i = 0; $i < @src; $i++) {
487 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
488 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
489 $i++;
490 }
491 }
45394607 492 }
45394607 493
3756e7ca 494 # remove a code point marked as a completely ignorable.
abd1ec54 495 for (my $i = 0; $i < @src; $i++) {
496 $src[$i] = undef
e7f779c8 497 if _isIllegal($src[$i]) || ($ver9 &&
498 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
0116f5dc 499 }
500
a7fbee98 501 for (my $i = 0; $i < @src; $i++) {
91ae00cb 502 my $jcps = $src[$i];
3756e7ca 503
504 # skip removed code point
505 if (! defined $jcps) {
506 if ($wLen && @buf) {
507 $buf[-1][2] = $i + 1;
508 }
509 next;
510 }
511
abd1ec54 512 my $i_orig = $i;
4d36a948 513
3756e7ca 514 # find contraction
515 if ($max->{$jcps}) {
91ae00cb 516 my $temp_jcps = $jcps;
517 my $jcpsLen = 1;
518 my $maxLen = $max->{$jcps};
4d36a948 519
91ae00cb 520 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 521 next if ! defined $src[$p];
91ae00cb 522 $temp_jcps .= CODE_SEP . $src[$p];
523 $jcpsLen++;
524 if ($map->{$temp_jcps}) {
525 $jcps = $temp_jcps;
4d36a948 526 $i = $p;
527 }
528 }
4d36a948 529
06c8fc8f 530 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
531 # This process requires Unicode::Normalize.
91ae00cb 532 # If "normalization" is undef, here should be skipped *always*
06c8fc8f 533 # (in spite of bool value of $CVgetCombinClass),
534 # since canonical ordering cannot be expected.
535 # Blocked combining character should not be contracted.
536
537 if ($self->{normalization})
538 # $self->{normCode} is false in the case of "prenormalized".
539 {
540 my $preCC = 0;
541 my $curCC = 0;
542
543 for (my $p = $i + 1; $p < @src; $p++) {
544 next if ! defined $src[$p];
545 $curCC = $CVgetCombinClass->($src[$p]);
546 last unless $curCC;
547 my $tail = CODE_SEP . $src[$p];
91ae00cb 548 if ($preCC != $curCC && $map->{$jcps.$tail}) {
549 $jcps .= $tail;
06c8fc8f 550 $src[$p] = undef;
551 } else {
552 $preCC = $curCC;
553 }
4d36a948 554 }
a7fbee98 555 }
a7fbee98 556 }
557
3756e7ca 558 # skip completely ignorable
559 if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
560 if ($wLen && @buf) {
561 $buf[-1][2] = $i + 1;
a7fbee98 562 }
3756e7ca 563 next;
a7fbee98 564 }
4d36a948 565
91ae00cb 566 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 567 }
4d36a948 568 return \@buf;
d16e9e3d 569}
45394607 570
d16e9e3d 571
572##
abd1ec54 573## list of VCE = getWt(JCPS)
d16e9e3d 574##
575sub getWt
576{
a7fbee98 577 my $self = shift;
91ae00cb 578 my $u = shift;
abd1ec54 579 my $vbl = $self->{variable};
91ae00cb 580 my $map = $self->{mapping};
0116f5dc 581 my $der = $self->{derivCode};
a7fbee98 582
91ae00cb 583 return if !defined $u;
abd1ec54 584 return map(_varCE($vbl, $_), @{ $map->{$u} })
91ae00cb 585 if $map->{$u};
a7fbee98 586
91ae00cb 587 # JCPS must not be a contraction, then it's a code point.
588 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1 589 my $hang = $self->{overrideHangul};
590 my @hangulCE;
591 if ($hang) {
592 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
593 }
594 elsif (!defined $hang) {
595 @hangulCE = $der->($u);
596 }
597 else {
598 my $max = $self->{maxlength};
599 my @decH = _decompHangul($u);
600
601 if (@decH == 2) {
602 my $contract = join(CODE_SEP, @decH);
91ae00cb 603 @decH = ($contract) if $map->{$contract};
1d2654e1 604 } else { # must be <@decH == 3>
605 if ($max->{$decH[0]}) {
606 my $contract = join(CODE_SEP, @decH);
91ae00cb 607 if ($map->{$contract}) {
1d2654e1 608 @decH = ($contract);
609 } else {
610 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 611 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1 612 }
613 # even if V's ignorable, LT contraction is not supported.
614 # If such a situatution were required, NFD should be used.
615 }
616 if (@decH == 3 && $max->{$decH[1]}) {
617 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 618 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1 619 }
620 }
621
622 @hangulCE = map({
91ae00cb 623 $map->{$_} ? @{ $map->{$_} } : $der->($_);
1d2654e1 624 } @decH);
625 }
abd1ec54 626 return map _varCE($vbl, $_), @hangulCE;
a7fbee98 627 }
3756e7ca 628 elsif (_isUIdeo($u, $self->{UCA_Version})) {
1d2654e1 629 my $cjk = $self->{overrideCJK};
abd1ec54 630 return map _varCE($vbl, $_),
0116f5dc 631 $cjk
4d36a948 632 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
3756e7ca 633 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
634 ? _uideoCE_8($u)
0116f5dc 635 : $der->($u);
a7fbee98 636 }
637 else {
abd1ec54 638 return map _varCE($vbl, $_), $der->($u);
a7fbee98 639 }
d16e9e3d 640}
641
d16e9e3d 642
643##
644## string sortkey = getSortKey(string arg)
645##
646sub getSortKey
647{
a7fbee98 648 my $self = shift;
649 my $lev = $self->{level};
91ae00cb 650 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
3756e7ca 651 my $v2i = $self->{UCA_Version} >= 9 &&
652 $self->{variable} ne 'non-ignorable';
91ae00cb 653
abd1ec54 654 my @buf; # weight arrays
91ae00cb 655 if ($self->{hangul_terminator}) {
656 my $preHST = '';
657 foreach my $jcps (@$rEnt) {
658 # weird things like VL, TL-contraction are not considered!
659 my $curHST = '';
660 foreach my $u (split /;/, $jcps) {
661 $curHST .= getHST($u);
662 }
663 if ($preHST && !$curHST || # hangul before non-hangul
664 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
665 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
666 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
0116f5dc 667
abd1ec54 668 push @buf, $self->getWtHangulTerm();
91ae00cb 669 }
670 $preHST = $curHST;
671
abd1ec54 672 push @buf, $self->getWt($jcps);
91ae00cb 673 }
674 $preHST # end at hangul
abd1ec54 675 and push @buf, $self->getWtHangulTerm();
91ae00cb 676 }
677 else {
678 foreach my $jcps (@$rEnt) {
abd1ec54 679 push @buf, $self->getWt($jcps);
91ae00cb 680 }
681 }
682
abd1ec54 683 # make sort key
684 my @ret = ([],[],[],[]);
685 my $last_is_variable;
686
687 foreach my $vwt (@buf) {
688 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
3756e7ca 689
690 # "Ignorable (L1, L2) after Variable" since track. v. 9
abd1ec54 691 if ($v2i) {
692 if ($var) {
693 $last_is_variable = TRUE;
694 }
695 elsif (!$wt[0]) { # ignorable
4d36a948 696 next if $last_is_variable;
abd1ec54 697 }
698 else {
699 $last_is_variable = FALSE;
0116f5dc 700 }
701 }
abd1ec54 702 foreach my $v (0..$lev-1) {
703 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
a7fbee98 704 }
705 }
45394607 706
a7fbee98 707 # modification of tertiary weights
708 if ($self->{upper_before_lower}) {
709 foreach (@{ $ret[2] }) {
710 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
711 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
712 elsif ($_ == 0x1C) { $_ += 1 } # square upper
713 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
714 }
45394607 715 }
a7fbee98 716 if ($self->{katakana_before_hiragana}) {
717 foreach (@{ $ret[2] }) {
718 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
719 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
720 }
45394607 721 }
9f1f04a1 722
723 if ($self->{backwardsFlag}) {
724 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
725 if ($self->{backwardsFlag} & (1 << $v)) {
726 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
727 }
728 }
729 }
730
4d36a948 731 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 732}
733
734
735##
d16e9e3d 736## int compare = cmp(string a, string b)
45394607 737##
5398038e 738sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
739sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
740sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
741sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
742sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
743sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
744sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607 745
746##
d16e9e3d 747## list[strings] sorted = sort(list[strings] arg)
45394607 748##
a7fbee98 749sub sort {
750 my $obj = shift;
751 return
752 map { $_->[1] }
753 sort{ $a->[0] cmp $b->[0] }
754 map [ $obj->getSortKey($_), $_ ], @_;
45394607 755}
756
0116f5dc 757
3756e7ca 758sub _derivCE_14 {
759 my $u = shift;
760 my $base =
761 (CJK_UidIni <= $u && $u <= CJK_UidF41)
762 ? 0xFB40 : # CJK
763 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
764 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
765 ? 0xFB80 # CJK ext.
766 : 0xFBC0; # others
767
768 my $aaaa = $base + ($u >> 15);
769 my $bbbb = ($u & 0x7FFF) | 0x8000;
770 return
771 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
772 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
773}
774
4d36a948 775sub _derivCE_9 {
0116f5dc 776 my $u = shift;
777 my $base =
91ae00cb 778 (CJK_UidIni <= $u && $u <= CJK_UidFin)
4d36a948 779 ? 0xFB40 : # CJK
91ae00cb 780 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
781 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
4d36a948 782 ? 0xFB80 # CJK ext.
783 : 0xFBC0; # others
0116f5dc 784
785 my $aaaa = $base + ($u >> 15);
786 my $bbbb = ($u & 0x7FFF) | 0x8000;
787 return
9f1f04a1 788 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
789 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc 790}
791
4d36a948 792sub _derivCE_8 {
0116f5dc 793 my $code = shift;
794 my $aaaa = 0xFF80 + ($code >> 15);
795 my $bbbb = ($code & 0x7FFF) | 0x8000;
796 return
4d36a948 797 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
798 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607 799}
800
3756e7ca 801sub _uideoCE_8 {
802 my $u = shift;
803 return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
804}
805
806sub _isUIdeo {
807 my ($u, $uca_vers) = @_;
808 return(
809 (CJK_UidIni <= $u &&
810 ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
811 ||
812 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
813 ||
814 (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
815 );
816}
817
91ae00cb 818
abd1ec54 819sub getWtHangulTerm {
91ae00cb 820 my $self = shift;
abd1ec54 821 return _varCE($self->{variable},
822 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
91ae00cb 823}
824
825
45394607 826##
827## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
828##
a7fbee98 829sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 830
a7fbee98 831#
4d36a948 832# $code *must* be in Hangul syllable.
a7fbee98 833# Check it before you enter here.
834#
5398038e 835sub _decompHangul {
836 my $code = shift;
3756e7ca 837 my $si = $code - Hangul_SBase;
838 my $li = int( $si / Hangul_NCount);
839 my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
840 my $ti = $si % Hangul_TCount;
5398038e 841 return (
3756e7ca 842 Hangul_LBase + $li,
843 Hangul_VBase + $vi,
844 $ti ? (Hangul_TBase + $ti) : (),
5398038e 845 );
45394607 846}
847
10d7ec48 848sub _isIllegal {
4d36a948 849 my $code = shift;
850 return ! defined $code # removed
851 || ($code < 0 || 0x10FFFF < $code) # out of range
852 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
853 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
854 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
855 ;
856}
857
91ae00cb 858# Hangul Syllable Type
859sub getHST {
860 my $u = shift;
861 return
862 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
863 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
864 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
865 Hangul_SIni <= $u && $u <= Hangul_SFin ?
866 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
867}
868
4d36a948 869
870##
871## bool _nonIgnorAtLevel(arrayref weights, int level)
872##
873sub _nonIgnorAtLevel($$)
874{
875 my $wt = shift;
876 return if ! defined $wt;
877 my $lv = shift;
9f1f04a1 878 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948 879}
880
881##
882## bool _eqArray(
883## arrayref of arrayref[weights] source,
884## arrayref of arrayref[weights] substr,
885## int level)
886## * comparison of graphemes vs graphemes.
887## @$source >= @$substr must be true (check it before call this);
888##
889sub _eqArray($$$)
890{
891 my $source = shift;
892 my $substr = shift;
893 my $lev = shift;
894
895 for my $g (0..@$substr-1){
896 # Do the $g'th graphemes have the same number of AV weigths?
897 return if @{ $source->[$g] } != @{ $substr->[$g] };
898
899 for my $w (0..@{ $substr->[$g] }-1) {
900 for my $v (0..$lev-1) {
901 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
902 }
903 }
904 }
905 return 1;
906}
907
908##
909## (int position, int length)
910## int position = index(string, substring, position, [undoc'ed grobal])
911##
912## With "grobal" (only for the list context),
913## returns list of arrayref[position, length].
914##
915sub index
916{
91ae00cb 917 my $self = shift;
918 my $str = shift;
919 my $len = length($str);
920 my $subE = $self->splitEnt(shift);
921 my $pos = @_ ? shift : 0;
922 $pos = 0 if $pos < 0;
923 my $grob = shift;
924
925 my $lev = $self->{level};
3756e7ca 926 my $v2i = $self->{UCA_Version} >= 9 &&
927 $self->{variable} ne 'non-ignorable';
91ae00cb 928
929 if (! @$subE) {
4d36a948 930 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
931 return $grob
932 ? map([$_, 0], $temp..$len)
933 : wantarray ? ($temp,0) : $temp;
934 }
abd1ec54 935 $len < $pos
936 and return wantarray ? () : NOMATCHPOS;
91ae00cb 937 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54 938 @$strE
939 or return wantarray ? () : NOMATCHPOS;
940
4d36a948 941 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
942
abd1ec54 943 my $last_is_variable;
944 for my $vwt (map $self->getWt($_), @$subE) {
945 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
946 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 947
3756e7ca 948 # "Ignorable (L1, L2) after Variable" since track. v. 9
949 if ($v2i) {
abd1ec54 950 if ($var) {
951 $last_is_variable = TRUE;
952 }
953 elsif (!$wt[0]) { # ignorable
4d36a948 954 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54 955 }
956 else {
957 $last_is_variable = FALSE;
4d36a948 958 }
959 }
960
abd1ec54 961 if (@subWt && !$var && !$wt[0]) {
962 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
4d36a948 963 } else {
abd1ec54 964 push @subWt, [ \@wt ];
4d36a948 965 }
966 }
967
968 my $count = 0;
91ae00cb 969 my $end = @$strE - 1;
4d36a948 970
abd1ec54 971 $last_is_variable = FALSE; # reuse
4d36a948 972 for (my $i = 0; $i <= $end; ) { # no $i++
973 my $found_base = 0;
974
975 # fetch a grapheme
976 while ($i <= $end && $found_base == 0) {
abd1ec54 977 for my $vwt ($self->getWt($strE->[$i][0])) {
978 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
979 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 980
3756e7ca 981 # "Ignorable (L1, L2) after Variable" since track. v. 9
982 if ($v2i) {
abd1ec54 983 if ($var) {
984 $last_is_variable = TRUE;
985 }
986 elsif (!$wt[0]) { # ignorable
4d36a948 987 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54 988 }
989 else {
990 $last_is_variable = FALSE;
4d36a948 991 }
992 }
993
abd1ec54 994 if (@strWt && !$var && !$wt[0]) {
995 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 996 $finPos[-1] = $strE->[$i][2];
4d36a948 997 } elsif ($to_be_pushed) {
abd1ec54 998 push @strWt, [ \@wt ];
91ae00cb 999 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 1000 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 1001 push @finPos, $strE->[$i][2];
4d36a948 1002 $found_base++;
1003 }
1004 # else ===> no-op
1005 }
1006 $i++;
1007 }
1008
1009 # try to match
1010 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1011 if ($iniPos[0] != NOMATCHPOS &&
1012 $finPos[$#subWt] != NOMATCHPOS &&
1013 _eqArray(\@strWt, \@subWt, $lev)) {
1014 my $temp = $iniPos[0] + $pos;
1015
1016 if ($grob) {
1017 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1018 splice @strWt, 0, $#subWt;
1019 splice @iniPos, 0, $#subWt;
1020 splice @finPos, 0, $#subWt;
1021 }
1022 else {
1023 return wantarray
1024 ? ($temp, $finPos[$#subWt] - $iniPos[0])
1025 : $temp;
1026 }
1027 }
1028 shift @strWt;
1029 shift @iniPos;
1030 shift @finPos;
1031 }
1032 }
1033
1034 return $grob
1035 ? @g_ret
1036 : wantarray ? () : NOMATCHPOS;
1037}
1038
1039##
1040## scalarref to matching part = match(string, substring)
1041##
1042sub match
1043{
1044 my $self = shift;
1045 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1046 my $temp = substr($_[0], $pos, $len);
1047 return wantarray ? $temp : \$temp;
1048 # An lvalue ref \substr should be avoided,
1049 # since its value is affected by modification of its referent.
1050 }
1051 else {
1052 return;
1053 }
1054}
1055
1056##
1057## arrayref matching parts = gmatch(string, substring)
1058##
1059sub gmatch
1060{
1061 my $self = shift;
1062 my $str = shift;
1063 my $sub = shift;
1064 return map substr($str, $_->[0], $_->[1]),
1065 $self->index($str, $sub, 0, 'g');
1066}
1067
1068##
1069## bool subst'ed = subst(string, substring, replace)
1070##
1071sub subst
1072{
1073 my $self = shift;
1074 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1075
1076 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1077 if ($code) {
1078 my $mat = substr($_[0], $pos, $len);
1079 substr($_[0], $pos, $len, $code->($mat));
1080 } else {
1081 substr($_[0], $pos, $len, $_[2]);
1082 }
1083 return TRUE;
1084 }
1085 else {
1086 return FALSE;
1087 }
1088}
1089
1090##
1091## int count = gsubst(string, substring, replace)
1092##
1093sub gsubst
1094{
1095 my $self = shift;
1096 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1097 my $cnt = 0;
1098
1099 # Replacement is carried out from the end, then use reverse.
1100 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1101 if ($code) {
1102 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1103 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1104 } else {
1105 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1106 }
1107 $cnt++;
1108 }
1109 return $cnt;
1110}
1111
45394607 11121;
1113__END__
1114
1115=head1 NAME
1116
a7fbee98 1117Unicode::Collate - Unicode Collation Algorithm
45394607 1118
1119=head1 SYNOPSIS
1120
1121 use Unicode::Collate;
1122
1123 #construct
5398038e 1124 $Collator = Unicode::Collate->new(%tailoring);
45394607 1125
1126 #sort
5398038e 1127 @sorted = $Collator->sort(@not_sorted);
45394607 1128
1129 #compare
a7fbee98 1130 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1131
91ae00cb 1132 # If %tailoring is false (i.e. empty),
1133 # $Collator should do the default collation.
1134
45394607 1135=head1 DESCRIPTION
1136
3756e7ca 1137This module is an implementation of Unicode Technical Standard #10
1138(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
4d36a948 1139
45394607 1140=head2 Constructor and Tailoring
1141
d16e9e3d 1142The C<new> method returns a collator object.
1143
5398038e 1144 $Collator = Unicode::Collate->new(
0116f5dc 1145 UCA_Version => $UCA_Version,
91ae00cb 1146 alternate => $alternate, # deprecated: use of 'variable' is recommended.
45394607 1147 backwards => $levelNumber, # or \@levelNumbers
1148 entry => $element,
91ae00cb 1149 hangul_terminator => $term_primary_weight,
45394607 1150 ignoreName => qr/$ignoreName/,
1151 ignoreChar => qr/$ignoreChar/,
1152 katakana_before_hiragana => $bool,
1153 level => $collationLevel,
91ae00cb 1154 normalization => $normalization_form,
45394607 1155 overrideCJK => \&overrideCJK,
1156 overrideHangul => \&overrideHangul,
1157 preprocess => \&preprocess,
1158 rearrange => \@charList,
1159 table => $filename,
1160 undefName => qr/$undefName/,
1161 undefChar => qr/$undefChar/,
1162 upper_before_lower => $bool,
91ae00cb 1163 variable => $variable,
45394607 1164 );
45394607 1165
1166=over 4
1167
0116f5dc 1168=item UCA_Version
1169
3756e7ca 1170If the tracking version number of UCA is given,
1171behavior of that tracking version is emulated on collating.
0116f5dc 1172If omitted, the return value of C<UCA_Version()> is used.
3756e7ca 1173C<UCA_Version()> should return the latest tracking version supported.
1174
1175The supported tracking version: 8, 9, 11, or 14.
1176
0d50d293 1177 UCA Unicode Standard DUCET (@version)
1178 ---------------------------------------------------
1179 8 3.1 3.0.1 (3.0.1d9)
1180 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1181 11 4.0 4.0.0 (4.0.0)
1182 14 4.1.0 4.1.0 (4.1.0)
3756e7ca 1183
1184Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1185
1186=item alternate
0116f5dc 1187
3756e7ca 1188-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
0116f5dc 1189
3756e7ca 1190For backward compatibility, C<alternate> (old name) can be used
1191as an alias for C<variable>.
0116f5dc 1192
45394607 1193=item backwards
1194
4d36a948 1195-- see 3.1.2 French Accents, UTS #10.
45394607 1196
1197 backwards => $levelNumber or \@levelNumbers
1198
1199Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1200If omitted, forwards at all the levels.
1201
1202=item entry
1203
4d36a948 1204-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1205
91ae00cb 1206If the same character (or a sequence of characters) exists
1207in the collation element table through C<table>,
1208mapping to collation elements is overrided.
1209If it does not exist, the mapping is defined additionally.
45394607 1210
abd1ec54 1211 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
12120063 0068 ; [.0E6A.0020.0002.0063] # ch
12130043 0068 ; [.0E6A.0020.0007.0043] # Ch
12140043 0048 ; [.0E6A.0020.0008.0043] # CH
1215006C 006C ; [.0F4C.0020.0002.006C] # ll
1216004C 006C ; [.0F4C.0020.0007.004C] # Ll
1217004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8 121800F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1219006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
122000D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1221004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54 1222ENTRY
1223
1224 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
122500E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
122600C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1227ENTRY
45394607 1228
4d36a948 1229B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54 1230B<must> be a Unicode code point (defined as hexadecimal),
1231but not a native code point.
4d36a948 1232So C<0063> must always denote C<U+0063>,
1233but not a character of C<"\x63">.
1234
abd1ec54 1235Weighting may vary depending on collation element table.
1236So ensure the weights defined in C<entry> will be consistent with
1237those in the collation element table loaded via C<table>.
1238
1239In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1240and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1241(as a value between C<0E60> and C<0E6D>)
1242makes ordering as C<C E<lt> CH E<lt> D>.
1243Exactly speaking DUCET already has some characters between C<C> and C<D>:
1244C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1245C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1246and C<c-curl> (C<U+0255>) with C<0E69>.
1247Then primary weight C<0E6A> for C<CH> makes C<CH>
1248ordered between C<c-curl> and C<D>.
1249
91ae00cb 1250=item hangul_terminator
1251
3756e7ca 1252-- see 7.1.4 Trailing Weights, UTS #10.
91ae00cb 1253
1254If a true value is given (non-zero but should be positive),
1255it will be added as a terminator primary weight to the end of
1256every standard Hangul syllable. Secondary and any higher weights
1257for terminator are set to zero.
1258If the value is false or C<hangul_terminator> key does not exist,
1259insertion of terminator weights will not be performed.
1260
1261Boundaries of Hangul syllables are determined
1262according to conjoining Jamo behavior in F<the Unicode Standard>
1263and F<HangulSyllableType.txt>.
1264
1265B<Implementation Note:>
1266(1) For expansion mapping (Unicode character mapped
1267to a sequence of collation elements), a terminator will not be added
1268between collation elements, even if Hangul syllable boundary exists there.
1269Addition of terminator is restricted to the next position
1270to the last collation element.
1271
1272(2) Non-conjoining Hangul letters
1273(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1274automatically terminated with a terminator primary weight.
1275These characters may need terminator included in a collation element
1276table beforehand.
1277
45394607 1278=item ignoreChar
1279
3756e7ca 1280=item ignoreName
1281
1282-- see 3.2.2 Variable Weighting, UTS #10.
45394607 1283
caffd4cf 1284Makes the entry in the table completely ignorable;
1285i.e. as if the weights were zero at all level.
45394607 1286
3756e7ca 1287Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1288will be ignored. Through C<ignoreName>, any character whose name
1289(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1290will be ignored.
1291
a7fbee98 1292E.g. when 'a' and 'e' are ignorable,
45394607 1293'element' is equal to 'lament' (or 'lmnt').
1294
3756e7ca 1295=item katakana_before_hiragana
1296
1297-- see 7.3.1 Tertiary Weight Table, UTS #10.
1298
1299By default, hiragana is before katakana.
1300If the parameter is made true, this is reversed.
1301
1302B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1303distinctions must occur in level 3, and their weights at level 3 must be
1304same as those mentioned in 7.3.1, UTS #10.
1305If you define your collation elements which violate this requirement,
1306this parameter does not work validly.
1307
45394607 1308=item level
1309
3756e7ca 1310-- see 4.3 Form Sort Key, UTS #10.
45394607 1311
1312Set the maximum level.
1313Any higher levels than the specified one are ignored.
1314
1315 Level 1: alphabetic ordering
1316 Level 2: diacritic ordering
1317 Level 3: case ordering
91ae00cb 1318 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607 1319
1320 ex.level => 2,
1321
a7fbee98 1322If omitted, the maximum is the 4th.
1323
45394607 1324=item normalization
1325
3756e7ca 1326-- see 4.1 Normalize, UTS #10.
45394607 1327
905aa9f0 1328If specified, strings are normalized before preparation of sort keys
45394607 1329(the normalization is executed after preprocess).
1330
1d2654e1 1331A form name C<Unicode::Normalize::normalize()> accepts will be applied
1332as C<$normalization_form>.
06c8fc8f 1333Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1 1334See C<Unicode::Normalize::normalize()> for detail.
1335If omitted, C<'NFD'> is used.
45394607 1336
91ae00cb 1337C<normalization> is performed after C<preprocess> (if defined).
45394607 1338
06c8fc8f 1339Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1340though they are not concerned with C<Unicode::Normalize::normalize()>.
1341
1342If C<undef> (not a string C<"undef">) is passed explicitly
1343as the value for this key,
45394607 1344any normalization is not carried out (this may make tailoring easier
abd1ec54 1345if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1346only contiguous contractions are resolved;
1347e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1348C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f 1349In this point,
1350C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1351B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1352
1353In the case of C<(normalization =E<gt> "prenormalized")>,
1354any normalization is not performed, but
1355non-contiguous contractions with combining characters are performed.
1356Therefore
1357C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1358B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1359If source strings are finely prenormalized,
1360C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1361
1362Except C<(normalization =E<gt> undef)>,
1363B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607 1364
1365=item overrideCJK
1366
4d36a948 1367-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1368
91ae00cb 1369By default, CJK Unified Ideographs are ordered in Unicode codepoint order
3756e7ca 1370but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1371C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1372are lesser than C<CJK Unified Ideographs Extension> (its range is
1373C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
91ae00cb 1374
1375Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
45394607 1376
a7fbee98 1377ex. CJK Unified Ideographs in the JIS code point order.
45394607 1378
1379 overrideCJK => sub {
a7fbee98 1380 my $u = shift; # get a Unicode codepoint
1381 my $b = pack('n', $u); # to UTF-16BE
1382 my $s = your_unicode_to_sjis_converter($b); # convert
1383 my $n = unpack('n', $s); # convert sjis to short
1384 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1385 },
1386
a7fbee98 1387ex. ignores all CJK Unified Ideographs.
1388
1389 overrideCJK => sub {()}, # CODEREF returning empty list
1390
1391 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1392 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1393
1394If C<undef> is passed explicitly as the value for this key,
1395weights for CJK Unified Ideographs are treated as undefined.
1396But assignment of weight for CJK Unified Ideographs
91ae00cb 1397in table or C<entry> is still valid.
a7fbee98 1398
1399=item overrideHangul
1400
4d36a948 1401-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1402
abd1ec54 1403By default, Hangul Syllables are decomposed into Hangul Jamo,
1404even if C<(normalization =E<gt> undef)>.
a7fbee98 1405But the mapping of Hangul Syllables may be overrided.
1406
3756e7ca 1407This parameter works like C<overrideCJK>, so see there for examples.
a7fbee98 1408
45394607 1409If you want to override the mapping of Hangul Syllables,
abd1ec54 1410NFD, NFKD, and FCD are not appropriate,
1411since they will decompose Hangul Syllables before overriding.
45394607 1412
a7fbee98 1413If C<undef> is passed explicitly as the value for this key,
1414weight for Hangul Syllables is treated as undefined
1415without decomposition into Hangul Jamo.
1416But definition of weight for Hangul Syllables
91ae00cb 1417in table or C<entry> is still valid.
a7fbee98 1418
45394607 1419=item preprocess
1420
4d36a948 1421-- see 5.1 Preprocessing, UTS #10.
45394607 1422
1423If specified, the coderef is used to preprocess
1424before the formation of sort keys.
1425
a7fbee98 1426ex. dropping English articles, such as "a" or "the".
45394607 1427Then, "the pen" is before "a pencil".
1428
1429 preprocess => sub {
1430 my $str = shift;
a7fbee98 1431 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1432 return $str;
45394607 1433 },
1434
91ae00cb 1435C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1436
45394607 1437=item rearrange
1438
4d36a948 1439-- see 3.1.3 Rearrangement, UTS #10.
45394607 1440
1441Characters that are not coded in logical order and to be rearranged.
3756e7ca 1442If C<UCA_Version> is equal to or lesser than 11, default is:
45394607 1443
1444 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1445
3756e7ca 1446If you want to disallow any rearrangement, pass C<undef> or C<[]>
1447(a reference to empty list) as the value for this key.
1448
1449If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
a7fbee98 1450
0116f5dc 1451B<According to the version 9 of UCA, this parameter shall not be used;
1452but it is not warned at present.>
1453
45394607 1454=item table
1455
4d36a948 1456-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1457
91ae00cb 1458You can use another collation element table if desired.
45394607 1459
e7f779c8 1460The table file should locate in the F<Unicode/Collate> directory
3756e7ca 1461on C<@INC>. Say, if the filename is F<Foo.txt>,
1462the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
e7f779c8 1463
1464By default, F<allkeys.txt> (as the filename of DUCET) is used.
0d50d293 1465If you will prepare your own table file, any name other than F<allkeys.txt>
1466may be better to avoid namespace conflict.
45394607 1467
a7fbee98 1468If C<undef> is passed explicitly as the value for this key,
91ae00cb 1469no file is read (but you can define collation elements via C<entry>).
a7fbee98 1470
1471A typical way to define a collation element table
1472without any file of table:
1473
1474 $onlyABC = Unicode::Collate->new(
1475 table => undef,
1476 entry => << 'ENTRIES',
14770061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
14780041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
14790062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
14800042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
14810063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
14820043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1483ENTRIES
1484 );
905aa9f0 1485
3756e7ca 1486If C<ignoreName> or C<undefName> is used, character names should be
1487specified as a comment (following C<#>) on each line.
45394607 1488
1489=item undefChar
1490
3756e7ca 1491=item undefName
1492
4d36a948 1493-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1494
1495Undefines the collation element as if it were unassigned in the table.
1496This reduces the size of the table.
1497If an unassigned character appears in the string to be collated,
1498the sort key is made from its codepoint
1499as a single-character collation element,
1500as it is greater than any other assigned collation elements
1501(in the codepoint order among the unassigned characters).
1502But, it'd be better to ignore characters
1503unfamiliar to you and maybe never used.
1504
3756e7ca 1505Through C<undefChar>, any character matching C<qr/$undefChar/>
1506will be undefined. Through C<undefName>, any character whose name
1507(given in the C<table> file as a comment) matches C<qr/$undefName/>
1508will be undefined.
1509
e7f779c8 1510ex. Collation weights for beyond-BMP characters are not stored in object:
1511
1512 undefChar => qr/[^\0-\x{fffd}]/,
1513
45394607 1514=item upper_before_lower
1515
3756e7ca 1516-- see 6.6 Case Comparisons, UTS #10.
45394607 1517
3756e7ca 1518By default, lowercase is before uppercase.
1519If the parameter is made true, this is reversed.
45394607 1520
3756e7ca 1521B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1522distinctions must occur in level 3, and their weights at level 3 must be
1523same as those mentioned in 7.3.1, UTS #10.
1524If you define your collation elements which differs from this requirement,
1525this parameter doesn't work validly.
45394607 1526
91ae00cb 1527=item variable
1528
91ae00cb 1529-- see 3.2.2 Variable Weighting, UTS #10.
1530
91ae00cb 1531This key allows to variable weighting for variable collation elements,
1532which are marked with an ASTERISK in the table
1533(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1534
1535 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1536
1537These names are case-insensitive.
1538By default (if specification is omitted), 'shifted' is adopted.
1539
1540 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1541 considered at the 4th level.
1542
abd1ec54 1543 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb 1544
1545 'Shifted' Variable elements are made ignorable at levels 1 through 3
1546 their level 4 weight is replaced by the old level 1 weight.
1547 Level 4 weight for Non-Variable elements is 0xFFFF.
1548
1549 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1550 are trimmed.
1551
45394607 1552=back
1553
3164dd77 1554=head2 Methods for Collation
45394607 1555
1556=over 4
1557
5398038e 1558=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1559
1560Sorts a list of strings.
1561
5398038e 1562=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1563
1564Returns 1 (when C<$a> is greater than C<$b>)
1565or 0 (when C<$a> is equal to C<$b>)
1566or -1 (when C<$a> is lesser than C<$b>).
1567
5398038e 1568=item C<$result = $Collator-E<gt>eq($a, $b)>
1569
1570=item C<$result = $Collator-E<gt>ne($a, $b)>
1571
1572=item C<$result = $Collator-E<gt>lt($a, $b)>
1573
1574=item C<$result = $Collator-E<gt>le($a, $b)>
1575
1576=item C<$result = $Collator-E<gt>gt($a, $b)>
1577
1578=item C<$result = $Collator-E<gt>ge($a, $b)>
1579
a7fbee98 1580They works like the same name operators as theirs.
5398038e 1581
1582 eq : whether $a is equal to $b.
1583 ne : whether $a is not equal to $b.
1584 lt : whether $a is lesser than $b.
1585 le : whether $a is lesser than $b or equal to $b.
1586 gt : whether $a is greater than $b.
1587 ge : whether $a is greater than $b or equal to $b.
1588
1589=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1590
3756e7ca 1591-- see 4.3 Form Sort Key, UTS #10.
45394607 1592
1593Returns a sort key.
1594
1595You compare the sort keys using a binary comparison
1596and get the result of the comparison of the strings using UCA.
1597
5398038e 1598 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1599
1600 is equivalent to
1601
5398038e 1602 $Collator->cmp($a, $b)
45394607 1603
a7fbee98 1604=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1605
3756e7ca 1606Converts a sorting key into its representation form.
1607If C<UCA_Version> is 8, the output is slightly different.
1608
a7fbee98 1609 use Unicode::Collate;
1610 my $c = Unicode::Collate->new();
1611 print $c->viewSortKey("Perl"),"\n";
1612
0116f5dc 1613 # output:
1614 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1615 # Level 1 Level 2 Level 3 Level 4
1616
4d36a948 1617=back
1618
1619=head2 Methods for Searching
d16e9e3d 1620
3756e7ca 1621B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
4d36a948 1622for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1623C<subst>, C<gsubst>) is croaked,
1624as the position and the length might differ
1625from those on the specified string.
3756e7ca 1626(And C<rearrange> and C<hangul_terminator> parameters are neglected.)
d16e9e3d 1627
4d36a948 1628The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1629like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1630but they are not aware of any pattern, but only a literal substring.
1631
1632=over 4
1633
1634=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1635
1636=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1637
1638If C<$substring> matches a part of C<$string>, returns
1639the position of the first occurrence of the matching part in scalar context;
1640in list context, returns a two-element list of
1641the position and the length of the matching part.
1642
d16e9e3d 1643If C<$substring> does not match any part of C<$string>,
1644returns C<-1> in scalar context and
1645an empty list in list context.
1646
1647e.g. you say
1648
5398038e 1649 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1650 # (normalization => undef) is REQUIRED.
1651 my $str = "Ich muß studieren Perl.";
1652 my $sub = "MÜSS";
d16e9e3d 1653 my $match;
a7fbee98 1654 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1655 $match = substr($str, $pos, $len);
d16e9e3d 1656 }
1657
4d36a948 1658and get C<"muß"> in C<$match> since C<"muß">
3756e7ca 1659is primary equal to C<"MÜSS">.
4d36a948 1660
1661=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1662
1663=item C<($match) = $Collator-E<gt>match($string, $substring)>
1664
1665If C<$substring> matches a part of C<$string>, in scalar context, returns
1666B<a reference to> the first occurrence of the matching part
1667(C<$match_ref> is always true if matches,
1668since every reference is B<true>);
1669in list context, returns the first occurrence of the matching part.
1670
1671If C<$substring> does not match any part of C<$string>,
1672returns C<undef> in scalar context and
1673an empty list in list context.
1674
1675e.g.
1676
1677 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1678 print "matches [$$match_ref].\n";
1679 } else {
1680 print "doesn't match.\n";
1681 }
1682
3756e7ca 1683 or
4d36a948 1684
1685 if (($match) = $Collator->match($str, $sub)) { # list context
1686 print "matches [$match].\n";
1687 } else {
1688 print "doesn't match.\n";
1689 }
1690
1691=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1692
1693If C<$substring> matches a part of C<$string>, returns
1694all the matching parts (or matching count in scalar context).
1695
1696If C<$substring> does not match any part of C<$string>,
1697returns an empty list.
1698
1699=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1700
1701If C<$substring> matches a part of C<$string>,
1702the first occurrence of the matching part is replaced by C<$replacement>
1703(C<$string> is modified) and return C<$count> (always equals to C<1>).
1704
1705C<$replacement> can be a C<CODEREF>,
1706taking the matching part as an argument,
1707and returning a string to replace the matching part
1708(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1709
1710=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1711
1712If C<$substring> matches a part of C<$string>,
1713all the occurrences of the matching part is replaced by C<$replacement>
1714(C<$string> is modified) and return C<$count>.
1715
1716C<$replacement> can be a C<CODEREF>,
1717taking the matching part as an argument,
1718and returning a string to replace the matching part
1719(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1720
1721e.g.
1722
1723 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1724 # (normalization => undef) is REQUIRED.
3756e7ca 1725 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
4d36a948 1726 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1727
3756e7ca 1728 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
4d36a948 1729 # i.e., all the camels are made bold-faced.
d16e9e3d 1730
45394607 1731=back
1732
3164dd77 1733=head2 Other Methods
1734
1735=over 4
1736
0116f5dc 1737=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1738
1739Change the value of specified keys and returns the changed part.
1740
1741 $Collator = Unicode::Collate->new(level => 4);
1742
1743 $Collator->eq("perl", "PERL"); # false
1744
1745 %old = $Collator->change(level => 2); # returns (level => 4).
1746
1747 $Collator->eq("perl", "PERL"); # true
1748
1749 $Collator->change(%old); # returns (level => 2).
1750
1751 $Collator->eq("perl", "PERL"); # false
1752
1753Not all C<(key,value)>s are allowed to be changed.
1754See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1755
1756In the scalar context, returns the modified collator
1757(but it is B<not> a clone from the original).
1758
1759 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1760
1761 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1762
1763 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1764
91ae00cb 1765=item C<$version = $Collator-E<gt>version()>
3164dd77 1766
91ae00cb 1767Returns the version number (a string) of the Unicode Standard
1768which the C<table> file used by the collator object is based on.
1769If the table does not include a version line (starting with C<@version>),
1770returns C<"unknown">.
1771
1772=item C<UCA_Version()>
3164dd77 1773
91ae00cb 1774Returns the tracking version number of UTS #10 this module consults.
3164dd77 1775
91ae00cb 1776=item C<Base_Unicode_Version()>
1777
1778Returns the version number of UTS #10 this module consults.
3164dd77 1779
1780=back
1781
3756e7ca 1782=head1 EXPORT
1783
1784No method will be exported.
45394607 1785
0d50d293 1786=head1 INSTALL
1787
1788Though this module can be used without any C<table> file,
1789to use this module easily, it is recommended to install a table file
1790in the UCA format, by copying it under the directory
1791<a place in @INC>/Unicode/Collate.
1792
1793The most preferable one is "The Default Unicode Collation Element Table",
1794available from the Unicode Consortium's website:
1795
1796 http://www.unicode.org/Public/UCA/
1797
1798 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1799
1800If DUCET is not installed, it is recommended to copy the file
1801from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1802to <a place in @INC>/Unicode/Collate/allkeys.txt
1803manually.
1804
3756e7ca 1805=head1 CAVEATS
1806
1807=over 4
45394607 1808
3756e7ca 1809=item Normalization
45394607 1810
3756e7ca 1811Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1812module (see L<Unicode::Normalize>).
45394607 1813
5398038e 1814If you need not it (say, in the case when you need not
45394607 1815handle any combining characters),
1816assign C<normalization =E<gt> undef> explicitly.
1817
4d36a948 1818-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1819
3756e7ca 1820=item Conformance Test
0116f5dc 1821
10d7ec48 1822The Conformance Test for the UCA is available
1823under L<http://www.unicode.org/Public/UCA/>.
0116f5dc 1824
1825For F<CollationTest_SHIFTED.txt>,
1826a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1827for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 1828C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 1829
4d36a948 1830B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1831
3756e7ca 1832=back
1833
0d50d293 1834=head1 AUTHOR, COPYRIGHT AND LICENSE
45394607 1835
0d50d293 1836The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1837<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2005,
1838SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1839
0d50d293 1840This module is free software; you can redistribute it and/or
1841modify it under the same terms as Perl itself.
45394607 1842
0d50d293 1843The file Unicode/Collate/allkeys.txt was copied directly
1844from http://www.unicode.org/Public/UCA/4.1.0/allkeys.txt (aka DUCET).
1845This file is Copyright (c) 1991-2005 Unicode, Inc. All rights reserved.
1846Distributed under the Terms of Use in http://www.unicode.org/copyright.html
45394607 1847
1848=head1 SEE ALSO
1849
1850=over 4
1851
91ae00cb 1852=item Unicode Collation Algorithm - UTS #10
1853
1854L<http://www.unicode.org/reports/tr10/>
1855
1856=item The Default Unicode Collation Element Table (DUCET)
1857
10d7ec48 1858L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 1859
91ae00cb 1860=item The conformance test for the UCA
45394607 1861
10d7ec48 1862L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 1863
10d7ec48 1864L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 1865
91ae00cb 1866=item Hangul Syllable Type
0116f5dc 1867
10d7ec48 1868L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 1869
91ae00cb 1870=item Unicode Normalization Forms - UAX #15
a7fbee98 1871
91ae00cb 1872L<http://www.unicode.org/reports/tr15/>
a7fbee98 1873
45394607 1874=back
1875
1876=cut