Fix typo in win32.c
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
7}
8
45394607 9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
10d7ec48 15no warnings 'utf8';
16
3756e7ca 17our $VERSION = '0.50';
45394607 18our $PACKAGE = __PACKAGE__;
19
e7f779c8 20my @Path = qw(Unicode Collate);
21my $KeyFile = "allkeys.txt";
45394607 22
4d36a948 23# Perl's boolean
24use constant TRUE => 1;
25use constant FALSE => "";
26use constant NOMATCHPOS => -1;
27
28# A coderef to get combining class imported from Unicode::Normalize
29# (i.e. \&Unicode::Normalize::getCombinClass).
30# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 31my $CVgetCombinClass;
4d36a948 32
9f1f04a1 33# Supported Levels
34use constant MinLevel => 1;
35use constant MaxLevel => 4;
36
4d36a948 37# Minimum weights at level 2 and 3, respectively
9f1f04a1 38use constant Min2Wt => 0x20;
39use constant Min3Wt => 0x02;
4d36a948 40
41# Shifted weight at 4th level
9f1f04a1 42use constant Shift4Wt => 0xFFFF;
4d36a948 43
4d36a948 44# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45# PROBLEM: The Default Unicode Collation Element Table
46# has weights over 0xFFFF at the 4th level.
47# The tie-breaking in the variable weights
48# other than "shift" (as well as "shift-trimmed") is unreliable.
49use constant VCE_TEMPLATE => 'Cn4';
50
4d36a948 51# A sort key: 16-bit weights
52# See also the PROBLEM on VCE_TEMPLATE above.
53use constant KEY_TEMPLATE => 'n*';
54
55# Level separator in a sort key:
56# i.e. pack(KEY_TEMPLATE, 0)
57use constant LEVEL_SEP => "\0\0";
58
59# As Unicode code point separator for hash keys.
60# A joined code point string (denoted by JCPS below)
61# like "65;768" is used for internal processing
62# instead of Perl's Unicode string like "\x41\x{300}",
63# as the native code point is different from the Unicode code point
64# on EBCDIC platform.
65# This character must not be included in any stringified
66# representation of an integer.
67use constant CODE_SEP => ';';
68
69# boolean values of variable weights
0116f5dc 70use constant NON_VAR => 0; # Non-Variable character
71use constant VAR => 1; # Variable character
3164dd77 72
91ae00cb 73# specific code points
# Hangul jamo and syllable code points.  The *Ini/*Fin pairs (plus
# Hangul_LFill) delimit the ranges tested by getHST(); the *Base and
# *Count values are the operands of the arithmetic in _decompHangul().
74use constant Hangul_LBase => 0x1100;
75use constant Hangul_LIni => 0x1100;
76use constant Hangul_LFin => 0x1159;
77use constant Hangul_LFill => 0x115F;
78use constant Hangul_VBase => 0x1161;
3756e7ca 79use constant Hangul_VIni => 0x1160; # from Vowel Filler
91ae00cb 80use constant Hangul_VFin => 0x11A2;
3756e7ca 81use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
91ae00cb 82use constant Hangul_TIni => 0x11A8;
83use constant Hangul_TFin => 0x11F9;
84use constant Hangul_TCount => 28;
85use constant Hangul_NCount => 588;
86use constant Hangul_SBase => 0xAC00;
87use constant Hangul_SIni => 0xAC00;
88use constant Hangul_SFin => 0xD7A3;
# CJK ideograph ranges tested by _isUIdeo() and by _derivCE_9()/_derivCE_14().
# CJK_UidF41 is used instead of CJK_UidFin as the upper bound of the
# unified range when UCA_Version >= 14.
89use constant CJK_UidIni => 0x4E00;
90use constant CJK_UidFin => 0x9FA5;
3756e7ca 91use constant CJK_UidF41 => 0x9FBB;
91ae00cb 92use constant CJK_ExtAIni => 0x3400;
93use constant CJK_ExtAFin => 0x4DB5;
94use constant CJK_ExtBIni => 0x20000;
95use constant CJK_ExtBFin => 0x2A6D6;
96use constant BMP_Max => 0xFFFF;
97
4d36a948 98# Logical_Order_Exception in PropList.txt
# new() falls back to this list for 'rearrange' when the caller gives no
# 'rearrange' parameter, the table supplied no rearrange entries and
# UCA_Version <= 11.
e7f779c8 99my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 100
# Returns, as a string, the UCA version used by new() when the caller
# does not pass a UCA_Version parameter.
sub UCA_Version {
    return "14";
}
a7fbee98 102
# Returns the Unicode version string this module reports as its base.
sub Base_Unicode_Version {
    return "4.1.0";
}
a7fbee98 104
9f1f04a1 105######
106
##
## string = pack_U(list of code points)
##
## Builds a string from the given code points with the 'U*' pack template.
##
sub pack_U {
    my @code_points = @_;
    return pack('U*', @code_points);
}
110
##
## list of code points = unpack_U(string)
##
## Splits a string into its code points with the 'U*' unpack template.
##
sub unpack_U {
    my $string = shift(@_);
    # An empty pack('U*') is appended before unpacking; presumably this
    # forces character (not byte) semantics on the operand -- TODO confirm
    # against the pack/unpack documentation.
    return unpack('U*', $string . pack('U*'));
}
114
115######
116
# Set of accepted values for the 'variable' parameter; checkCollator()
# croaks when the lowercased value is not a key of this hash.
91ae00cb 117my (%VariableOK);
118@VariableOK{ qw/
0116f5dc 119 blanked non-ignorable shifted shift-trimmed
91ae00cb 120 / } = (); # keys lowercased
0116f5dc 121
# Parameter names that change() is allowed to modify.
122our @ChangeOK = qw/
123 alternate backwards level normalization rearrange
124 katakana_before_hiragana upper_before_lower
125 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 126 hangul_terminator variable
0116f5dc 127 /;
128
# Parameter names that change() refuses (it croaks when one is passed).
129our @ChangeNG = qw/
91ae00cb 130 entry mapping table maxlength
131 ignoreChar ignoreName undefChar undefName variableTable
0116f5dc 132 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
e7f779c8 133 derivCode normCode rearrangeHash
9f1f04a1 134 backwardsFlag
0116f5dc 135 /;
9f1f04a1 136# The hash key 'ignored' is deleted at v 0.21.
137# The hash key 'isShift' is deleted at v 0.23.
138# The hash key 'combining' is deleted at v 0.24.
91ae00cb 139# The hash key 'entries' is deleted at v 0.30.
e7f779c8 140# The hash key 'L3_ignorable' is deleted at v 0.40.
91ae00cb 141
##
## string = $Collator->version()
##
## Returns the version recorded from the table file ({versionTable}),
## or the string 'unknown' when none was recorded.
##
sub version {
    my ($self) = @_;
    my $table_version = $self->{versionTable};
    return $table_version ? $table_version : 'unknown';
}
0116f5dc 146
# Lookup sets keyed by the names in @ChangeOK / @ChangeNG.  The values are
# undef: change() only tests the keys with exists().
147my (%ChangeOK, %ChangeNG);
148@ChangeOK{ @ChangeOK } = ();
149@ChangeNG{ @ChangeNG } = ();
150
##
## %old  = $Collator->change(%tailoring)   (list context)
## $self = $Collator->change(%tailoring)   (scalar context)
##
## Replaces the parameters named in @ChangeOK, then re-validates the
## collator with checkCollator().  In list context the previous values of
## the changed parameters are returned as a hash.
## Croaks if any key is listed in @ChangeNG; keys found in neither list
## are silently ignored.
##
sub change {
    my $self = shift;
    my %hash = @_;
    my %old;

    # 'variable' wins over 'alternate'; a lone 'alternate' is also applied
    # as 'variable'.
    if (exists $hash{alternate}) {
        if (exists $hash{variable}) {
            delete $hash{alternate};
        }
        else {
            $hash{variable} = $hash{alternate};
        }
    }

    # Reject forbidden keys before touching $self.  Hash order is arbitrary,
    # so croaking in the middle of the update loop would leave an
    # unpredictable subset of the requested changes applied.
    foreach my $k (sort keys %hash) {
        next if exists $ChangeOK{$k};
        croak "change of $k via change() is not allowed!"
            if exists $ChangeNG{$k};
    }

    foreach my $k (keys %hash) {
        next if !exists $ChangeOK{$k};    # unknown keys => ignored
        $old{$k} = $self->{$k};
        $self->{$k} = $hash{$k};
    }
    $self->checkCollator();
    return wantarray ? %old : $self;
}
a7fbee98 174
##
## _checkLevel(level, key name)
##
## Croaks unless MinLevel <= level <= MaxLevel.  The key name ('level' or
## 'backwards') only appears in the error message.
##
sub _checkLevel {
    my ($level, $key) = @_;

    unless (MinLevel <= $level) {
        croak(sprintf(
            "Illegal level %d (in value for key '%s') lower than %d.",
            $level, $key, MinLevel));
    }

    my $not_too_high = ($level <= MaxLevel);
    unless ($not_too_high) {
        croak(sprintf(
            "Unsupported level %d (in value for key '%s') higher than %d.",
            $level, $key, MaxLevel));
    }
    return $not_too_high;
}
185
# Maps a supported UCA_Version to the routine that derives collation
# elements; checkCollator() stores the selected coderef in {derivCode}
# and croaks for a version that is not a key here.
91ae00cb 186my %DerivCode = (
187 8 => \&_derivCE_8,
188 9 => \&_derivCE_9,
189 11 => \&_derivCE_9, # 11 == 9
3756e7ca 190 14 => \&_derivCE_14,
91ae00cb 191);
192
##
## $Collator->checkCollator()
##
## Validates the tailoring parameters held in $self and rebuilds the
## entries derived from them: derivCode, variable/alternate,
## backwardsFlag, rearrangeHash and normCode.
## Croaks on an invalid level, UCA version, variable name, rearrange
## value or normalization form.
##
sub checkCollator {
    my ($self) = @_;

    _checkLevel($self->{level}, "level");

    # routine that derives collation elements for the requested UCA version
    my $deriv = $DerivCode{ $self->{UCA_Version} };
    $self->{derivCode} = $deriv;
    $deriv
        or croak "Illegal UCA version (passed $self->{UCA_Version}).";

    # 'variable' falls back to 'alternate', then to the table's values
    unless ($self->{variable}) {
        $self->{variable} = $self->{alternate}
            || $self->{variableTable}
            || $self->{alternateTable}
            || 'shifted';
    }
    my $variable = lc($self->{variable});
    $self->{alternate} = $variable;
    $self->{variable}  = $variable;
    croak "$PACKAGE unknown variable parameter name: $self->{variable}"
        unless exists $VariableOK{$variable};

    # backwardsFlag has bit N set for every level N collated backwards
    my $backwards = $self->{backwards};
    if (!defined $backwards) {
        $self->{backwardsFlag} = 0;
    }
    elsif (!ref $backwards) {
        _checkLevel($backwards, "backwards");
        $self->{backwardsFlag} = 1 << $backwards;
    }
    else {
        my %seen;    # collapses levels that are listed more than once
        $self->{backwardsFlag} = 0;
        foreach my $lv (@$backwards) {
            _checkLevel($lv, "backwards");
            $seen{$lv} = 1;
        }
        $self->{backwardsFlag} += 1 << $_ for sort keys %seen;
    }

    $self->{rearrange} = [] unless defined $self->{rearrange};
    croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"
        unless ref $self->{rearrange};

    # keys of $self->{rearrangeHash} are the members of $self->{rearrange};
    # it stays undef when the list is empty.
    $self->{rearrangeHash} = undef;
    my @rearranged = @{ $self->{rearrange} };
    if (@rearranged) {
        @{ $self->{rearrangeHash} }{ @rearranged } = ();
    }

    $self->{normCode} = undef;

    my $form = $self->{normalization};
    if (defined $form) {
        eval { require Unicode::Normalize };
        croak "Unicode::Normalize is required to normalize strings" if $@;

        $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;

        if ($form =~ /^(?:NF)D\z/) {    # tweak for default
            $self->{normCode} = \&Unicode::Normalize::NFD;
        }
        elsif ($form ne 'prenormalized') {
            $self->{normCode} = sub {
                Unicode::Normalize::normalize($form, shift);
            };
            # a trial call tells whether the form name is known
            eval { $self->{normCode}->("") };
            croak "$PACKAGE unknown normalization form name: $form" if $@;
        }
    }
    return;
}
258
##
## $Collator = Unicode::Collate->new(%tailoring)
##
## Builds a collator object from the key/value tailoring parameters,
## loads the table file unless 'table' is explicitly undef, parses the
## extra 'entry' lines, fills in defaults and validates the result with
## checkCollator().
##
sub new
{
    my $class = shift;
    my $self  = bless { @_ }, $class;

    # If undef is passed explicitly, no file is read.
    unless (exists $self->{table}) {
        $self->{table} = $KeyFile;
    }
    if (defined $self->{table}) {
        $self->read_table();
    }

    # additional entries: one per line of the 'entry' string
    if ($self->{entry}) {
        while ($self->{entry} =~ /([^\n]+)/g) {
            my $line = $1;
            $self->parseEntry($line);
        }
    }

    $self->{level}       ||= MaxLevel;
    $self->{UCA_Version} ||= UCA_Version();

    # defaults for the parameters the caller did not mention at all
    unless (exists $self->{overrideHangul}) {
        $self->{overrideHangul} = FALSE;
    }
    unless (exists $self->{overrideCJK}) {
        $self->{overrideCJK} = FALSE;
    }
    unless (exists $self->{normalization}) {
        $self->{normalization} = 'NFD';
    }
    unless (exists $self->{rearrange}) {
        $self->{rearrange} = $self->{rearrangeTable}
            || ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []);
    }
    unless (exists $self->{backwards}) {
        $self->{backwards} = $self->{backwardsTable};
    }

    $self->checkCollator();

    return $self;
}
905aa9f0 293
##
## $Collator->read_table()
##
## Looks for the file named by $self->{table} under Unicode/Collate in
## each @INC directory and loads the first one that can be opened.
## Lines starting with '@' set the *Table parameters (version, variable,
## alternate, backwards, forwards, rearrange); every other non-comment
## line is handed to parseEntry().
## Croaks if the file cannot be opened from any @INC directory.
##
sub read_table {
    my $self = shift;

    my($f, $fh);
    foreach my $d (@INC) {
        $f = File::Spec->catfile($d, @Path, $self->{table});
        # Three-argument open: characters such as '>' or '|' in the table
        # name are never interpreted as an open mode or a pipe.
        last if open($fh, '<', $f);
        $f = undef;
    }
    defined $f
        or croak "$PACKAGE: $self->{table} is not found in @INC";

    # A lexical line buffer is used instead of $_, so a caller that is
    # iterating with $_ (e.g. inside for/map) is not clobbered.
    while (my $line = <$fh>) {
        next if $line =~ /^\s*#/;

        # lines without a leading '@' are ordinary table entries
        unless ($line =~ s/^\s*\@//) {
            $self->parseEntry($line);
            next;
        }

        if ($line =~ /^version\s*(\S*)/) {
            $self->{versionTable} ||= $1;
        }
        elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
            $self->{variableTable} ||= $1;
        }
        elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
            $self->{alternateTable} ||= $1;
        }
        elsif ($line =~ /^backwards\s+(\S*)/) {
            push @{ $self->{backwardsTable} }, $1;
        }
        elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
            push @{ $self->{forwardsTable} }, $1;
        }
        elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
            push @{ $self->{rearrangeTable} }, _getHexArray($1);
        }
    }
    close $fh;
}
334
905aa9f0 335
45394607 336##
337## get $line, parse it, and write an entry in $self
338##
##
## $Collator->parseEntry(line)
##
## Parses one table line of the form
##     <charList> ; <collElement>... # name
## and stores the collation elements in $self->{mapping} under the JCPS
## key of <charList>.  For a multi-code-point <charList> the longest length
## seen for its first code point is recorded in $self->{maxlength}.
## Lines not starting with a hex digit, and entries excluded by
## undefName/undefChar, are skipped.  Croaks when no <collElement> part
## follows the ';'.
##
sub parseEntry
{
    my $self = shift;
    my $line = shift;
    my($entry, @uv, @key);

    # Defaults to the empty string so that the undefName/ignoreName matches
    # below do not warn about an uninitialized value when the line carries
    # no comment.
    my $name = '';

    return if $line !~ /^\s*[0-9A-Fa-f]/;

    # removes comment and gets name
    $name = $1
        if $line =~ s/[#%]\s*(.*)//;
    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

    # gets element
    my($e, $k) = split /;/, $line;
    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
        if ! $k;

    @uv = _getHexArray($e);
    return if !@uv;

    $entry = join(CODE_SEP, @uv); # in JCPS

    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
        my $ele = pack_U(@uv);

        # regarded as if it were not entried in the table
        return
            if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

        # replaced as completely ignorable
        $k = '[.0000.0000.0000.0000]'
            if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
    }

    # replaced as completely ignorable
    $k = '[.0000.0000.0000.0000]'
        if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;

    my $is_L3_ignorable = TRUE;

    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
        my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
        my @wt = _getHexArray($arr);
        push @key, pack(VCE_TEMPLATE, $var, @wt);
        $is_L3_ignorable = FALSE
            if $wt[0] || $wt[1] || $wt[2];
        # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
        # is completely ignorable.
        # For expansion, an entry $is_L3_ignorable
        # if and only if "all" CEs are [.0000.0000.0000].
    }

    # an empty arrayref marks a completely ignorable entry
    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;

    # remember the longest sequence that starts with this code point
    if (@uv > 1) {
        (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
            and $self->{maxlength}{$uv[0]} = @uv;
    }
}
399
9f1f04a1 400
45394607 401##
abd1ec54 402## VCE = _varCE(variable term, VCE)
45394607 403##
##
## VCE = _varCE(variable term, VCE)
##
## Rewrites one packed collation element according to the 'variable'
## setting ('non-ignorable', 'blanked', 'shifted' or 'shift-trimmed').
##
sub _varCE
{
    my ($vbl, $vce) = @_;

    # non-ignorable: the element is used as it is
    return $vce if $vbl eq 'non-ignorable';

    my ($var, @wt) = unpack(VCE_TEMPLATE, $vce);

    if ($var) {
        # variable element: levels 1-3 are zeroed; the 4th keeps its own
        # value for 'blanked', otherwise it takes the primary weight
        my $fourth = $vbl eq 'blanked' ? $wt[3] : $wt[0];
        return pack(VCE_TEMPLATE, $var, 0, 0, 0, $fourth);
    }

    return $vce if $vbl eq 'blanked';

    # non-variable element under 'shifted' / 'shift-trimmed'
    my $fourth = ($vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2]) ? Shift4Wt : 0;
    return pack(VCE_TEMPLATE, $var, @wt[0..2], $fourth);
}
425
##
## string = $Collator->viewSortKey(string)
##
## Convenience wrapper: computes the sort key with getSortKey() and
## returns its readable form as produced by visualizeSortKey().
##
sub viewSortKey
{
    my $self = shift;
    my @key = $self->getSortKey(@_);
    return $self->visualizeSortKey(@key);
}
0116f5dc 431
##
## string = $Collator->visualizeSortKey(sort key)
##
## Renders a binary sort key as bracketed, space-separated 4-digit hex
## weights, with the zero level separators shown as '|'.
##
sub visualizeSortKey
{
    my ($self, $key) = @_;
    my @weights = unpack(KEY_TEMPLATE, $key);
    my $view = join " ", map { sprintf("%04X", $_) } @weights;

    if ($self->{UCA_Version} <= 8) {
        # here the spaces around a separator are swallowed as well
        $view =~ s/ ?0000 ?/|/g;
    }
    else {
        $view =~ s/\b0000\b/|/g;
    }
    return "[$view]";
}
444
d16e9e3d 445
45394607 446##
91ae00cb 447## arrayref of JCPS = splitEnt(string to be collated)
448## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 449##
# Splits a string into the units that are looked up in the mapping table.
# Returns an arrayref of JCPS strings or, when the second argument is true,
# an arrayref of [JCPS, ini_pos, fin_pos] triplets.
# Croaks when positions are requested while a preprocess or normalization
# routine is set.
91ae00cb 450sub splitEnt
45394607 451{
a7fbee98 452 my $self = shift;
# after the shift above, $_[0] is the string and $_[1] the position flag
4d36a948 453 my $wLen = $_[1];
454
a7fbee98 455 my $code = $self->{preprocess};
0116f5dc 456 my $norm = $self->{normCode};
91ae00cb 457 my $map = $self->{mapping};
a7fbee98 458 my $max = $self->{maxlength};
459 my $reH = $self->{rearrangeHash};
# true only for UCA_Version 9 through 11
3756e7ca 460 my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
a7fbee98 461
4d36a948 462 my ($str, @buf);
a7fbee98 463
4d36a948 464 if ($wLen) {
465 $code and croak "Preprocess breaks character positions. "
466 . "Don't use with index(), match(), etc.";
467 $norm and croak "Normalization breaks character positions. "
468 . "Don't use with index(), match(), etc.";
469 $str = $_[0];
470 }
471 else {
472 $str = $_[0];
# preprocess runs first, then normalization works on its result
473 $str = &$code($str) if ref $code;
474 $str = &$norm($str) if ref $norm;
475 }
a7fbee98 476
4d36a948 477 # get array of Unicode code point of string.
9f1f04a1 478 my @src = unpack_U($str);
4d36a948 479
480 # rearrangement:
481 # Character positions are not kept if rearranged,
482 # then neglected if $wLen is true.
483 if ($reH && ! $wLen) {
a7fbee98 484 for (my $i = 0; $i < @src; $i++) {
485 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
486 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
# the extra increment steps over the pair that was just swapped
487 $i++;
488 }
489 }
45394607 490 }
45394607 491
3756e7ca 492 # remove a code point marked as a completely ignorable.
# Removed code points become undef in @src so that the indices of the
# remaining ones are unchanged.  Illegal code points are always removed;
# mapped-to-empty ones are removed here only when $ver9 is true.
abd1ec54 493 for (my $i = 0; $i < @src; $i++) {
494 $src[$i] = undef
e7f779c8 495 if _isIllegal($src[$i]) || ($ver9 &&
496 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
0116f5dc 497 }
498
a7fbee98 499 for (my $i = 0; $i < @src; $i++) {
91ae00cb 500 my $jcps = $src[$i];
3756e7ca 501
502 # skip removed code point
503 if (! defined $jcps) {
# the end position of the previous element is extended over the gap
504 if ($wLen && @buf) {
505 $buf[-1][2] = $i + 1;
506 }
507 next;
508 }
509
# start index of this element; $i may be advanced by a contraction below
abd1ec54 510 my $i_orig = $i;
4d36a948 511
3756e7ca 512 # find contraction
513 if ($max->{$jcps}) {
91ae00cb 514 my $temp_jcps = $jcps;
515 my $jcpsLen = 1;
516 my $maxLen = $max->{$jcps};
4d36a948 517
# Extend the candidate up to $maxLen code points and keep the longest
# prefix that has an entry in the mapping.
91ae00cb 518 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 519 next if ! defined $src[$p];
91ae00cb 520 $temp_jcps .= CODE_SEP . $src[$p];
521 $jcpsLen++;
522 if ($map->{$temp_jcps}) {
523 $jcps = $temp_jcps;
4d36a948 524 $i = $p;
525 }
526 }
4d36a948 527
06c8fc8f 528 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
529 # This process requires Unicode::Normalize.
91ae00cb 530 # If "normalization" is undef, here should be skipped *always*
06c8fc8f 531 # (in spite of bool value of $CVgetCombinClass),
532 # since canonical ordering cannot be expected.
533 # Blocked combining character should not be contracted.
534
535 if ($self->{normalization})
536 # $self->{normCode} is false in the case of "prenormalized".
537 {
538 my $preCC = 0;
539 my $curCC = 0;
540
541 for (my $p = $i + 1; $p < @src; $p++) {
542 next if ! defined $src[$p];
543 $curCC = $CVgetCombinClass->($src[$p]);
# a code point with combining class 0 ends the scan
544 last unless $curCC;
545 my $tail = CODE_SEP . $src[$p];
91ae00cb 546 if ($preCC != $curCC && $map->{$jcps.$tail}) {
547 $jcps .= $tail;
# the absorbed combining character is marked as removed
06c8fc8f 548 $src[$p] = undef;
549 } else {
550 $preCC = $curCC;
551 }
4d36a948 552 }
a7fbee98 553 }
a7fbee98 554 }
555
3756e7ca 556 # skip completely ignorable
557 if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
558 if ($wLen && @buf) {
559 $buf[-1][2] = $i + 1;
a7fbee98 560 }
3756e7ca 561 next;
a7fbee98 562 }
4d36a948 563
91ae00cb 564 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 565 }
4d36a948 566 return \@buf;
d16e9e3d 567}
45394607 568
d16e9e3d 569
570##
abd1ec54 571## list of VCE = getWt(JCPS)
d16e9e3d 572##
# Returns the list of packed collation elements (VCE) for one JCPS, each
# passed through _varCE() with the collator's 'variable' setting.
# Returns nothing when the JCPS is undef.
573sub getWt
574{
a7fbee98 575 my $self = shift;
91ae00cb 576 my $u = shift;
abd1ec54 577 my $vbl = $self->{variable};
91ae00cb 578 my $map = $self->{mapping};
0116f5dc 579 my $der = $self->{derivCode};
a7fbee98 580
91ae00cb 581 return if !defined $u;
# an entry in the mapping table takes precedence over everything below
abd1ec54 582 return map(_varCE($vbl, $_), @{ $map->{$u} })
91ae00cb 583 if $map->{$u};
a7fbee98 584
91ae00cb 585 # JCPS must not be a contraction, then it's a code point.
586 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
# overrideHangul: a true value is called as a coderef, undef selects the
# derived elements, a defined-but-false value selects jamo decomposition.
1d2654e1 587 my $hang = $self->{overrideHangul};
588 my @hangulCE;
589 if ($hang) {
590 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
591 }
592 elsif (!defined $hang) {
593 @hangulCE = $der->($u);
594 }
595 else {
596 my $max = $self->{maxlength};
597 my @decH = _decompHangul($u);
598
# Adjacent jamo are merged into one JCPS when the mapping has an entry
# for the joined sequence.
599 if (@decH == 2) {
600 my $contract = join(CODE_SEP, @decH);
91ae00cb 601 @decH = ($contract) if $map->{$contract};
1d2654e1 602 } else { # must be <@decH == 3>
603 if ($max->{$decH[0]}) {
604 my $contract = join(CODE_SEP, @decH);
91ae00cb 605 if ($map->{$contract}) {
1d2654e1 606 @decH = ($contract);
607 } else {
608 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 609 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1 610 }
611 # even if V's ignorable, LT contraction is not supported.
612 # If such a situation were required, NFD should be used.
613 }
# still three separate jamo: try to merge the last two
614 if (@decH == 3 && $max->{$decH[1]}) {
615 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 616 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1 617 }
618 }
619
# each remaining unit uses its mapping entry, else the derived elements
620 @hangulCE = map({
91ae00cb 621 $map->{$_} ? @{ $map->{$_} } : $der->($_);
1d2654e1 622 } @decH);
623 }
abd1ec54 624 return map _varCE($vbl, $_), @hangulCE;
a7fbee98 625 }
3756e7ca 626 elsif (_isUIdeo($u, $self->{UCA_Version})) {
# overrideCJK: a true value is called as a coderef; a defined-but-false
# value selects _uideoCE_8() only when UCA_Version <= 8 and $u < 0x10000;
# in every other case the derived elements are used.
1d2654e1 627 my $cjk = $self->{overrideCJK};
abd1ec54 628 return map _varCE($vbl, $_),
0116f5dc 629 $cjk
4d36a948 630 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
3756e7ca 631 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
632 ? _uideoCE_8($u)
0116f5dc 633 : $der->($u);
a7fbee98 634 }
635 else {
abd1ec54 636 return map _varCE($vbl, $_), $der->($u);
a7fbee98 637 }
d16e9e3d 638}
639
d16e9e3d 640
641##
642## string sortkey = getSortKey(string arg)
643##
# Builds the binary sort key of a string: the weights of each level are
# collected separately, packed with KEY_TEMPLATE and joined with LEVEL_SEP.
644sub getSortKey
645{
a7fbee98 646 my $self = shift;
647 my $lev = $self->{level};
91ae00cb 648 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
# true when ignorable elements following a variable element are dropped
3756e7ca 649 my $v2i = $self->{UCA_Version} >= 9 &&
650 $self->{variable} ne 'non-ignorable';
91ae00cb 651
abd1ec54 652 my @buf; # weight arrays
91ae00cb 653 if ($self->{hangul_terminator}) {
# $preHST/$curHST hold the getHST() types of the previous and current
# element; a terminator element is inserted where the tests below match.
654 my $preHST = '';
655 foreach my $jcps (@$rEnt) {
656 # weird things like VL, TL-contraction are not considered!
657 my $curHST = '';
658 foreach my $u (split /;/, $jcps) {
659 $curHST .= getHST($u);
660 }
661 if ($preHST && !$curHST || # hangul before non-hangul
662 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
663 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
664 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
0116f5dc 665
abd1ec54 666 push @buf, $self->getWtHangulTerm();
91ae00cb 667 }
668 $preHST = $curHST;
669
abd1ec54 670 push @buf, $self->getWt($jcps);
91ae00cb 671 }
672 $preHST # end at hangul
abd1ec54 673 and push @buf, $self->getWtHangulTerm();
91ae00cb 674 }
675 else {
676 foreach my $jcps (@$rEnt) {
abd1ec54 677 push @buf, $self->getWt($jcps);
91ae00cb 678 }
679 }
680
abd1ec54 681 # make sort key
# one weight list per level; $ret[0] is level 1
682 my @ret = ([],[],[],[]);
683 my $last_is_variable;
684
abd1ec54 685 foreach my $vwt (@buf) {
686 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
3756e7ca 687
688 # "Ignorable (L1, L2) after Variable" since track. v. 9
abd1ec54 689 if ($v2i) {
690 if ($var) {
691 $last_is_variable = TRUE;
692 }
693 elsif (!$wt[0]) { # ignorable
4d36a948 694 next if $last_is_variable;
abd1ec54 695 }
696 else {
697 $last_is_variable = FALSE;
0116f5dc 698 }
699 }
# only non-zero weights of the levels up to $lev enter the key
abd1ec54 700 foreach my $v (0..$lev-1) {
701 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
a7fbee98 702 }
703 }
45394607 704
a7fbee98 705 # modification of tertiary weights
# the loops below alias $_ to the level-3 weights and change them in place
706 if ($self->{upper_before_lower}) {
707 foreach (@{ $ret[2] }) {
708 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
709 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
710 elsif ($_ == 0x1C) { $_ += 1 } # square upper
711 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
712 }
45394607 713 }
a7fbee98 714 if ($self->{katakana_before_hiragana}) {
715 foreach (@{ $ret[2] }) {
716 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
717 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
718 }
45394607 719 }
9f1f04a1 720
# bit N of backwardsFlag reverses the weight list of level N
721 if ($self->{backwardsFlag}) {
722 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
723 if ($self->{backwardsFlag} & (1 << $v)) {
724 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
725 }
726 }
727 }
728
# the value of this last expression is the sub's return value
4d36a948 729 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607 730}
731
732
733##
d16e9e3d 734## int compare = cmp(string a, string b)
45394607 735##
# Comparison methods: each one applies the string operator of the same
# name to the sort keys of its two string arguments.
sub cmp {
    my ($self, $left, $right) = @_;
    return $self->getSortKey($left) cmp $self->getSortKey($right);
}

sub eq {
    my ($self, $left, $right) = @_;
    return $self->getSortKey($left) eq $self->getSortKey($right);
}

sub ne {
    my ($self, $left, $right) = @_;
    return $self->getSortKey($left) ne $self->getSortKey($right);
}

sub lt {
    my ($self, $left, $right) = @_;
    return $self->getSortKey($left) lt $self->getSortKey($right);
}

sub le {
    my ($self, $left, $right) = @_;
    return $self->getSortKey($left) le $self->getSortKey($right);
}

sub gt {
    my ($self, $left, $right) = @_;
    return $self->getSortKey($left) gt $self->getSortKey($right);
}

sub ge {
    my ($self, $left, $right) = @_;
    return $self->getSortKey($left) ge $self->getSortKey($right);
}
45394607 743
744##
d16e9e3d 745## list[strings] sorted = sort(list[strings] arg)
45394607 746##
# Returns the given strings ordered by their sort keys.  Each key is
# computed once per string and compared with the string operator cmp.
sub sort {
    my ($obj, @strings) = @_;

    my @keyed;
    foreach my $string (@strings) {
        push @keyed, [ $obj->getSortKey($string), $string ];
    }

    my @ordered = sort { $a->[0] cmp $b->[0] } @keyed;
    return map { $_->[1] } @ordered;
}
754
0116f5dc 755
##
## list of two VCE = _derivCE_14(code point)
##
## Derives the two collation elements of a code point that has no table
## entry.  The primary base depends on the CJK range the code point is
## in; here the unified range ends at CJK_UidF41.
##
sub _derivCE_14 {
    my ($u) = @_;

    my $base;
    if (CJK_UidIni <= $u && $u <= CJK_UidF41) {
        $base = 0xFB40; # CJK
    }
    elsif (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
           CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
        $base = 0xFB80; # CJK ext.
    }
    else {
        $base = 0xFBC0; # others
    }

    # the code point is split at bit 15 over the two primary weights
    my $aaaa = $base + ($u >> 15);
    my $bbbb = ($u & 0x7FFF) | 0x8000;
    return (
        pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
        pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u)
    );
}
772
##
## list of two VCE = _derivCE_9(code point)
##
## Derives the two collation elements of a code point that has no table
## entry.  The primary base depends on the CJK range the code point is
## in; here the unified range ends at CJK_UidFin.
##
sub _derivCE_9 {
    my ($u) = @_;

    my $base;
    if (CJK_UidIni <= $u && $u <= CJK_UidFin) {
        $base = 0xFB40; # CJK
    }
    elsif (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
           CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
        $base = 0xFB80; # CJK ext.
    }
    else {
        $base = 0xFBC0; # others
    }

    # the code point is split at bit 15 over the two primary weights
    my $aaaa = $base + ($u >> 15);
    my $bbbb = ($u & 0x7FFF) | 0x8000;
    return (
        pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
        pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u)
    );
}
789
##
## list of two VCE = _derivCE_8(code point)
##
## Derives the two collation elements of a code point that has no table
## entry, using the fixed primary base 0xFF80 and the literal weights
## 2 and 1 at levels 2 and 3.
##
sub _derivCE_8 {
    my ($code) = @_;
    my $high = 0xFF80 + ($code >> 15);
    my $low  = ($code & 0x7FFF) | 0x8000;
    return (
        pack(VCE_TEMPLATE, NON_VAR, $high, 2, 1, $code),
        pack(VCE_TEMPLATE, NON_VAR, $low, 0, 0, $code)
    );
}
798
##
## VCE = _uideoCE_8(code point)
##
## Returns a single collation element whose primary weight is the code
## point itself.
##
sub _uideoCE_8 {
    my ($u) = @_;
    my $vce = pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
    return $vce;
}
803
##
## bool = _isUIdeo(code point, UCA version)
##
## True when the code point lies in the CJK unified range, in extension A
## or in extension B.  The upper bound of the unified range is CJK_UidF41
## for UCA version 14 and later, CJK_UidFin otherwise.
##
sub _isUIdeo {
    my ($u, $uca_vers) = @_;

    my $in_unified = CJK_UidIni <= $u
        && $u <= ($uca_vers >= 14 ? CJK_UidF41 : CJK_UidFin);
    my $in_ext_a = (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin);
    my $in_ext_b = (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin);

    return($in_unified || $in_ext_a || $in_ext_b);
}
815
91ae00cb 816
##
## VCE = $Collator->getWtHangulTerm()
##
## Returns the collation element for the hangul terminator: its primary
## weight is $self->{hangul_terminator}, the other weights are 0, and it
## is passed through _varCE() like any other element.
##
sub getWtHangulTerm {
    my ($self) = @_;
    my $term_vce =
        pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0, 0, 0);
    return _varCE($self->{variable}, $term_vce);
}
822
823
45394607 824##
825## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
826##
# Extracts every run of hexadecimal digits from the argument and returns
# their numeric values, in order of appearance.
sub _getHexArray {
    my @hex_tokens = $_[0] =~ /([0-9a-fA-F]+)/g;
    return map { hex($_) } @hex_tokens;
}
5398038e 828
a7fbee98 829#
4d36a948 830# $code *must* be in Hangul syllable.
a7fbee98 831# Check it before you enter here.
832#
# Decomposes a hangul syllable code point into two or three jamo code
# points.  The third one is returned only when the trailing index is
# non-zero.  The range of $code is not checked here.
sub _decompHangul {
    my ($code) = @_;
    my $s_index = $code - Hangul_SBase;
    my $t_index = $s_index % Hangul_TCount;
    my $v_index = int(($s_index % Hangul_NCount) / Hangul_TCount);
    my $l_index = int($s_index / Hangul_NCount);
    return (
        Hangul_LBase + $l_index,
        Hangul_VBase + $v_index,
        $t_index ? (Hangul_TBase + $t_index) : ()
    );
}
845
##
## bool = _isIllegal(code point)
##
## True for an undefined value and for a code point that is out of the
## range 0..0x10FFFF, ends in FFFE/FFFF, lies in 0xD800..0xDFFF or lies in
## 0xFDD0..0xFDEF.
##
sub _isIllegal {
    my ($code) = @_;

    return !!1 if !defined $code;                           # removed

    my $out_of_range = ($code < 0 || 0x10FFFF < $code);
    my $ends_fffe    = (($code & 0xFFFE) == 0xFFFE);        # ??FFF[EF]
    my $surrogate    = (0xD800 <= $code && $code <= 0xDFFF);
    my $nonchar      = (0xFDD0 <= $code && $code <= 0xFDEF);

    return $out_of_range || $ends_fffe || $surrogate || $nonchar;
}
855
91ae00cb 856# Hangul Syllable Type
# Hangul Syllable Type of a code point: "L", "V" or "T" for a jamo in the
# respective range (Hangul_LFill counts as "L"), "LV" or "LVT" for a
# precomposed syllable, and "" for anything else.
sub getHST {
    my ($u) = @_;

    if (Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill) {
        return "L";
    }
    if (Hangul_VIni <= $u && $u <= Hangul_VFin) {
        return "V";
    }
    if (Hangul_TIni <= $u && $u <= Hangul_TFin) {
        return "T";
    }
    if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
        # a non-zero remainder means the syllable has a trailing part
        my $t_index = ($u - Hangul_SBase) % Hangul_TCount;
        return $t_index ? "LVT" : "LV";
    }
    return "";
}
866
4d36a948 867
868##
869## bool _nonIgnorAtLevel(arrayref weights, int level)
870##
# Tells whether any weight from level MinLevel up to the given level is
# non-zero.  Returns nothing when the weights arrayref is undefined.
sub _nonIgnorAtLevel($$)
{
    my ($wt, $lv) = @_;
    return if ! defined $wt;

    my $nonzero = grep { $wt->[$_ - 1] != 0 } MinLevel .. $lv;
    return $nonzero ? TRUE : FALSE;
}
878
879##
880## bool _eqArray(
881## arrayref of arrayref[weights] source,
882## arrayref of arrayref[weights] substr,
883## int level)
884## * comparison of graphemes vs graphemes.
885## @$source >= @$substr must be true (check it before call this);
886##
# Compares the graphemes of $substr with the leading graphemes of $source.
# Two graphemes match when they hold the same number of weight arrays and
# all weights at indexes 0 .. $lev-1 are numerically equal.
# Returns 1 on a full match and nothing on the first difference.
sub _eqArray($$$)
{
    my ($source, $substr, $lev) = @_;

    foreach my $g (0 .. $#{$substr}) {
        my $src_ces = $source->[$g];
        my $sub_ces = $substr->[$g];

        # Do the $g'th graphemes have the same number of weight arrays?
        return if @$src_ces != @$sub_ces;

        foreach my $w (0 .. $#{$sub_ces}) {
            foreach my $v (0 .. $lev - 1) {
                return if $src_ces->[$w][$v] != $sub_ces->[$w][$v];
            }
        }
    }
    return 1;
}
905
906##
907## (int position, int length)
908## int position = index(string, substring, position, [undoc'ed global])
909##
910## With "global" (only for the list context),
911## returns list of arrayref[position, length].
912##
# Searches $str for the first part, at or after $pos, whose collation
# elements equal those of the substring at the collator's level.
# Returns (position, length) in list context or the position in scalar
# context; on failure an empty list or NOMATCHPOS.  With a true fourth
# argument every match is returned as a list of [position, length].
913sub index
914{
91ae00cb 915 my $self = shift;
916 my $str = shift;
917 my $len = length($str);
918 my $subE = $self->splitEnt(shift);
# the start position defaults to 0; a negative one is treated as 0
919 my $pos = @_ ? shift : 0;
920 $pos = 0 if $pos < 0;
921 my $grob = shift;
922
923 my $lev = $self->{level};
3756e7ca 924 my $v2i = $self->{UCA_Version} >= 9 &&
925 $self->{variable} ne 'non-ignorable';
91ae00cb 926
# A substring without any collation element matches with length 0 at the
# start position (limited to the string length).
927 if (! @$subE) {
4d36a948 928 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
929 return $grob
930 ? map([$_, 0], $temp..$len)
931 : wantarray ? ($temp,0) : $temp;
932 }
abd1ec54 933 $len < $pos
934 and return wantarray ? () : NOMATCHPOS;
# the true second argument asks splitEnt() for [JCPS, ini_pos, fin_pos]
91ae00cb 935 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54 936 @$strE
937 or return wantarray ? () : NOMATCHPOS;
938
4d36a948 939 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
940
# Build @subWt: one entry per grapheme of the substring, each an arrayref
# of weight arrays.  An element that is not variable and has a zero
# primary weight is appended to the preceding grapheme.
abd1ec54 941 my $last_is_variable;
942 for my $vwt (map $self->getWt($_), @$subE) {
943 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
944 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 945
3756e7ca 946 # "Ignorable (L1, L2) after Variable" since track. v. 9
947 if ($v2i) {
abd1ec54 948 if ($var) {
949 $last_is_variable = TRUE;
950 }
951 elsif (!$wt[0]) { # ignorable
4d36a948 952 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54 953 }
954 else {
955 $last_is_variable = FALSE;
4d36a948 956 }
957 }
958
abd1ec54 959 if (@subWt && !$var && !$wt[0]) {
960 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
4d36a948 961 } else {
abd1ec54 962 push @subWt, [ \@wt ];
4d36a948 963 }
964 }
965
# NOTE(review): $count is assigned here but not used again in this sub.
966 my $count = 0;
91ae00cb 967 my $end = @$strE - 1;
4d36a948 968
abd1ec54 969 $last_is_variable = FALSE; # reuse
# @strWt, @iniPos and @finPos are parallel: the weights of each fetched
# grapheme of the string with its start and end position.
4d36a948 970 for (my $i = 0; $i <= $end; ) { # no $i++
971 my $found_base = 0;
972
973 # fetch a grapheme
974 while ($i <= $end && $found_base == 0) {
abd1ec54 975 for my $vwt ($self->getWt($strE->[$i][0])) {
976 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
977 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 978
3756e7ca 979 # "Ignorable (L1, L2) after Variable" since track. v. 9
980 if ($v2i) {
abd1ec54 981 if ($var) {
982 $last_is_variable = TRUE;
983 }
984 elsif (!$wt[0]) { # ignorable
4d36a948 985 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54 986 }
987 else {
988 $last_is_variable = FALSE;
4d36a948 989 }
990 }
991
abd1ec54 992 if (@strWt && !$var && !$wt[0]) {
993 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 994 $finPos[-1] = $strE->[$i][2];
4d36a948 995 } elsif ($to_be_pushed) {
abd1ec54 996 push @strWt, [ \@wt ];
# When one element yields more than one grapheme, the boundary between
# them has no position: the later start and the earlier end are set to
# NOMATCHPOS.
91ae00cb 997 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 998 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 999 push @finPos, $strE->[$i][2];
4d36a948 1000 $found_base++;
1001 }
1002 # else ===> no-op
1003 }
1004 $i++;
1005 }
1006
1007 # try to match
# Compare once more graphemes are buffered than the substring has, or as
# many are buffered and the string is exhausted.
1008 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1009 if ($iniPos[0] != NOMATCHPOS &&
1010 $finPos[$#subWt] != NOMATCHPOS &&
1011 _eqArray(\@strWt, \@subWt, $lev)) {
# positions are relative to substr($str, $pos); add $pos back
1012 my $temp = $iniPos[0] + $pos;
1013
1014 if ($grob) {
1015 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
# drops all but the last matched grapheme; the shifts below drop that one
1016 splice @strWt, 0, $#subWt;
1017 splice @iniPos, 0, $#subWt;
1018 splice @finPos, 0, $#subWt;
1019 }
1020 else {
1021 return wantarray
1022 ? ($temp, $finPos[$#subWt] - $iniPos[0])
1023 : $temp;
1024 }
1025 }
1026 shift @strWt;
1027 shift @iniPos;
1028 shift @finPos;
1029 }
1030 }
1031
1032 return $grob
1033 ? @g_ret
1034 : wantarray ? () : NOMATCHPOS;
1035}
1036
1037##
1038## scalarref to matching part = match(string, substring)
1039##
# Finds the first part of the string (first argument) that matches the
# substring (second argument) according to index().
# Returns the matched text in list context and a reference to a copy of
# it in scalar context; returns nothing when there is no match.
sub match
{
    my $self = shift;

    my @found = $self->index($_[0], $_[1]);
    return unless @found;

    my ($pos, $len) = @found;
    # A copy is returned: an lvalue ref \substr should be avoided,
    # since its value is affected by modification of its referent.
    my $temp = substr($_[0], $pos, $len);
    return wantarray ? $temp : \$temp;
}
1053
1054##
1055## arrayref matching parts = gmatch(string, substring)
1056##
# Returns every part of the string that matches the substring, in the
# order in which index() reports the matches.
sub gmatch
{
    my ($self, $str, $sub) = @_;
    my @hits = $self->index($str, $sub, 0, 'g');
    return map { substr($str, $_->[0], $_->[1]) } @hits;
}
1065
1066##
1067## bool subst'ed = subst(string, substring, replace)
1068##
sub subst
{
    # Replaces the first part of $_[0] that matches the substring $_[1].
    # $_[2] is either the replacement string or a coderef which receives
    # the matching part and returns its replacement.  The caller's string
    # is modified in place through the @_ alias.
    # Returns TRUE when a replacement was made, FALSE otherwise.
    my $self = shift;
    my $callback = (ref($_[2]) eq 'CODE') ? $_[2] : FALSE;

    # In list context index() gives (position, length), or an empty list.
    my @hit = $self->index($_[0], $_[1]);
    return FALSE unless @hit;

    my ($start, $span) = @hit;

    my $replacement;
    if ($callback) {
        # Hand the callback a copy of the matching part, not an lvalue.
        my $found = substr($_[0], $start, $span);
        $replacement = $callback->($found);
    }
    else {
        $replacement = $_[2];
    }

    substr($_[0], $start, $span, $replacement);
    return TRUE;
}
1087
1088##
1089## int count = gsubst(string, substring, replace)
1090##
sub gsubst
{
    # Replaces every part of $_[0] that matches the substring $_[1].
    # $_[2] is either the replacement string or a coderef which receives
    # each matching part and returns its replacement.  The caller's string
    # is modified in place through the @_ alias.
    # Returns the number of replacements made.
    my $self = shift;
    my $callback = (ref($_[2]) eq 'CODE') ? $_[2] : FALSE;

    # Each element is a [position, length] pair.
    my @spans = $self->index($_[0], $_[1], 0, 'g');

    # Work backwards from the end of the string so that the positions
    # of the matches not yet processed stay valid.
    for my $span (reverse @spans) {
        my ($start, $width) = @$span;

        my $replacement;
        if ($callback) {
            # Hand the callback a copy of the matching part, not an lvalue.
            my $found = substr($_[0], $start, $width);
            $replacement = $callback->($found);
        }
        else {
            $replacement = $_[2];
        }

        substr($_[0], $start, $width, $replacement);
    }

    return scalar @spans;
}
1109
45394607 11101;
1111__END__
1112
1113=head1 NAME
1114
a7fbee98 1115Unicode::Collate - Unicode Collation Algorithm
45394607 1116
1117=head1 SYNOPSIS
1118
1119 use Unicode::Collate;
1120
1121 #construct
5398038e 1122 $Collator = Unicode::Collate->new(%tailoring);
45394607 1123
1124 #sort
5398038e 1125 @sorted = $Collator->sort(@not_sorted);
45394607 1126
1127 #compare
a7fbee98 1128 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1129
91ae00cb 1130 # If %tailoring is false (i.e. empty),
1131 # $Collator should do the default collation.
1132
45394607 1133=head1 DESCRIPTION
1134
3756e7ca 1135This module is an implementation of Unicode Technical Standard #10
1136(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
4d36a948 1137
45394607 1138=head2 Constructor and Tailoring
1139
d16e9e3d 1140The C<new> method returns a collator object.
1141
5398038e 1142 $Collator = Unicode::Collate->new(
0116f5dc 1143 UCA_Version => $UCA_Version,
91ae00cb 1144 alternate => $alternate, # deprecated: use of 'variable' is recommended.
45394607 1145 backwards => $levelNumber, # or \@levelNumbers
1146 entry => $element,
91ae00cb 1147 hangul_terminator => $term_primary_weight,
45394607 1148 ignoreName => qr/$ignoreName/,
1149 ignoreChar => qr/$ignoreChar/,
1150 katakana_before_hiragana => $bool,
1151 level => $collationLevel,
91ae00cb 1152 normalization => $normalization_form,
45394607 1153 overrideCJK => \&overrideCJK,
1154 overrideHangul => \&overrideHangul,
1155 preprocess => \&preprocess,
1156 rearrange => \@charList,
1157 table => $filename,
1158 undefName => qr/$undefName/,
1159 undefChar => qr/$undefChar/,
1160 upper_before_lower => $bool,
91ae00cb 1161 variable => $variable,
45394607 1162 );
45394607 1163
1164=over 4
1165
0116f5dc 1166=item UCA_Version
1167
3756e7ca 1168If the tracking version number of UCA is given,
1169behavior of that tracking version is emulated on collating.
0116f5dc 1170If omitted, the return value of C<UCA_Version()> is used.
3756e7ca 1171C<UCA_Version()> should return the latest tracking version supported.
1172
1173The supported tracking version: 8, 9, 11, or 14.
1174
1175 UCA tracking version Unicode version
1176 8 3.1
1177 9 3.1 with Corrigendum 3
1178 11 4.0
1179 14 4.1.0
1180
1181Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1182
1183=item alternate
0116f5dc 1184
3756e7ca 1185-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
0116f5dc 1186
3756e7ca 1187For backward compatibility, C<alternate> (old name) can be used
1188as an alias for C<variable>.
0116f5dc 1189
45394607 1190=item backwards
1191
4d36a948 1192-- see 3.1.2 French Accents, UTS #10.
45394607 1193
1194 backwards => $levelNumber or \@levelNumbers
1195
1196Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1197If omitted, forwards at all the levels.
1198
1199=item entry
1200
4d36a948 1201-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1202
91ae00cb 1203If the same character (or a sequence of characters) exists
1204in the collation element table through C<table>,
1205mapping to collation elements is overridden.
1206If it does not exist, the mapping is defined additionally.
45394607 1207
abd1ec54 1208 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
12090063 0068 ; [.0E6A.0020.0002.0063] # ch
12100043 0068 ; [.0E6A.0020.0007.0043] # Ch
12110043 0048 ; [.0E6A.0020.0008.0043] # CH
1212006C 006C ; [.0F4C.0020.0002.006C] # ll
1213004C 006C ; [.0F4C.0020.0007.004C] # Ll
1214004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8 121500F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1216006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
121700D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1218004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54 1219ENTRY
1220
1221 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
122200E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
122300C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1224ENTRY
45394607 1225
4d36a948 1226B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54 1227B<must> be a Unicode code point (defined as hexadecimal),
1228but not a native code point.
4d36a948 1229So C<0063> must always denote C<U+0063>,
1230but not a character of C<"\x63">.
1231
abd1ec54 1232Weighting may vary depending on collation element table.
1233So ensure the weights defined in C<entry> will be consistent with
1234those in the collation element table loaded via C<table>.
1235
1236In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1237and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1238(as a value between C<0E60> and C<0E6D>)
1239makes ordering as C<C E<lt> CH E<lt> D>.
1240Exactly speaking DUCET already has some characters between C<C> and C<D>:
1241C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1242C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1243and C<c-curl> (C<U+0255>) with C<0E69>.
1244Then primary weight C<0E6A> for C<CH> makes C<CH>
1245ordered between C<c-curl> and C<D>.
1246
91ae00cb 1247=item hangul_terminator
1248
3756e7ca 1249-- see 7.1.4 Trailing Weights, UTS #10.
91ae00cb 1250
1251If a true value is given (non-zero but should be positive),
1252it will be added as a terminator primary weight to the end of
1253every standard Hangul syllable. Secondary and any higher weights
1254for terminator are set to zero.
1255If the value is false or C<hangul_terminator> key does not exist,
1256insertion of terminator weights will not be performed.
1257
1258Boundaries of Hangul syllables are determined
1259according to conjoining Jamo behavior in F<the Unicode Standard>
1260and F<HangulSyllableType.txt>.
1261
1262B<Implementation Note:>
1263(1) For expansion mapping (Unicode character mapped
1264to a sequence of collation elements), a terminator will not be added
1265between collation elements, even if Hangul syllable boundary exists there.
1266Addition of terminator is restricted to the next position
1267to the last collation element.
1268
1269(2) Non-conjoining Hangul letters
1270(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1271automatically terminated with a terminator primary weight.
1272These characters may need terminator included in a collation element
1273table beforehand.
1274
45394607 1275=item ignoreChar
1276
3756e7ca 1277=item ignoreName
1278
1279-- see 3.2.2 Variable Weighting, UTS #10.
45394607 1280
caffd4cf 1281Makes the entry in the table completely ignorable;
1282i.e. as if the weights were zero at all levels.
45394607 1283
3756e7ca 1284Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1285will be ignored. Through C<ignoreName>, any character whose name
1286(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1287will be ignored.
1288
a7fbee98 1289E.g. when 'a' and 'e' are ignorable,
45394607 1290'element' is equal to 'lament' (or 'lmnt').
1291
3756e7ca 1292=item katakana_before_hiragana
1293
1294-- see 7.3.1 Tertiary Weight Table, UTS #10.
1295
1296By default, hiragana is before katakana.
1297If the parameter is made true, this is reversed.
1298
1299B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1300distinctions must occur in level 3, and their weights at level 3 must be
1301same as those mentioned in 7.3.1, UTS #10.
1302If you define your collation elements which violate this requirement,
1303this parameter does not work validly.
1304
45394607 1305=item level
1306
3756e7ca 1307-- see 4.3 Form Sort Key, UTS #10.
45394607 1308
1309Set the maximum level.
1310Any higher levels than the specified one are ignored.
1311
1312 Level 1: alphabetic ordering
1313 Level 2: diacritic ordering
1314 Level 3: case ordering
91ae00cb 1315 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607 1316
1317 ex.level => 2,
1318
a7fbee98 1319If omitted, the maximum is the 4th.
1320
45394607 1321=item normalization
1322
3756e7ca 1323-- see 4.1 Normalize, UTS #10.
45394607 1324
905aa9f0 1325If specified, strings are normalized before preparation of sort keys
45394607 1326(the normalization is executed after preprocess).
1327
1d2654e1 1328A form name C<Unicode::Normalize::normalize()> accepts will be applied
1329as C<$normalization_form>.
06c8fc8f 1330Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1 1331See C<Unicode::Normalize::normalize()> for detail.
1332If omitted, C<'NFD'> is used.
45394607 1333
91ae00cb 1334C<normalization> is performed after C<preprocess> (if defined).
45394607 1335
06c8fc8f 1336Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1337though they are not concerned with C<Unicode::Normalize::normalize()>.
1338
1339If C<undef> (not a string C<"undef">) is passed explicitly
1340as the value for this key,
45394607 1341no normalization is carried out (this may make tailoring easier
abd1ec54 1342if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1343only contiguous contractions are resolved;
1344e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1345C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f 1346In this point,
1347C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1348B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1349
1350In the case of C<(normalization =E<gt> "prenormalized")>,
1351no normalization is performed, but
1352non-contiguous contractions with combining characters are performed.
1353Therefore
1354C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1355B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1356If source strings are finely prenormalized,
1357C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1358
1359Except C<(normalization =E<gt> undef)>,
1360B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607 1361
1362=item overrideCJK
1363
4d36a948 1364-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1365
91ae00cb 1366By default, CJK Unified Ideographs are ordered in Unicode codepoint order
3756e7ca 1367but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1368C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1369are less than C<CJK Unified Ideographs Extension> (its range is
1370C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
91ae00cb 1371
1372Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overridden.
45394607 1373
a7fbee98 1374ex. CJK Unified Ideographs in the JIS code point order.
45394607 1375
1376 overrideCJK => sub {
a7fbee98 1377 my $u = shift; # get a Unicode codepoint
1378 my $b = pack('n', $u); # to UTF-16BE
1379 my $s = your_unicode_to_sjis_converter($b); # convert
1380 my $n = unpack('n', $s); # convert sjis to short
1381 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607 1382 },
1383
a7fbee98 1384ex. ignores all CJK Unified Ideographs.
1385
1386 overrideCJK => sub {()}, # CODEREF returning empty list
1387
1388 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1389 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1390
1391If C<undef> is passed explicitly as the value for this key,
1392weights for CJK Unified Ideographs are treated as undefined.
1393But assignment of weight for CJK Unified Ideographs
91ae00cb 1394in table or C<entry> is still valid.
a7fbee98 1395
1396=item overrideHangul
1397
4d36a948 1398-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1399
abd1ec54 1400By default, Hangul Syllables are decomposed into Hangul Jamo,
1401even if C<(normalization =E<gt> undef)>.
a7fbee98 1402But the mapping of Hangul Syllables may be overridden.
1403
3756e7ca 1404This parameter works like C<overrideCJK>, so see there for examples.
a7fbee98 1405
45394607 1406If you want to override the mapping of Hangul Syllables,
abd1ec54 1407NFD, NFKD, and FCD are not appropriate,
1408since they will decompose Hangul Syllables before overriding.
45394607 1409
a7fbee98 1410If C<undef> is passed explicitly as the value for this key,
1411weight for Hangul Syllables is treated as undefined
1412without decomposition into Hangul Jamo.
1413But definition of weight for Hangul Syllables
91ae00cb 1414in table or C<entry> is still valid.
a7fbee98 1415
45394607 1416=item preprocess
1417
4d36a948 1418-- see 5.1 Preprocessing, UTS #10.
45394607 1419
1420If specified, the coderef is used to preprocess
1421before the formation of sort keys.
1422
a7fbee98 1423ex. dropping English articles, such as "a" or "the".
45394607 1424Then, "the pen" is before "a pencil".
1425
1426 preprocess => sub {
1427 my $str = shift;
a7fbee98 1428 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1429 return $str;
45394607 1430 },
1431
91ae00cb 1432C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1433
45394607 1434=item rearrange
1435
4d36a948 1436-- see 3.1.3 Rearrangement, UTS #10.
45394607 1437
1438Characters that are not coded in logical order and to be rearranged.
3756e7ca 1439If C<UCA_Version> is equal to or less than 11, default is:
45394607 1440
1441 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1442
3756e7ca 1443If you want to disallow any rearrangement, pass C<undef> or C<[]>
1444(a reference to empty list) as the value for this key.
1445
1446If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
a7fbee98 1447
0116f5dc 1448B<According to the version 9 of UCA, this parameter shall not be used;
1449but it is not warned at present.>
1450
45394607 1451=item table
1452
4d36a948 1453-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1454
91ae00cb 1455You can use another collation element table if desired.
45394607 1456
e7f779c8 1457The table file should be located in the F<Unicode/Collate> directory
3756e7ca 1458on C<@INC>. Say, if the filename is F<Foo.txt>,
1459the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
e7f779c8 1460
1461By default, F<allkeys.txt> (as the filename of DUCET) is used.
45394607 1462
a7fbee98 1463If C<undef> is passed explicitly as the value for this key,
91ae00cb 1464no file is read (but you can define collation elements via C<entry>).
a7fbee98 1465
1466A typical way to define a collation element table
1467without any file of table:
1468
1469 $onlyABC = Unicode::Collate->new(
1470 table => undef,
1471 entry => << 'ENTRIES',
14720061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
14730041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
14740062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
14750042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
14760063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
14770043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1478ENTRIES
1479 );
905aa9f0 1480
3756e7ca 1481If C<ignoreName> or C<undefName> is used, character names should be
1482specified as a comment (following C<#>) on each line.
45394607 1483
1484=item undefChar
1485
3756e7ca 1486=item undefName
1487
4d36a948 1488-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1489
1490Undefines the collation element as if it were unassigned in the table.
1491This reduces the size of the table.
1492If an unassigned character appears in the string to be collated,
1493the sort key is made from its codepoint
1494as a single-character collation element,
1495as it is greater than any other assigned collation elements
1496(in the codepoint order among the unassigned characters).
1497But, it'd be better to ignore characters
1498unfamiliar to you and maybe never used.
1499
3756e7ca 1500Through C<undefChar>, any character matching C<qr/$undefChar/>
1501will be undefined. Through C<undefName>, any character whose name
1502(given in the C<table> file as a comment) matches C<qr/$undefName/>
1503will be undefined.
1504
e7f779c8 1505ex. Collation weights for beyond-BMP characters are not stored in object:
1506
1507 undefChar => qr/[^\0-\x{fffd}]/,
1508
45394607 1509=item upper_before_lower
1510
3756e7ca 1511-- see 6.6 Case Comparisons, UTS #10.
45394607 1512
3756e7ca 1513By default, lowercase is before uppercase.
1514If the parameter is made true, this is reversed.
45394607 1515
3756e7ca 1516B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1517distinctions must occur in level 3, and their weights at level 3 must be
1518same as those mentioned in 7.3.1, UTS #10.
1519If you define your collation elements which differ from this requirement,
1520this parameter doesn't work validly.
45394607 1521
91ae00cb 1522=item variable
1523
91ae00cb 1524-- see 3.2.2 Variable Weighting, UTS #10.
1525
91ae00cb 1526This key selects the weighting method for variable collation elements,
1527which are marked with an ASTERISK in the table
1528(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1529
1530 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1531
1532These names are case-insensitive.
1533By default (if specification is omitted), 'shifted' is adopted.
1534
1535 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1536 considered at the 4th level.
1537
abd1ec54 1538 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb 1539
1540 'Shifted' Variable elements are made ignorable at levels 1 through 3;
1541 their level 4 weight is replaced by the old level 1 weight.
1542 Level 4 weight for Non-Variable elements is 0xFFFF.
1543
1544 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1545 are trimmed.
1546
45394607 1547=back
1548
3164dd77 1549=head2 Methods for Collation
45394607 1550
1551=over 4
1552
5398038e 1553=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607 1554
1555Sorts a list of strings.
1556
5398038e 1557=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607 1558
1559Returns 1 (when C<$a> is greater than C<$b>)
1560or 0 (when C<$a> is equal to C<$b>)
1561or -1 (when C<$a> is less than C<$b>).
1562
5398038e 1563=item C<$result = $Collator-E<gt>eq($a, $b)>
1564
1565=item C<$result = $Collator-E<gt>ne($a, $b)>
1566
1567=item C<$result = $Collator-E<gt>lt($a, $b)>
1568
1569=item C<$result = $Collator-E<gt>le($a, $b)>
1570
1571=item C<$result = $Collator-E<gt>gt($a, $b)>
1572
1573=item C<$result = $Collator-E<gt>ge($a, $b)>
1574
a7fbee98 1575They work like the operators of the same name.
5398038e 1576
1577 eq : whether $a is equal to $b.
1578 ne : whether $a is not equal to $b.
1579 lt : whether $a is less than $b.
1580 le : whether $a is less than $b or equal to $b.
1581 gt : whether $a is greater than $b.
1582 ge : whether $a is greater than $b or equal to $b.
1583
1584=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1585
3756e7ca 1586-- see 4.3 Form Sort Key, UTS #10.
45394607 1587
1588Returns a sort key.
1589
1590You compare the sort keys using a binary comparison
1591and get the result of the comparison of the strings using UCA.
1592
5398038e 1593 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607 1594
1595 is equivalent to
1596
5398038e 1597 $Collator->cmp($a, $b)
45394607 1598
a7fbee98 1599=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1600
3756e7ca 1601Converts a sorting key into its representation form.
1602If C<UCA_Version> is 8, the output is slightly different.
1603
a7fbee98 1604 use Unicode::Collate;
1605 my $c = Unicode::Collate->new();
1606 print $c->viewSortKey("Perl"),"\n";
1607
0116f5dc 1608 # output:
1609 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1610 # Level 1 Level 2 Level 3 Level 4
1611
4d36a948 1612=back
1613
1614=head2 Methods for Searching
d16e9e3d 1615
3756e7ca 1616B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
4d36a948 1617for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1618C<subst>, C<gsubst>) croaks,
1619as the position and the length might differ
1620from those on the specified string.
3756e7ca 1621(And C<rearrange> and C<hangul_terminator> parameters are neglected.)
d16e9e3d 1622
4d36a948 1623The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1624like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1625but they do not interpret any pattern; they match only a literal substring.
1626
1627=over 4
1628
1629=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1630
1631=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d 1632
1633If C<$substring> matches a part of C<$string>, returns
1634the position of the first occurrence of the matching part in scalar context;
1635in list context, returns a two-element list of
1636the position and the length of the matching part.
1637
d16e9e3d 1638If C<$substring> does not match any part of C<$string>,
1639returns C<-1> in scalar context and
1640an empty list in list context.
1641
1642e.g. you say
1643
5398038e 1644 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1645 # (normalization => undef) is REQUIRED.
1646 my $str = "Ich muß studieren Perl.";
1647 my $sub = "MÜSS";
d16e9e3d 1648 my $match;
a7fbee98 1649 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1650 $match = substr($str, $pos, $len);
d16e9e3d 1651 }
1652
4d36a948 1653and get C<"muß"> in C<$match> since C<"muß">
3756e7ca 1654is primary equal to C<"MÜSS">.
4d36a948 1655
1656=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1657
1658=item C<($match) = $Collator-E<gt>match($string, $substring)>
1659
1660If C<$substring> matches a part of C<$string>, in scalar context, returns
1661B<a reference to> the first occurrence of the matching part
1662(C<$match_ref> is always true if matches,
1663since every reference is B<true>);
1664in list context, returns the first occurrence of the matching part.
1665
1666If C<$substring> does not match any part of C<$string>,
1667returns C<undef> in scalar context and
1668an empty list in list context.
1669
1670e.g.
1671
1672 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1673 print "matches [$$match_ref].\n";
1674 } else {
1675 print "doesn't match.\n";
1676 }
1677
3756e7ca 1678 or
4d36a948 1679
1680 if (($match) = $Collator->match($str, $sub)) { # list context
1681 print "matches [$match].\n";
1682 } else {
1683 print "doesn't match.\n";
1684 }
1685
1686=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1687
1688If C<$substring> matches a part of C<$string>, returns
1689all the matching parts (or matching count in scalar context).
1690
1691If C<$substring> does not match any part of C<$string>,
1692returns an empty list.
1693
1694=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1695
1696If C<$substring> matches a part of C<$string>,
1697the first occurrence of the matching part is replaced by C<$replacement>
1698(C<$string> is modified) and C<$count> (always equal to C<1>) is returned.
1699
1700C<$replacement> can be a C<CODEREF>,
1701taking the matching part as an argument,
1702and returning a string to replace the matching part
1703(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1704
1705=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1706
1707If C<$substring> matches a part of C<$string>,
1708all the occurrences of the matching part are replaced by C<$replacement>
1709(C<$string> is modified) and C<$count> is returned.
1710
1711C<$replacement> can be a C<CODEREF>,
1712taking the matching part as an argument,
1713and returning a string to replace the matching part
1714(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1715
1716e.g.
1717
1718 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1719 # (normalization => undef) is REQUIRED.
3756e7ca 1720 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
4d36a948 1721 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1722
3756e7ca 1723 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
4d36a948 1724 # i.e., all the camels are made bold-faced.
d16e9e3d 1725
45394607 1726=back
1727
3164dd77 1728=head2 Other Methods
1729
1730=over 4
1731
0116f5dc 1732=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1733
1734Changes the values of the specified keys and returns the changed part.
1735
1736 $Collator = Unicode::Collate->new(level => 4);
1737
1738 $Collator->eq("perl", "PERL"); # false
1739
1740 %old = $Collator->change(level => 2); # returns (level => 4).
1741
1742 $Collator->eq("perl", "PERL"); # true
1743
1744 $Collator->change(%old); # returns (level => 2).
1745
1746 $Collator->eq("perl", "PERL"); # false
1747
1748Not all C<(key,value)>s are allowed to be changed.
1749See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1750
1751In the scalar context, returns the modified collator
1752(but it is B<not> a clone from the original).
1753
1754 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1755
1756 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1757
1758 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1759
91ae00cb 1760=item C<$version = $Collator-E<gt>version()>
3164dd77 1761
91ae00cb 1762Returns the version number (a string) of the Unicode Standard
1763which the C<table> file used by the collator object is based on.
1764If the table does not include a version line (starting with C<@version>),
1765returns C<"unknown">.
1766
1767=item C<UCA_Version()>
3164dd77 1768
91ae00cb 1769Returns the tracking version number of UTS #10 this module consults.
3164dd77 1770
91ae00cb 1771=item C<Base_Unicode_Version()>
1772
1773Returns the version number of UTS #10 this module consults.
3164dd77 1774
1775=back
1776
3756e7ca 1777=head1 EXPORT
1778
1779No method will be exported.
45394607 1780
3756e7ca 1781=head1 CAVEATS
1782
1783=over 4
45394607 1784
3756e7ca 1785=item Normalization
45394607 1786
3756e7ca 1787Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1788module (see L<Unicode::Normalize>).
45394607 1789
5398038e 1790If you do not need it (say, when you do not need to
45394607 1791handle any combining characters),
1792assign C<normalization =E<gt> undef> explicitly.
1793
4d36a948 1794-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1795
3756e7ca 1796=item Conformance Test
0116f5dc 1797
10d7ec48 1798The Conformance Test for the UCA is available
1799under L<http://www.unicode.org/Public/UCA/>.
0116f5dc 1800
1801For F<CollationTest_SHIFTED.txt>,
1802a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1803for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 1804C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 1805
4d36a948 1806B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1807
3756e7ca 1808=back
1809
45394607 1810=head1 AUTHOR
1811
10d7ec48 1812SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
45394607 1813
3756e7ca 1814Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1815
3756e7ca 1816This module is free software; you can redistribute it
1817and/or modify it under the same terms as Perl itself.
45394607 1818
1819=head1 SEE ALSO
1820
1821=over 4
1822
91ae00cb 1823=item Unicode Collation Algorithm - UTS #10
1824
1825L<http://www.unicode.org/reports/tr10/>
1826
1827=item The Default Unicode Collation Element Table (DUCET)
1828
10d7ec48 1829L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 1830
91ae00cb 1831=item The conformance test for the UCA
45394607 1832
10d7ec48 1833L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 1834
10d7ec48 1835L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 1836
91ae00cb 1837=item Hangul Syllable Type
0116f5dc 1838
10d7ec48 1839L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 1840
91ae00cb 1841=item Unicode Normalization Forms - UAX #15
a7fbee98 1842
91ae00cb 1843L<http://www.unicode.org/reports/tr15/>
a7fbee98 1844
45394607 1845=back
1846
1847=cut