1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
19 our $VERSION = '0.31';
20 our $PACKAGE = __PACKAGE__;
22 our @ISA = qw(Exporter);
24 our %EXPORT_TAGS = ();
28 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
29 our $KeyFile = "allkeys.txt";
32 use constant TRUE => 1;
33 use constant FALSE => "";
34 use constant NOMATCHPOS => -1;
36 # A coderef to get combining class imported from Unicode::Normalize
37 # (i.e. \&Unicode::Normalize::getCombinClass).
38 # This is also used as a HAS_UNICODE_NORMALIZE flag.
39 our $CVgetCombinClass;
42 use constant MinLevel => 1;
43 use constant MaxLevel => 4;
45 # Minimum weights at level 2 and 3, respectively
46 use constant Min2Wt => 0x20;
47 use constant Min3Wt => 0x02;
49 # Shifted weight at 4th level
50 use constant Shift4Wt => 0xFFFF;
52 # Variable weight at 1st level.
53 # This is a negative value but should be regarded as zero on collation.
54 # This is for distinction of variable chars from level 3 ignorable chars.
55 use constant Var1Wt => -1;
58 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
59 # PROBLEM: The Default Unicode Collation Element Table
60 # has weights over 0xFFFF at the 4th level.
61 # The tie-breaking in the variable weights
62 # other than "shift" (as well as "shift-trimmed") is unreliable.
63 use constant VCE_TEMPLATE => 'Cn4';
65 # A sort key: 16-bit weights
66 # See also the PROBLEM on VCE_TEMPLATE above.
67 use constant KEY_TEMPLATE => 'n*';
69 # Level separator in a sort key:
70 # i.e. pack(KEY_TEMPLATE, 0)
71 use constant LEVEL_SEP => "\0\0";
73 # As Unicode code point separator for hash keys.
74 # A joined code point string (denoted by JCPS below)
75 # like "65;768" is used for internal processing
76 # instead of Perl's Unicode string like "\x41\x{300}",
77 # as the native code point is different from the Unicode code point
79 # This character must not be included in any stringified
80 # representation of an integer.
81 use constant CODE_SEP => ';';
83 # boolean values of variable weights
84 use constant NON_VAR => 0; # Non-Variable character
85 use constant VAR => 1; # Variable character
87 # specific code points
88 use constant Hangul_LBase => 0x1100;
89 use constant Hangul_LIni => 0x1100;
90 use constant Hangul_LFin => 0x1159;
91 use constant Hangul_LFill => 0x115F;
92 use constant Hangul_VBase => 0x1161;
93 use constant Hangul_VIni => 0x1160;
94 use constant Hangul_VFin => 0x11A2;
95 use constant Hangul_TBase => 0x11A7;
96 use constant Hangul_TIni => 0x11A8;
97 use constant Hangul_TFin => 0x11F9;
98 use constant Hangul_TCount => 28;
99 use constant Hangul_NCount => 588;
100 use constant Hangul_SBase => 0xAC00;
101 use constant Hangul_SIni => 0xAC00;
102 use constant Hangul_SFin => 0xD7A3;
103 use constant CJK_UidIni => 0x4E00;
104 use constant CJK_UidFin => 0x9FA5;
105 use constant CJK_ExtAIni => 0x3400;
106 use constant CJK_ExtAFin => 0x4DB5;
107 use constant CJK_ExtBIni => 0x20000;
108 use constant CJK_ExtBFin => 0x2A6D6;
109 use constant BMP_Max => 0xFFFF;
111 # Logical_Order_Exception in PropList.txt
112 # TODO: synchronization with change of PropList.txt.
113 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
115 sub UCA_Version { "11" }
117 sub Base_Unicode_Version { "4.0" }
122 return pack('U*', @_);
126 return unpack('U*', pack('U*').shift);
133 blanked non-ignorable shifted shift-trimmed
134 / } = (); # keys lowercased
137 alternate backwards level normalization rearrange
138 katakana_before_hiragana upper_before_lower
139 overrideHangul overrideCJK preprocess UCA_Version
140 hangul_terminator variable
144 entry mapping table maxlength
145 ignoreChar ignoreName undefChar undefName variableTable
146 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
147 derivCode normCode rearrangeHash L3_ignorable
150 # The hash key 'ignored' is deleted at v 0.21.
151 # The hash key 'isShift' is deleted at v 0.23.
152 # The hash key 'combining' is deleted at v 0.24.
153 # The hash key 'entries' is deleted at v 0.30.
157 return $self->{versionTable} || 'unknown';
160 my (%ChangeOK, %ChangeNG);
161 @ChangeOK{ @ChangeOK } = ();
162 @ChangeNG{ @ChangeNG } = ();
168 if (exists $hash{variable} && exists $hash{alternate}) {
169 delete $hash{alternate};
171 elsif (!exists $hash{variable} && exists $hash{alternate}) {
172 $hash{variable} = $hash{alternate};
174 foreach my $k (keys %hash) {
175 if (exists $ChangeOK{$k}) {
176 $old{$k} = $self->{$k};
177 $self->{$k} = $hash{$k};
179 elsif (exists $ChangeNG{$k}) {
180 croak "change of $k via change() is not allowed!";
184 $self->checkCollator;
185 return wantarray ? %old : $self;
191 croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
192 $level, $key, MinLevel if MinLevel > $level;
193 croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
194 $level, $key, MaxLevel if MaxLevel < $level;
200 11 => \&_derivCE_9, # 11 == 9
205 _checkLevel($self->{level}, "level");
207 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
208 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
210 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
211 $self->{alternateTable} || 'shifted';
212 $self->{variable} = $self->{alternate} = lc($self->{variable});
213 exists $VariableOK{ $self->{variable} }
214 or croak "$PACKAGE unknown variable tag name: $self->{variable}";
216 if (! defined $self->{backwards}) {
217 $self->{backwardsFlag} = 0;
219 elsif (! ref $self->{backwards}) {
220 _checkLevel($self->{backwards}, "backwards");
221 $self->{backwardsFlag} = 1 << $self->{backwards};
225 $self->{backwardsFlag} = 0;
226 for my $b (@{ $self->{backwards} }) {
227 _checkLevel($b, "backwards");
230 for my $v (sort keys %level) {
231 $self->{backwardsFlag} += 1 << $v;
235 defined $self->{rearrange} or $self->{rearrange} = [];
236 ref $self->{rearrange}
237 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
239 # keys of $self->{rearrangeHash} are $self->{rearrange}.
240 $self->{rearrangeHash} = undef;
242 if (@{ $self->{rearrange} }) {
243 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
246 $self->{normCode} = undef;
248 if (defined $self->{normalization}) {
249 eval { require Unicode::Normalize };
250 $@ and croak "Unicode::Normalize is required to normalize strings";
252 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
254 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
255 $self->{normCode} = \&Unicode::Normalize::NFD;
257 elsif ($self->{normalization} ne 'prenormalized') {
258 my $norm = $self->{normalization};
259 $self->{normCode} = sub {
260 Unicode::Normalize::normalize($norm, shift);
262 eval { $self->{normCode}->("") }; # try
263 $@ and croak "$PACKAGE unknown normalization form name: $norm";
272 my $self = bless { @_ }, $class;
274 # If undef is passed explicitly, no file is read.
275 $self->{table} = $KeyFile if ! exists $self->{table};
276 $self->read_table if defined $self->{table};
278 if ($self->{entry}) {
279 $self->parseEntry($_) foreach split /\n/, $self->{entry};
282 $self->{level} ||= MaxLevel;
283 $self->{UCA_Version} ||= UCA_Version();
285 $self->{overrideHangul} = ''
286 if ! exists $self->{overrideHangul};
287 $self->{overrideCJK} = ''
288 if ! exists $self->{overrideCJK};
289 $self->{normalization} = 'NFD'
290 if ! exists $self->{normalization};
291 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
292 if ! exists $self->{rearrange};
293 $self->{backwards} = $self->{backwardsTable}
294 if ! exists $self->{backwards};
296 $self->checkCollator;
303 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
305 my $filepath = File::Spec->catfile($Path, $file);
306 open my $fk, "<$filepath"
307 or croak "File does not exist at $filepath";
312 if (/^\s*\@version\s*(\S*)/) {
313 $self->{versionTable} ||= $1;
315 elsif (/^\s*\@variable\s+(\S*)/) { # since UTS #10-9
316 $self->{variableTable} ||= $1;
318 elsif (/^\s*\@alternate\s+(\S*)/) { # till UTS #10-8
319 $self->{alternateTable} ||= $1;
321 elsif (/^\s*\@backwards\s+(\S*)/) {
322 push @{ $self->{backwardsTable} }, $1;
324 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
325 push @{ $self->{forwardsTable} }, $1;
327 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
328 push @{ $self->{rearrangeTable} }, _getHexArray($1);
332 $self->parseEntry($_);
339 ## get $line, parse it, and write an entry in $self
345 my($name, $entry, @uv, @key);
347 return if $line !~ /^\s*[0-9A-Fa-f]/;
349 # removes comment and gets name
351 if $line =~ s/[#%]\s*(.*)//;
352 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
355 my($e, $k) = split /;/, $line;
356 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
359 @uv = _getHexArray($e);
362 $entry = join(CODE_SEP, @uv); # in JCPS
364 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
365 my $ele = pack_U(@uv);
367 # regarded as if it were not entried in the table
369 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
371 # replaced as completely ignorable
372 $k = '[.0000.0000.0000.0000]'
373 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
376 # replaced as completely ignorable
377 $k = '[.0000.0000.0000.0000]'
378 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
380 my $is_L3_ignorable = TRUE;
382 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
383 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
384 my @wt = _getHexArray($arr);
385 push @key, pack(VCE_TEMPLATE, $var, @wt);
386 $is_L3_ignorable = FALSE
387 if $wt[0] + $wt[1] + $wt[2] != 0;
388 # if $arr !~ /[1-9A-Fa-f]/; NG
389 # Conformance Test shows L3-ignorable is completely ignorable.
390 # For expansion, an entry $is_L3_ignorable
391 # if and only if "all" CEs are [.0000.0000.0000].
394 $self->{mapping}{$entry} = \@key;
397 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
398 and $self->{maxlength}{$uv[0]} = @uv;
402 ? ($self->{L3_ignorable}{$uv[0]} = TRUE)
403 : ($self->{L3_ignorable}{$uv[0]} and
404 $self->{L3_ignorable}{$uv[0]} = FALSE); # &&= stores key.
410 ## arrayref[weights] = varCE(VCE)
415 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
417 $self->{variable} eq 'blanked' ?
418 $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
419 $self->{variable} eq 'non-ignorable' ?
421 $self->{variable} eq 'shifted' ?
422 $var ? [Var1Wt, 0, 0, $wt[0] ]
423 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
424 $self->{variable} eq 'shift-trimmed' ?
425 $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
426 croak "$PACKAGE unknown variable name: $self->{variable}";
432 $self->visualizeSortKey($self->getSortKey(@_));
438 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
440 if ($self->{UCA_Version} <= 8) {
441 $view =~ s/ ?0000 ?/|/g;
443 $view =~ s/\b0000\b/|/g;
450 ## arrayref of JCPS = splitEnt(string to be collated)
451 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
458 my $code = $self->{preprocess};
459 my $norm = $self->{normCode};
460 my $map = $self->{mapping};
461 my $max = $self->{maxlength};
462 my $reH = $self->{rearrangeHash};
463 my $ign = $self->{L3_ignorable};
464 my $ver9 = $self->{UCA_Version} >= 9;
469 $code and croak "Preprocess breaks character positions. "
470 . "Don't use with index(), match(), etc.";
471 $norm and croak "Normalization breaks character positions. "
472 . "Don't use with index(), match(), etc.";
477 $str = &$code($str) if ref $code;
478 $str = &$norm($str) if ref $norm;
481 # get array of Unicode code point of string.
482 my @src = unpack_U($str);
485 # Character positions are not kept if rearranged,
486 # then neglected if $wLen is true.
487 if ($reH && ! $wLen) {
488 for (my $i = 0; $i < @src; $i++) {
489 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
490 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
497 # To remove a character marked as a completely ignorable.
498 for (my $i = 0; $i < @src; $i++) {
499 $src[$i] = undef if $ign->{ $src[$i] };
503 for (my $i = 0; $i < @src; $i++) {
504 next if _isIllegal($src[$i]);
509 if ($max->{$jcps}) { # contract
510 my $temp_jcps = $jcps;
512 my $maxLen = $max->{$jcps};
514 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
515 next if ! defined $src[$p];
516 $temp_jcps .= CODE_SEP . $src[$p];
518 if ($map->{$temp_jcps}) {
524 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
525 # This process requires Unicode::Normalize.
526 # If "normalization" is undef, here should be skipped *always*
527 # (in spite of bool value of $CVgetCombinClass),
528 # since canonical ordering cannot be expected.
529 # Blocked combining character should not be contracted.
531 if ($self->{normalization})
532 # $self->{normCode} is false in the case of "prenormalized".
537 for (my $p = $i + 1; $p < @src; $p++) {
538 next if ! defined $src[$p];
539 $curCC = $CVgetCombinClass->($src[$p]);
541 my $tail = CODE_SEP . $src[$p];
542 if ($preCC != $curCC && $map->{$jcps.$tail}) {
553 for (my $p = $i + 1; $p < @src; $p++) {
554 last if defined $src[$p];
559 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
566 ## list of arrayrefs of weights = getWt(JCPS)
572 my $map = $self->{mapping};
573 my $der = $self->{derivCode};
575 return if !defined $u;
576 return map($self->varCE($_), @{ $map->{$u} })
579 # JCPS must not be a contraction, then it's a code point.
580 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
581 my $hang = $self->{overrideHangul};
584 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
586 elsif (!defined $hang) {
587 @hangulCE = $der->($u);
590 my $max = $self->{maxlength};
591 my @decH = _decompHangul($u);
594 my $contract = join(CODE_SEP, @decH);
595 @decH = ($contract) if $map->{$contract};
596 } else { # must be <@decH == 3>
597 if ($max->{$decH[0]}) {
598 my $contract = join(CODE_SEP, @decH);
599 if ($map->{$contract}) {
602 $contract = join(CODE_SEP, @decH[0,1]);
603 $map->{$contract} and @decH = ($contract, $decH[2]);
605 # even if V's ignorable, LT contraction is not supported.
606 # If such a situatution were required, NFD should be used.
608 if (@decH == 3 && $max->{$decH[1]}) {
609 my $contract = join(CODE_SEP, @decH[1,2]);
610 $map->{$contract} and @decH = ($decH[0], $contract);
615 $map->{$_} ? @{ $map->{$_} } : $der->($_);
618 return map $self->varCE($_), @hangulCE;
620 elsif (CJK_UidIni <= $u && $u <= CJK_UidFin ||
621 CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
622 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
623 my $cjk = $self->{overrideCJK};
624 return map $self->varCE($_),
626 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
627 : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
628 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
632 return map $self->varCE($_), $der->($u);
638 ## string sortkey = getSortKey(string arg)
643 my $lev = $self->{level};
644 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
645 my $ver9 = $self->{UCA_Version} >= 9;
646 my $v2i = $self->{variable} ne 'non-ignorable';
649 my (@wts, @buf, $last_is_variable);
651 if ($self->{hangul_terminator}) {
653 foreach my $jcps (@$rEnt) {
654 # weird things like VL, TL-contraction are not considered!
656 foreach my $u (split /;/, $jcps) {
657 $curHST .= getHST($u);
659 if ($preHST && !$curHST || # hangul before non-hangul
660 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
661 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
662 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
664 push @wts, $self->varCE_HangulTerm;
668 push @wts, $self->getWt($jcps);
670 $preHST # end at hangul
671 and push @wts, $self->varCE_HangulTerm;
674 foreach my $jcps (@$rEnt) {
675 push @wts, $self->getWt($jcps);
679 foreach my $wt (@wts) {
681 if ($wt->[0] == 0) { # ignorable
682 next if $last_is_variable;
684 $last_is_variable = ($wt->[0] == Var1Wt);
691 my @ret = ([],[],[],[]);
692 foreach my $v (0..$lev-1) {
693 foreach my $b (@buf) {
694 push @{ $ret[$v] }, $b->[$v]
699 # modification of tertiary weights
700 if ($self->{upper_before_lower}) {
701 foreach (@{ $ret[2] }) {
702 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
703 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
704 elsif ($_ == 0x1C) { $_ += 1 } # square upper
705 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
708 if ($self->{katakana_before_hiragana}) {
709 foreach (@{ $ret[2] }) {
710 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
711 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
715 if ($self->{backwardsFlag}) {
716 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
717 if ($self->{backwardsFlag} & (1 << $v)) {
718 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
723 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
728 ## int compare = cmp(string a, string b)
730 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
731 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
732 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
733 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
734 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
735 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
736 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
739 ## list[strings] sorted = sort(list[strings] arg)
745 sort{ $a->[0] cmp $b->[0] }
746 map [ $obj->getSortKey($_), $_ ], @_;
753 (CJK_UidIni <= $u && $u <= CJK_UidFin)
755 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
756 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
760 my $aaaa = $base + ($u >> 15);
761 my $bbbb = ($u & 0x7FFF) | 0x8000;
763 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
764 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
769 my $aaaa = 0xFF80 + ($code >> 15);
770 my $bbbb = ($code & 0x7FFF) | 0x8000;
772 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
773 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
777 sub varCE_HangulTerm {
779 return $self->varCE(pack(VCE_TEMPLATE,
780 NON_VAR, $self->{hangul_terminator}, 0,0,0));
785 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
787 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
790 # $code *must* be in Hangul syllable.
791 # Check it before you enter here.
795 my $SIndex = $code - Hangul_SBase;
796 my $LIndex = int( $SIndex / Hangul_NCount);
797 my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
798 my $TIndex = $SIndex % Hangul_TCount;
800 Hangul_LBase + $LIndex,
801 Hangul_VBase + $VIndex,
802 $TIndex ? (Hangul_TBase + $TIndex) : (),
808 return ! defined $code # removed
809 || ($code < 0 || 0x10FFFF < $code) # out of range
810 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
811 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
812 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
816 # Hangul Syllable Type
820 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
821 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
822 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
823 Hangul_SIni <= $u && $u <= Hangul_SFin ?
824 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
829 ## bool _nonIgnorAtLevel(arrayref weights, int level)
831 sub _nonIgnorAtLevel($$)
834 return if ! defined $wt;
836 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
841 ## arrayref of arrayref[weights] source,
842 ## arrayref of arrayref[weights] substr,
844 ## * comparison of graphemes vs graphemes.
845 ## @$source >= @$substr must be true (check it before call this);
853 for my $g (0..@$substr-1){
854 # Do the $g'th graphemes have the same number of AV weigths?
855 return if @{ $source->[$g] } != @{ $substr->[$g] };
857 for my $w (0..@{ $substr->[$g] }-1) {
858 for my $v (0..$lev-1) {
859 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
867 ## (int position, int length)
868 ## int position = index(string, substring, position, [undoc'ed grobal])
870 ## With "grobal" (only for the list context),
871 ## returns list of arrayref[position, length].
877 my $len = length($str);
878 my $subE = $self->splitEnt(shift);
879 my $pos = @_ ? shift : 0;
880 $pos = 0 if $pos < 0;
883 my $lev = $self->{level};
884 my $ver9 = $self->{UCA_Version} >= 9;
885 my $v2i = $self->{variable} ne 'non-ignorable';
888 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
890 ? map([$_, 0], $temp..$len)
891 : wantarray ? ($temp,0) : $temp;
894 return wantarray ? () : NOMATCHPOS;
896 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
898 return wantarray ? () : NOMATCHPOS;
900 my $last_is_variable;
901 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
903 $last_is_variable = FALSE;
904 for my $wt (map $self->getWt($_), @$subE) {
905 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
909 $to_be_pushed = FALSE if $last_is_variable;
911 $last_is_variable = ($wt->[0] == Var1Wt);
915 if (@subWt && $wt->[0] == 0) {
916 push @{ $subWt[-1] }, $wt if $to_be_pushed;
918 $wt->[0] = 0 if $wt->[0] == Var1Wt;
919 push @subWt, [ $wt ];
924 my $end = @$strE - 1;
926 $last_is_variable = FALSE;
928 for (my $i = 0; $i <= $end; ) { # no $i++
932 while ($i <= $end && $found_base == 0) {
933 for my $wt ($self->getWt($strE->[$i][0])) {
934 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
938 $to_be_pushed = FALSE if $last_is_variable;
940 $last_is_variable = ($wt->[0] == Var1Wt);
944 if (@strWt && $wt->[0] == 0) {
945 push @{ $strWt[-1] }, $wt if $to_be_pushed;
946 $finPos[-1] = $strE->[$i][2];
947 } elsif ($to_be_pushed) {
948 $wt->[0] = 0 if $wt->[0] == Var1Wt;
949 push @strWt, [ $wt ];
950 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
951 $finPos[-1] = NOMATCHPOS if $found_base;
952 push @finPos, $strE->[$i][2];
961 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
962 if ($iniPos[0] != NOMATCHPOS &&
963 $finPos[$#subWt] != NOMATCHPOS &&
964 _eqArray(\@strWt, \@subWt, $lev)) {
965 my $temp = $iniPos[0] + $pos;
968 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
969 splice @strWt, 0, $#subWt;
970 splice @iniPos, 0, $#subWt;
971 splice @finPos, 0, $#subWt;
975 ? ($temp, $finPos[$#subWt] - $iniPos[0])
987 : wantarray ? () : NOMATCHPOS;
991 ## scalarref to matching part = match(string, substring)
996 if (my($pos,$len) = $self->index($_[0], $_[1])) {
997 my $temp = substr($_[0], $pos, $len);
998 return wantarray ? $temp : \$temp;
999 # An lvalue ref \substr should be avoided,
1000 # since its value is affected by modification of its referent.
1008 ## arrayref matching parts = gmatch(string, substring)
1015 return map substr($str, $_->[0], $_->[1]),
1016 $self->index($str, $sub, 0, 'g');
1020 ## bool subst'ed = subst(string, substring, replace)
1025 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1027 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1029 my $mat = substr($_[0], $pos, $len);
1030 substr($_[0], $pos, $len, $code->($mat));
1032 substr($_[0], $pos, $len, $_[2]);
1042 ## int count = gsubst(string, substring, replace)
1047 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1050 # Replacement is carried out from the end, then use reverse.
1051 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1053 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1054 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1056 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1068 Unicode::Collate - Unicode Collation Algorithm
1072 use Unicode::Collate;
1075 $Collator = Unicode::Collate->new(%tailoring);
1078 @sorted = $Collator->sort(@not_sorted);
1081 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1083 # If %tailoring is false (i.e. empty),
1084 # $Collator should do the default collation.
1088 This module is an implementation
1089 of Unicode Technical Standard #10 (UTS #10)
1090 "Unicode Collation Algorithm."
1092 =head2 Constructor and Tailoring
1094 The C<new> method returns a collator object.
1096 $Collator = Unicode::Collate->new(
1097 UCA_Version => $UCA_Version,
1098 alternate => $alternate, # deprecated: use of 'variable' is recommended.
1099 backwards => $levelNumber, # or \@levelNumbers
1101 hangul_terminator => $term_primary_weight,
1102 ignoreName => qr/$ignoreName/,
1103 ignoreChar => qr/$ignoreChar/,
1104 katakana_before_hiragana => $bool,
1105 level => $collationLevel,
1106 normalization => $normalization_form,
1107 overrideCJK => \&overrideCJK,
1108 overrideHangul => \&overrideHangul,
1109 preprocess => \&preprocess,
1110 rearrange => \@charList,
1112 undefName => qr/$undefName/,
1113 undefChar => qr/$undefChar/,
1114 upper_before_lower => $bool,
1115 variable => $variable,
1122 If the tracking version number of the older UCA is given,
1123 the older behavior of that tracking version is emulated on collating.
1124 If omitted, the return value of C<UCA_Version()> is used.
1126 The supported tracking version: 8, 9, or 11.
1128 B<This parameter may be removed in the future version,
1129 as switching the algorithm would affect the performance.>
1133 -- see 3.1.2 French Accents, UTS #10.
1135 backwards => $levelNumber or \@levelNumbers
1137 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1138 If omitted, forwards at all the levels.
1142 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1144 If the same character (or a sequence of characters) exists
1145 in the collation element table through C<table>,
1146 mapping to collation elements is overrided.
1147 If it does not exist, the mapping is defined additionally.
1149 entry => <<'ENTRIES', # use the UCA file format
1150 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
1151 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
1152 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1155 B<NOTE:> The code point in the UCA file format (before C<';'>)
1156 B<must> be a Unicode code point, but not a native code point.
1157 So C<0063> must always denote C<U+0063>,
1158 but not a character of C<"\x63">.
1160 =item hangul_terminator
1162 -- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
1164 If a true value is given (non-zero but should be positive),
1165 it will be added as a terminator primary weight to the end of
1166 every standard Hangul syllable. Secondary and any higher weights
1167 for terminator are set to zero.
1168 If the value is false or C<hangul_terminator> key does not exist,
1169 insertion of terminator weights will not be performed.
1171 Boundaries of Hangul syllables are determined
1172 according to conjoining Jamo behavior in F<the Unicode Standard>
1173 and F<HangulSyllableType.txt>.
1175 B<Implementation Note:>
1176 (1) For expansion mapping (Unicode character mapped
1177 to a sequence of collation elements), a terminator will not be added
1178 between collation elements, even if Hangul syllable boundary exists there.
1179 Addition of terminator is restricted to the next position
1180 to the last collation element.
1182 (2) Non-conjoining Hangul letters
1183 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1184 automatically terminated with a terminator primary weight.
1185 These characters may need terminator included in a collation element
1192 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1194 Makes the entry in the table completely ignorable;
1195 i.e. as if the weights were zero at all level.
1197 E.g. when 'a' and 'e' are ignorable,
1198 'element' is equal to 'lament' (or 'lmnt').
1202 -- see 4.3 Form a sort key for each string, UTS #10.
1204 Set the maximum level.
1205 Any higher levels than the specified one are ignored.
1207 Level 1: alphabetic ordering
1208 Level 2: diacritic ordering
1209 Level 3: case ordering
1210 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1214 If omitted, the maximum is the 4th.
1218 -- see 4.1 Normalize each input string, UTS #10.
1220 If specified, strings are normalized before preparation of sort keys
1221 (the normalization is executed after preprocess).
1223 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1224 as C<$normalization_form>.
1225 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1226 See C<Unicode::Normalize::normalize()> for detail.
1227 If omitted, C<'NFD'> is used.
1229 C<normalization> is performed after C<preprocess> (if defined).
1231 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1232 though they are not concerned with C<Unicode::Normalize::normalize()>.
1234 If C<undef> (not a string C<"undef">) is passed explicitly
1235 as the value for this key,
1236 any normalization is not carried out (this may make tailoring easier
1237 if any normalization is not desired).
1238 Under C<(normalization =E<gt> undef)>, only contiguous contractions
1239 are resolved; e.g. C<A-cedilla-ring> would be primary equal to C<A>,
1240 even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>.
1242 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1243 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1245 In the case of C<(normalization =E<gt> "prenormalized")>,
1246 any normalization is not performed, but
1247 non-contiguous contractions with combining characters are performed.
1249 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1250 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1251 If source strings are finely prenormalized,
1252 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1254 Except C<(normalization =E<gt> undef)>,
1255 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1259 -- see 7.1 Derived Collation Elements, UTS #10.
1261 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1262 (but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than
1263 C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
1264 C<U+20000> to C<U+2A6D6>].
1266 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1268 ex. CJK Unified Ideographs in the JIS code point order.
1270 overrideCJK => sub {
1271 my $u = shift; # get a Unicode codepoint
1272 my $b = pack('n', $u); # to UTF-16BE
1273 my $s = your_unicode_to_sjis_converter($b); # convert
1274 my $n = unpack('n', $s); # convert sjis to short
1275 [ $n, 0x20, 0x2, $u ]; # return the collation element
1278 ex. ignores all CJK Unified Ideographs.
1280 overrideCJK => sub {()}, # CODEREF returning empty list
1282 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1283 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1285 If C<undef> is passed explicitly as the value for this key,
1286 weights for CJK Unified Ideographs are treated as undefined.
1287 But assignment of weight for CJK Unified Ideographs
1288 in table or C<entry> is still valid.
1290 =item overrideHangul
1292 -- see 7.1 Derived Collation Elements, UTS #10.
1294 By default, Hangul Syllables are decomposed into Hangul Jamo.
1295 But the mapping of Hangul Syllables may be overrided.
1297 This tag works like C<overrideCJK>, so see there for examples.
1299 If you want to override the mapping of Hangul Syllables,
1300 the Normalization Forms D and KD are not appropriate
1301 (they will be decomposed before overriding).
1303 If C<undef> is passed explicitly as the value for this key,
1304 weight for Hangul Syllables is treated as undefined
1305 without decomposition into Hangul Jamo.
1306 But definition of weight for Hangul Syllables
1307 in table or C<entry> is still valid.
1311 -- see 5.1 Preprocessing, UTS #10.
1313 If specified, the coderef is used to preprocess
1314 before the formation of sort keys.
1316 ex. dropping English articles, such as "a" or "the".
1317 Then, "the pen" is before "a pencil".
1321 $str =~ s/\b(?:an?|the)\s+//gi;
1325 C<preprocess> is performed before C<normalization> (if defined).
1329 -- see 3.1.3 Rearrangement, UTS #10.
1331 Characters that are not coded in logical order and to be rearranged.
1334 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1336 If you want to disallow any rearrangement,
1337 pass C<undef> or C<[]> (a reference to an empty list)
1338 as the value for this key.
1340 B<According to the version 9 of UCA, this parameter shall not be used;
1341 but it is not warned at present.>
1345 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1347 You can use another collation element table if desired.
1348 The table file must be put into a directory
1349 where F<Unicode/Collate.pm> is installed; e.g. into
1350 F<perl/lib/Unicode/Collate/> if you have F<perl/lib/Unicode/Collate.pm>.
1352 By default, the filename F<allkeys.txt> is used.
1354 If C<undef> is passed explicitly as the value for this key,
1355 no file is read (but you can define collation elements via C<entry>).
1357 A typical way to define a collation element table
1358 without any file of table:
1360 $onlyABC = Unicode::Collate->new(
1362 entry => << 'ENTRIES',
1363 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1364 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1365 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1366 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1367 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1368 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1376 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1378 Undefines the collation element as if it were unassigned in the table.
1379 This reduces the size of the table.
1380 If an unassigned character appears in the string to be collated,
1381 the sort key is made from its codepoint
1382 as a single-character collation element,
1383 as it is greater than any other assigned collation elements
1384 (in the codepoint order among the unassigned characters).
1385 But, it'd be better to ignore characters
1386 unfamiliar to you and maybe never used.
1388 =item katakana_before_hiragana
1390 =item upper_before_lower
1392 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1394 By default, lowercase is before uppercase
1395 and hiragana is before katakana.
1397 If the tag is made true, this is reversed.
1399 B<NOTE>: These tags simplemindedly assume
1400 any lowercase/uppercase or hiragana/katakana distinctions
1401 must occur in level 3, and their weights at level 3
1402 must be same as those mentioned in 7.3.1, UTS #10.
1403 If you define your collation elements which violate this requirement,
1404 these tags don't work validly.
1410 -- see 3.2.2 Variable Weighting, UTS #10.
1412 (the title in UCA version 8: Alternate Weighting)
1414 This key allows to variable weighting for variable collation elements,
1415 which are marked with an ASTERISK in the table
1416 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1418 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1420 These names are case-insensitive.
1421 By default (if specification is omitted), 'shifted' is adopted.
1423 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1424 considered at the 4th level.
1426 'Non-ignorable' Variable elements are not reset to ignorable.
1428 'Shifted' Variable elements are made ignorable at levels 1 through 3
1429 their level 4 weight is replaced by the old level 1 weight.
1430 Level 4 weight for Non-Variable elements is 0xFFFF.
1432 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1435 For backward compatibility, C<alternate> can be used as an alias
1440 =head2 Methods for Collation
1444 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1446 Sorts a list of strings.
1448 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1450 Returns 1 (when C<$a> is greater than C<$b>)
1451 or 0 (when C<$a> is equal to C<$b>)
1452 or -1 (when C<$a> is lesser than C<$b>).
1454 =item C<$result = $Collator-E<gt>eq($a, $b)>
1456 =item C<$result = $Collator-E<gt>ne($a, $b)>
1458 =item C<$result = $Collator-E<gt>lt($a, $b)>
1460 =item C<$result = $Collator-E<gt>le($a, $b)>
1462 =item C<$result = $Collator-E<gt>gt($a, $b)>
1464 =item C<$result = $Collator-E<gt>ge($a, $b)>
1466 They works like the same name operators as theirs.
1468 eq : whether $a is equal to $b.
1469 ne : whether $a is not equal to $b.
1470 lt : whether $a is lesser than $b.
1471 le : whether $a is lesser than $b or equal to $b.
1472 gt : whether $a is greater than $b.
1473 ge : whether $a is greater than $b or equal to $b.
1475 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1477 -- see 4.3 Form a sort key for each string, UTS #10.
1481 You compare the sort keys using a binary comparison
1482 and get the result of the comparison of the strings using UCA.
1484 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1488 $Collator->cmp($a, $b)
1490 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1492 use Unicode::Collate;
1493 my $c = Unicode::Collate->new();
1494 print $c->viewSortKey("Perl"),"\n";
1497 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1498 # Level 1 Level 2 Level 3 Level 4
1500 (If C<UCA_Version> is 8, the output is slightly different.)
1504 =head2 Methods for Searching
1506 B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1507 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1508 C<subst>, C<gsubst>) is croaked,
1509 as the position and the length might differ
1510 from those on the specified string.
1511 (And C<rearrange> and C<hangul_terminator> tags are neglected.)
1513 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1514 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1515 but they are not aware of any pattern, but only a literal substring.
1519 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1521 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1523 If C<$substring> matches a part of C<$string>, returns
1524 the position of the first occurrence of the matching part in scalar context;
1525 in list context, returns a two-element list of
1526 the position and the length of the matching part.
1528 If C<$substring> does not match any part of C<$string>,
1529 returns C<-1> in scalar context and
1530 an empty list in list context.
1534 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1535 # (normalization => undef) is REQUIRED.
1536 my $str = "Ich muß studieren Perl.";
1539 if (my($pos,$len) = $Collator->index($str, $sub)) {
1540 $match = substr($str, $pos, $len);
1543 and get C<"muß"> in C<$match> since C<"muß">
1544 is primary equal to C<"MÜSS">.
1546 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1548 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1550 If C<$substring> matches a part of C<$string>, in scalar context, returns
1551 B<a reference to> the first occurrence of the matching part
1552 (C<$match_ref> is always true if matches,
1553 since every reference is B<true>);
1554 in list context, returns the first occurrence of the matching part.
1556 If C<$substring> does not match any part of C<$string>,
1557 returns C<undef> in scalar context and
1558 an empty list in list context.
1562 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1563 print "matches [$$match_ref].\n";
1565 print "doesn't match.\n";
1570 if (($match) = $Collator->match($str, $sub)) { # list context
1571 print "matches [$match].\n";
1573 print "doesn't match.\n";
1576 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1578 If C<$substring> matches a part of C<$string>, returns
1579 all the matching parts (or matching count in scalar context).
1581 If C<$substring> does not match any part of C<$string>,
1582 returns an empty list.
1584 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1586 If C<$substring> matches a part of C<$string>,
1587 the first occurrence of the matching part is replaced by C<$replacement>
1588 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1590 C<$replacement> can be a C<CODEREF>,
1591 taking the matching part as an argument,
1592 and returning a string to replace the matching part
1593 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1595 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1597 If C<$substring> matches a part of C<$string>,
1598 all the occurrences of the matching part is replaced by C<$replacement>
1599 (C<$string> is modified) and return C<$count>.
1601 C<$replacement> can be a C<CODEREF>,
1602 taking the matching part as an argument,
1603 and returning a string to replace the matching part
1604 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1608 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1609 # (normalization => undef) is REQUIRED.
1610 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1611 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1613 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1614 # i.e., all the camels are made bold-faced.
1618 =head2 Other Methods
1622 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1624 Change the value of specified keys and returns the changed part.
1626 $Collator = Unicode::Collate->new(level => 4);
1628 $Collator->eq("perl", "PERL"); # false
1630 %old = $Collator->change(level => 2); # returns (level => 4).
1632 $Collator->eq("perl", "PERL"); # true
1634 $Collator->change(%old); # returns (level => 2).
1636 $Collator->eq("perl", "PERL"); # false
1638 Not all C<(key,value)>s are allowed to be changed.
1639 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1641 In the scalar context, returns the modified collator
1642 (but it is B<not> a clone from the original).
1644 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1646 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1648 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1650 =item C<$version = $Collator-E<gt>version()>
1652 Returns the version number (a string) of the Unicode Standard
1653 which the C<table> file used by the collator object is based on.
1654 If the table does not include a version line (starting with C<@version>),
1655 returns C<"unknown">.
1657 =item C<UCA_Version()>
1659 Returns the tracking version number of UTS #10 this module consults.
1661 =item C<Base_Unicode_Version()>
1663 Returns the version number of UTS #10 this module consults.
1673 Use of the C<normalization> parameter requires
1674 the B<Unicode::Normalize> module.
1676 If you need not it (say, in the case when you need not
1677 handle any combining characters),
1678 assign C<normalization =E<gt> undef> explicitly.
1680 -- see 6.5 Avoiding Normalization, UTS #10.
1682 =head2 Conformance Test
1684 The Conformance Test for the UCA is available
1685 under L<http://www.unicode.org/Public/UCA/>.
1687 For F<CollationTest_SHIFTED.txt>,
1688 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1689 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1690 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1692 B<Unicode::Normalize is required to try The Conformance Test.>
1696 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
1698 http://homepage1.nifty.com/nomenclator/perl/
1700 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
1702 This library is free software; you can redistribute it
1703 and/or modify it under the same terms as Perl itself.
1709 =item Unicode Collation Algorithm - UTS #10
1711 L<http://www.unicode.org/reports/tr10/>
1713 =item The Default Unicode Collation Element Table (DUCET)
1715 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1717 =item The conformance test for the UCA
1719 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1721 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1723 =item Hangul Syllable Type
1725 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1727 =item Unicode Normalization Forms - UAX #15
1729 L<http://www.unicode.org/reports/tr15/>
1731 =item L<Unicode::Normalize>