1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
19 our $VERSION = '0.33';
20 our $PACKAGE = __PACKAGE__;
22 our @ISA = qw(Exporter);
24 our %EXPORT_TAGS = ();
28 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
29 our $KeyFile = "allkeys.txt";
32 use constant TRUE => 1;
33 use constant FALSE => "";
34 use constant NOMATCHPOS => -1;
36 # A coderef to get combining class imported from Unicode::Normalize
37 # (i.e. \&Unicode::Normalize::getCombinClass).
38 # This is also used as a HAS_UNICODE_NORMALIZE flag.
39 our $CVgetCombinClass;
42 use constant MinLevel => 1;
43 use constant MaxLevel => 4;
45 # Minimum weights at level 2 and 3, respectively
46 use constant Min2Wt => 0x20;
47 use constant Min3Wt => 0x02;
49 # Shifted weight at 4th level
50 use constant Shift4Wt => 0xFFFF;
52 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
53 # PROBLEM: The Default Unicode Collation Element Table
54 # has weights over 0xFFFF at the 4th level.
55 # The tie-breaking in the variable weights
56 # other than "shift" (as well as "shift-trimmed") is unreliable.
57 use constant VCE_TEMPLATE => 'Cn4';
59 # A sort key: 16-bit weights
60 # See also the PROBLEM on VCE_TEMPLATE above.
61 use constant KEY_TEMPLATE => 'n*';
63 # Level separator in a sort key:
64 # i.e. pack(KEY_TEMPLATE, 0)
65 use constant LEVEL_SEP => "\0\0";
67 # As Unicode code point separator for hash keys.
68 # A joined code point string (denoted by JCPS below)
69 # like "65;768" is used for internal processing
70 # instead of Perl's Unicode string like "\x41\x{300}",
71 # as the native code point is different from the Unicode code point
73 # This character must not be included in any stringified
74 # representation of an integer.
75 use constant CODE_SEP => ';';
77 # boolean values of variable weights
78 use constant NON_VAR => 0; # Non-Variable character
79 use constant VAR => 1; # Variable character
81 # specific code points
82 use constant Hangul_LBase => 0x1100;
83 use constant Hangul_LIni => 0x1100;
84 use constant Hangul_LFin => 0x1159;
85 use constant Hangul_LFill => 0x115F;
86 use constant Hangul_VBase => 0x1161;
87 use constant Hangul_VIni => 0x1160;
88 use constant Hangul_VFin => 0x11A2;
89 use constant Hangul_TBase => 0x11A7;
90 use constant Hangul_TIni => 0x11A8;
91 use constant Hangul_TFin => 0x11F9;
92 use constant Hangul_TCount => 28;
93 use constant Hangul_NCount => 588;
94 use constant Hangul_SBase => 0xAC00;
95 use constant Hangul_SIni => 0xAC00;
96 use constant Hangul_SFin => 0xD7A3;
97 use constant CJK_UidIni => 0x4E00;
98 use constant CJK_UidFin => 0x9FA5;
99 use constant CJK_ExtAIni => 0x3400;
100 use constant CJK_ExtAFin => 0x4DB5;
101 use constant CJK_ExtBIni => 0x20000;
102 use constant CJK_ExtBFin => 0x2A6D6;
103 use constant BMP_Max => 0xFFFF;
105 # Logical_Order_Exception in PropList.txt
106 # TODO: synchronization with change of PropList.txt.
107 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
109 sub UCA_Version { "11" }
111 sub Base_Unicode_Version { "4.0" }
116 return pack('U*', @_);
120 return unpack('U*', pack('U*').shift);
127 blanked non-ignorable shifted shift-trimmed
128 / } = (); # keys lowercased
131 alternate backwards level normalization rearrange
132 katakana_before_hiragana upper_before_lower
133 overrideHangul overrideCJK preprocess UCA_Version
134 hangul_terminator variable
138 entry mapping table maxlength
139 ignoreChar ignoreName undefChar undefName variableTable
140 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
141 derivCode normCode rearrangeHash L3_ignorable
144 # The hash key 'ignored' is deleted at v 0.21.
145 # The hash key 'isShift' is deleted at v 0.23.
146 # The hash key 'combining' is deleted at v 0.24.
147 # The hash key 'entries' is deleted at v 0.30.
151 return $self->{versionTable} || 'unknown';
154 my (%ChangeOK, %ChangeNG);
155 @ChangeOK{ @ChangeOK } = ();
156 @ChangeNG{ @ChangeNG } = ();
162 if (exists $hash{variable} && exists $hash{alternate}) {
163 delete $hash{alternate};
165 elsif (!exists $hash{variable} && exists $hash{alternate}) {
166 $hash{variable} = $hash{alternate};
168 foreach my $k (keys %hash) {
169 if (exists $ChangeOK{$k}) {
170 $old{$k} = $self->{$k};
171 $self->{$k} = $hash{$k};
173 elsif (exists $ChangeNG{$k}) {
174 croak "change of $k via change() is not allowed!";
178 $self->checkCollator;
179 return wantarray ? %old : $self;
184 my $key = shift; # 'level' or 'backwards'
185 MinLevel <= $level or croak sprintf
186 "Illegal level %d (in value for key '%s') lower than %d.",
187 $level, $key, MinLevel;
188 $level <= MaxLevel or croak sprintf
189 "Unsupported level %d (in value for key '%s') higher than %d.",
190 $level, $key, MaxLevel;
196 11 => \&_derivCE_9, # 11 == 9
201 _checkLevel($self->{level}, "level");
203 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
204 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
206 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
207 $self->{alternateTable} || 'shifted';
208 $self->{variable} = $self->{alternate} = lc($self->{variable});
209 exists $VariableOK{ $self->{variable} }
210 or croak "$PACKAGE unknown variable tag name: $self->{variable}";
212 if (! defined $self->{backwards}) {
213 $self->{backwardsFlag} = 0;
215 elsif (! ref $self->{backwards}) {
216 _checkLevel($self->{backwards}, "backwards");
217 $self->{backwardsFlag} = 1 << $self->{backwards};
221 $self->{backwardsFlag} = 0;
222 for my $b (@{ $self->{backwards} }) {
223 _checkLevel($b, "backwards");
226 for my $v (sort keys %level) {
227 $self->{backwardsFlag} += 1 << $v;
231 defined $self->{rearrange} or $self->{rearrange} = [];
232 ref $self->{rearrange}
233 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
235 # keys of $self->{rearrangeHash} are $self->{rearrange}.
236 $self->{rearrangeHash} = undef;
238 if (@{ $self->{rearrange} }) {
239 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
242 $self->{normCode} = undef;
244 if (defined $self->{normalization}) {
245 eval { require Unicode::Normalize };
246 $@ and croak "Unicode::Normalize is required to normalize strings";
248 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
250 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
251 $self->{normCode} = \&Unicode::Normalize::NFD;
253 elsif ($self->{normalization} ne 'prenormalized') {
254 my $norm = $self->{normalization};
255 $self->{normCode} = sub {
256 Unicode::Normalize::normalize($norm, shift);
258 eval { $self->{normCode}->("") }; # try
259 $@ and croak "$PACKAGE unknown normalization form name: $norm";
268 my $self = bless { @_ }, $class;
270 # If undef is passed explicitly, no file is read.
271 $self->{table} = $KeyFile if ! exists $self->{table};
272 $self->read_table if defined $self->{table};
274 if ($self->{entry}) {
275 $self->parseEntry($_) foreach split /\n/, $self->{entry};
278 $self->{level} ||= MaxLevel;
279 $self->{UCA_Version} ||= UCA_Version();
281 $self->{overrideHangul} = FALSE
282 if ! exists $self->{overrideHangul};
283 $self->{overrideCJK} = FALSE
284 if ! exists $self->{overrideCJK};
285 $self->{normalization} = 'NFD'
286 if ! exists $self->{normalization};
287 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
288 if ! exists $self->{rearrange};
289 $self->{backwards} = $self->{backwardsTable}
290 if ! exists $self->{backwards};
292 $self->checkCollator;
300 my $filepath = File::Spec->catfile($Path, $self->{table});
301 open my $fk, "<$filepath"
302 or croak "File does not exist at $filepath";
306 unless (s/^\s*\@//) {
307 $self->parseEntry($_);
311 if (/^version\s*(\S*)/) {
312 $self->{versionTable} ||= $1;
314 elsif (/^variable\s+(\S*)/) { # since UTS #10-9
315 $self->{variableTable} ||= $1;
317 elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
318 $self->{alternateTable} ||= $1;
320 elsif (/^backwards\s+(\S*)/) {
321 push @{ $self->{backwardsTable} }, $1;
323 elsif (/^forwards\s+(\S*)/) { # parhaps no use
324 push @{ $self->{forwardsTable} }, $1;
326 elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
327 push @{ $self->{rearrangeTable} }, _getHexArray($1);
335 ## get $line, parse it, and write an entry in $self
341 my($name, $entry, @uv, @key);
343 return if $line !~ /^\s*[0-9A-Fa-f]/;
345 # removes comment and gets name
347 if $line =~ s/[#%]\s*(.*)//;
348 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
351 my($e, $k) = split /;/, $line;
352 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
355 @uv = _getHexArray($e);
358 $entry = join(CODE_SEP, @uv); # in JCPS
360 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
361 my $ele = pack_U(@uv);
363 # regarded as if it were not entried in the table
365 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
367 # replaced as completely ignorable
368 $k = '[.0000.0000.0000.0000]'
369 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
372 # replaced as completely ignorable
373 $k = '[.0000.0000.0000.0000]'
374 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
376 my $is_L3_ignorable = TRUE;
378 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
379 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
380 my @wt = _getHexArray($arr);
381 push @key, pack(VCE_TEMPLATE, $var, @wt);
382 $is_L3_ignorable = FALSE
383 if $wt[0] + $wt[1] + $wt[2] != 0;
384 # if $arr !~ /[1-9A-Fa-f]/; NG
385 # Conformance Test shows L3-ignorable is completely ignorable.
386 # For expansion, an entry $is_L3_ignorable
387 # if and only if "all" CEs are [.0000.0000.0000].
390 $self->{mapping}{$entry} = \@key;
393 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
394 and $self->{maxlength}{$uv[0]} = @uv;
398 ? ($self->{L3_ignorable}{$uv[0]} = TRUE)
399 : ($self->{L3_ignorable}{$uv[0]} and
400 $self->{L3_ignorable}{$uv[0]} = FALSE); # &&= stores key.
406 ## VCE = _varCE(variable term, VCE)
412 if ($vbl eq 'non-ignorable') {
415 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
418 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
419 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
421 elsif ($vbl eq 'blanked') {
425 return pack(VCE_TEMPLATE, $var, @wt[0..2],
426 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
433 $self->visualizeSortKey($self->getSortKey(@_));
439 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
441 if ($self->{UCA_Version} <= 8) {
442 $view =~ s/ ?0000 ?/|/g;
444 $view =~ s/\b0000\b/|/g;
451 ## arrayref of JCPS = splitEnt(string to be collated)
452 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
459 my $code = $self->{preprocess};
460 my $norm = $self->{normCode};
461 my $map = $self->{mapping};
462 my $max = $self->{maxlength};
463 my $reH = $self->{rearrangeHash};
464 my $ign = $self->{L3_ignorable};
465 my $ver9 = $self->{UCA_Version} >= 9;
470 $code and croak "Preprocess breaks character positions. "
471 . "Don't use with index(), match(), etc.";
472 $norm and croak "Normalization breaks character positions. "
473 . "Don't use with index(), match(), etc.";
478 $str = &$code($str) if ref $code;
479 $str = &$norm($str) if ref $norm;
482 # get array of Unicode code point of string.
483 my @src = unpack_U($str);
486 # Character positions are not kept if rearranged,
487 # then neglected if $wLen is true.
488 if ($reH && ! $wLen) {
489 for (my $i = 0; $i < @src; $i++) {
490 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
491 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
497 # To remove a character marked as a completely ignorable.
498 for (my $i = 0; $i < @src; $i++) {
500 if _isIllegal($src[$i]) || ($ver9 && $ign->{ $src[$i] });
503 for (my $i = 0; $i < @src; $i++) {
505 next if ! defined $jcps;
508 if ($max->{$jcps}) { # contract
509 my $temp_jcps = $jcps;
511 my $maxLen = $max->{$jcps};
513 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
514 next if ! defined $src[$p];
515 $temp_jcps .= CODE_SEP . $src[$p];
517 if ($map->{$temp_jcps}) {
523 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
524 # This process requires Unicode::Normalize.
525 # If "normalization" is undef, here should be skipped *always*
526 # (in spite of bool value of $CVgetCombinClass),
527 # since canonical ordering cannot be expected.
528 # Blocked combining character should not be contracted.
530 if ($self->{normalization})
531 # $self->{normCode} is false in the case of "prenormalized".
536 for (my $p = $i + 1; $p < @src; $p++) {
537 next if ! defined $src[$p];
538 $curCC = $CVgetCombinClass->($src[$p]);
540 my $tail = CODE_SEP . $src[$p];
541 if ($preCC != $curCC && $map->{$jcps.$tail}) {
552 for (; $i + 1 < @src; $i++) {
553 last if defined $src[$i + 1];
557 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
564 ## list of VCE = getWt(JCPS)
570 my $vbl = $self->{variable};
571 my $map = $self->{mapping};
572 my $der = $self->{derivCode};
574 return if !defined $u;
575 return map(_varCE($vbl, $_), @{ $map->{$u} })
578 # JCPS must not be a contraction, then it's a code point.
579 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
580 my $hang = $self->{overrideHangul};
583 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
585 elsif (!defined $hang) {
586 @hangulCE = $der->($u);
589 my $max = $self->{maxlength};
590 my @decH = _decompHangul($u);
593 my $contract = join(CODE_SEP, @decH);
594 @decH = ($contract) if $map->{$contract};
595 } else { # must be <@decH == 3>
596 if ($max->{$decH[0]}) {
597 my $contract = join(CODE_SEP, @decH);
598 if ($map->{$contract}) {
601 $contract = join(CODE_SEP, @decH[0,1]);
602 $map->{$contract} and @decH = ($contract, $decH[2]);
604 # even if V's ignorable, LT contraction is not supported.
605 # If such a situatution were required, NFD should be used.
607 if (@decH == 3 && $max->{$decH[1]}) {
608 my $contract = join(CODE_SEP, @decH[1,2]);
609 $map->{$contract} and @decH = ($decH[0], $contract);
614 $map->{$_} ? @{ $map->{$_} } : $der->($_);
617 return map _varCE($vbl, $_), @hangulCE;
619 elsif (CJK_UidIni <= $u && $u <= CJK_UidFin ||
620 CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
621 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
622 my $cjk = $self->{overrideCJK};
623 return map _varCE($vbl, $_),
625 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
626 : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
627 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
631 return map _varCE($vbl, $_), $der->($u);
637 ## string sortkey = getSortKey(string arg)
642 my $lev = $self->{level};
643 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
644 my $ver9 = $self->{UCA_Version} >= 9;
645 my $v2i = $ver9 && $self->{variable} ne 'non-ignorable';
647 my @buf; # weight arrays
648 if ($self->{hangul_terminator}) {
650 foreach my $jcps (@$rEnt) {
651 # weird things like VL, TL-contraction are not considered!
653 foreach my $u (split /;/, $jcps) {
654 $curHST .= getHST($u);
656 if ($preHST && !$curHST || # hangul before non-hangul
657 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
658 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
659 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
661 push @buf, $self->getWtHangulTerm();
665 push @buf, $self->getWt($jcps);
667 $preHST # end at hangul
668 and push @buf, $self->getWtHangulTerm();
671 foreach my $jcps (@$rEnt) {
672 push @buf, $self->getWt($jcps);
677 my @ret = ([],[],[],[]);
678 my $last_is_variable;
680 foreach my $vwt (@buf) {
681 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
684 $last_is_variable = TRUE;
686 elsif (!$wt[0]) { # ignorable
687 next if $last_is_variable;
690 $last_is_variable = FALSE;
693 foreach my $v (0..$lev-1) {
694 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
698 # modification of tertiary weights
699 if ($self->{upper_before_lower}) {
700 foreach (@{ $ret[2] }) {
701 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
702 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
703 elsif ($_ == 0x1C) { $_ += 1 } # square upper
704 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
707 if ($self->{katakana_before_hiragana}) {
708 foreach (@{ $ret[2] }) {
709 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
710 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
714 if ($self->{backwardsFlag}) {
715 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
716 if ($self->{backwardsFlag} & (1 << $v)) {
717 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
722 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
727 ## int compare = cmp(string a, string b)
729 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
730 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
731 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
732 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
733 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
734 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
735 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
738 ## list[strings] sorted = sort(list[strings] arg)
744 sort{ $a->[0] cmp $b->[0] }
745 map [ $obj->getSortKey($_), $_ ], @_;
752 (CJK_UidIni <= $u && $u <= CJK_UidFin)
754 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
755 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
759 my $aaaa = $base + ($u >> 15);
760 my $bbbb = ($u & 0x7FFF) | 0x8000;
762 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
763 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
768 my $aaaa = 0xFF80 + ($code >> 15);
769 my $bbbb = ($code & 0x7FFF) | 0x8000;
771 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
772 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
776 sub getWtHangulTerm {
778 return _varCE($self->{variable},
779 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
784 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
786 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
789 # $code *must* be in Hangul syllable.
790 # Check it before you enter here.
794 my $SIndex = $code - Hangul_SBase;
795 my $LIndex = int( $SIndex / Hangul_NCount);
796 my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
797 my $TIndex = $SIndex % Hangul_TCount;
799 Hangul_LBase + $LIndex,
800 Hangul_VBase + $VIndex,
801 $TIndex ? (Hangul_TBase + $TIndex) : (),
807 return ! defined $code # removed
808 || ($code < 0 || 0x10FFFF < $code) # out of range
809 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
810 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
811 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
815 # Hangul Syllable Type
819 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
820 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
821 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
822 Hangul_SIni <= $u && $u <= Hangul_SFin ?
823 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
828 ## bool _nonIgnorAtLevel(arrayref weights, int level)
830 sub _nonIgnorAtLevel($$)
833 return if ! defined $wt;
835 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
840 ## arrayref of arrayref[weights] source,
841 ## arrayref of arrayref[weights] substr,
843 ## * comparison of graphemes vs graphemes.
844 ## @$source >= @$substr must be true (check it before call this);
852 for my $g (0..@$substr-1){
853 # Do the $g'th graphemes have the same number of AV weigths?
854 return if @{ $source->[$g] } != @{ $substr->[$g] };
856 for my $w (0..@{ $substr->[$g] }-1) {
857 for my $v (0..$lev-1) {
858 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
866 ## (int position, int length)
867 ## int position = index(string, substring, position, [undoc'ed grobal])
869 ## With "grobal" (only for the list context),
870 ## returns list of arrayref[position, length].
876 my $len = length($str);
877 my $subE = $self->splitEnt(shift);
878 my $pos = @_ ? shift : 0;
879 $pos = 0 if $pos < 0;
882 my $lev = $self->{level};
883 my $ver9 = $self->{UCA_Version} >= 9;
884 my $v2i = $self->{variable} ne 'non-ignorable';
887 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
889 ? map([$_, 0], $temp..$len)
890 : wantarray ? ($temp,0) : $temp;
893 and return wantarray ? () : NOMATCHPOS;
894 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
896 or return wantarray ? () : NOMATCHPOS;
898 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
900 my $last_is_variable;
901 for my $vwt (map $self->getWt($_), @$subE) {
902 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
903 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
907 $last_is_variable = TRUE;
909 elsif (!$wt[0]) { # ignorable
910 $to_be_pushed = FALSE if $last_is_variable;
913 $last_is_variable = FALSE;
917 if (@subWt && !$var && !$wt[0]) {
918 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
920 push @subWt, [ \@wt ];
925 my $end = @$strE - 1;
927 $last_is_variable = FALSE; # reuse
928 for (my $i = 0; $i <= $end; ) { # no $i++
932 while ($i <= $end && $found_base == 0) {
933 for my $vwt ($self->getWt($strE->[$i][0])) {
934 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
935 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
939 $last_is_variable = TRUE;
941 elsif (!$wt[0]) { # ignorable
942 $to_be_pushed = FALSE if $last_is_variable;
945 $last_is_variable = FALSE;
949 if (@strWt && !$var && !$wt[0]) {
950 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
951 $finPos[-1] = $strE->[$i][2];
952 } elsif ($to_be_pushed) {
953 push @strWt, [ \@wt ];
954 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
955 $finPos[-1] = NOMATCHPOS if $found_base;
956 push @finPos, $strE->[$i][2];
965 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
966 if ($iniPos[0] != NOMATCHPOS &&
967 $finPos[$#subWt] != NOMATCHPOS &&
968 _eqArray(\@strWt, \@subWt, $lev)) {
969 my $temp = $iniPos[0] + $pos;
972 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
973 splice @strWt, 0, $#subWt;
974 splice @iniPos, 0, $#subWt;
975 splice @finPos, 0, $#subWt;
979 ? ($temp, $finPos[$#subWt] - $iniPos[0])
991 : wantarray ? () : NOMATCHPOS;
995 ## scalarref to matching part = match(string, substring)
1000 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1001 my $temp = substr($_[0], $pos, $len);
1002 return wantarray ? $temp : \$temp;
1003 # An lvalue ref \substr should be avoided,
1004 # since its value is affected by modification of its referent.
1012 ## arrayref matching parts = gmatch(string, substring)
1019 return map substr($str, $_->[0], $_->[1]),
1020 $self->index($str, $sub, 0, 'g');
1024 ## bool subst'ed = subst(string, substring, replace)
1029 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1031 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1033 my $mat = substr($_[0], $pos, $len);
1034 substr($_[0], $pos, $len, $code->($mat));
1036 substr($_[0], $pos, $len, $_[2]);
1046 ## int count = gsubst(string, substring, replace)
1051 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1054 # Replacement is carried out from the end, then use reverse.
1055 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1057 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1058 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1060 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1072 Unicode::Collate - Unicode Collation Algorithm
1076 use Unicode::Collate;
1079 $Collator = Unicode::Collate->new(%tailoring);
1082 @sorted = $Collator->sort(@not_sorted);
1085 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1087 # If %tailoring is false (i.e. empty),
1088 # $Collator should do the default collation.
1092 This module is an implementation
1093 of Unicode Technical Standard #10 (UTS #10)
1094 "Unicode Collation Algorithm."
1096 =head2 Constructor and Tailoring
1098 The C<new> method returns a collator object.
1100 $Collator = Unicode::Collate->new(
1101 UCA_Version => $UCA_Version,
1102 alternate => $alternate, # deprecated: use of 'variable' is recommended.
1103 backwards => $levelNumber, # or \@levelNumbers
1105 hangul_terminator => $term_primary_weight,
1106 ignoreName => qr/$ignoreName/,
1107 ignoreChar => qr/$ignoreChar/,
1108 katakana_before_hiragana => $bool,
1109 level => $collationLevel,
1110 normalization => $normalization_form,
1111 overrideCJK => \&overrideCJK,
1112 overrideHangul => \&overrideHangul,
1113 preprocess => \&preprocess,
1114 rearrange => \@charList,
1116 undefName => qr/$undefName/,
1117 undefChar => qr/$undefChar/,
1118 upper_before_lower => $bool,
1119 variable => $variable,
1126 If the tracking version number of the older UCA is given,
1127 the older behavior of that tracking version is emulated on collating.
1128 If omitted, the return value of C<UCA_Version()> is used.
1130 The supported tracking version: 8, 9, or 11.
1132 B<This parameter may be removed in the future version,
1133 as switching the algorithm would affect the performance.>
1137 -- see 3.1.2 French Accents, UTS #10.
1139 backwards => $levelNumber or \@levelNumbers
1141 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1142 If omitted, forwards at all the levels.
1146 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1148 If the same character (or a sequence of characters) exists
1149 in the collation element table through C<table>,
1150 mapping to collation elements is overrided.
1151 If it does not exist, the mapping is defined additionally.
1153 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1154 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1155 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1156 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1157 006C 006C ; [.0F4C.0020.0002.006C] # ll
1158 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1159 004C 004C ; [.0F4C.0020.0008.004C] # LL
1160 006E 0303 ; [.0F7B.0020.0002.006E] # n-tilde
1161 004E 0303 ; [.0F7B.0020.0008.004E] # N-tilde
1164 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1165 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1166 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1169 B<NOTE:> The code point in the UCA file format (before C<';'>)
1170 B<must> be a Unicode code point (defined as hexadecimal),
1171 but not a native code point.
1172 So C<0063> must always denote C<U+0063>,
1173 but not a character of C<"\x63">.
1175 Weighting may vary depending on collation element table.
1176 So ensure the weights defined in C<entry> will be consistent with
1177 those in the collation element table loaded via C<table>.
1179 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1180 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1181 (as a value between C<0E60> and C<0E6D>)
1182 makes ordering as C<C E<lt> CH E<lt> D>.
1183 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1184 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1185 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1186 and C<c-curl> (C<U+0255>) with C<0E69>.
1187 Then primary weight C<0E6A> for C<CH> makes C<CH>
1188 ordered between C<c-curl> and C<D>.
1190 =item hangul_terminator
1192 -- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
1194 If a true value is given (non-zero but should be positive),
1195 it will be added as a terminator primary weight to the end of
1196 every standard Hangul syllable. Secondary and any higher weights
1197 for terminator are set to zero.
1198 If the value is false or C<hangul_terminator> key does not exist,
1199 insertion of terminator weights will not be performed.
1201 Boundaries of Hangul syllables are determined
1202 according to conjoining Jamo behavior in F<the Unicode Standard>
1203 and F<HangulSyllableType.txt>.
1205 B<Implementation Note:>
1206 (1) For expansion mapping (Unicode character mapped
1207 to a sequence of collation elements), a terminator will not be added
1208 between collation elements, even if Hangul syllable boundary exists there.
1209 Addition of terminator is restricted to the next position
1210 to the last collation element.
1212 (2) Non-conjoining Hangul letters
1213 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1214 automatically terminated with a terminator primary weight.
1215 These characters may need terminator included in a collation element
1222 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1224 Makes the entry in the table completely ignorable;
1225 i.e. as if the weights were zero at all level.
1227 E.g. when 'a' and 'e' are ignorable,
1228 'element' is equal to 'lament' (or 'lmnt').
1232 -- see 4.3 Form a sort key for each string, UTS #10.
1234 Set the maximum level.
1235 Any higher levels than the specified one are ignored.
1237 Level 1: alphabetic ordering
1238 Level 2: diacritic ordering
1239 Level 3: case ordering
1240 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1244 If omitted, the maximum is the 4th.
1248 -- see 4.1 Normalize each input string, UTS #10.
1250 If specified, strings are normalized before preparation of sort keys
1251 (the normalization is executed after preprocess).
1253 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1254 as C<$normalization_form>.
1255 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1256 See C<Unicode::Normalize::normalize()> for detail.
1257 If omitted, C<'NFD'> is used.
1259 C<normalization> is performed after C<preprocess> (if defined).
1261 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1262 though they are not concerned with C<Unicode::Normalize::normalize()>.
1264 If C<undef> (not a string C<"undef">) is passed explicitly
1265 as the value for this key,
1266 any normalization is not carried out (this may make tailoring easier
1267 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1268 only contiguous contractions are resolved;
1269 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1270 C<A-cedilla-ring> would be primary equal to C<A>.
1272 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1273 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1275 In the case of C<(normalization =E<gt> "prenormalized")>,
1276 any normalization is not performed, but
1277 non-contiguous contractions with combining characters are performed.
1279 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1280 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1281 If source strings are finely prenormalized,
1282 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1284 Except C<(normalization =E<gt> undef)>,
1285 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1289 -- see 7.1 Derived Collation Elements, UTS #10.
1291 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1292 (but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than
1293 C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
1294 C<U+20000> to C<U+2A6D6>].
1296 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1298 ex. CJK Unified Ideographs in the JIS code point order.
1300 overrideCJK => sub {
1301 my $u = shift; # get a Unicode codepoint
1302 my $b = pack('n', $u); # to UTF-16BE
1303 my $s = your_unicode_to_sjis_converter($b); # convert
1304 my $n = unpack('n', $s); # convert sjis to short
1305 [ $n, 0x20, 0x2, $u ]; # return the collation element
1308 ex. ignores all CJK Unified Ideographs.
1310 overrideCJK => sub {()}, # CODEREF returning empty list
1312 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1313 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1315 If C<undef> is passed explicitly as the value for this key,
1316 weights for CJK Unified Ideographs are treated as undefined.
1317 But assignment of weight for CJK Unified Ideographs
1318 in table or C<entry> is still valid.
1320 =item overrideHangul
1322 -- see 7.1 Derived Collation Elements, UTS #10.
1324 By default, Hangul Syllables are decomposed into Hangul Jamo,
1325 even if C<(normalization =E<gt> undef)>.
1326 But the mapping of Hangul Syllables may be overrided.
1328 This tag works like C<overrideCJK>, so see there for examples.
1330 If you want to override the mapping of Hangul Syllables,
1331 NFD, NFKD, and FCD are not appropriate,
1332 since they will decompose Hangul Syllables before overriding.
1334 If C<undef> is passed explicitly as the value for this key,
1335 weight for Hangul Syllables is treated as undefined
1336 without decomposition into Hangul Jamo.
1337 But definition of weight for Hangul Syllables
1338 in table or C<entry> is still valid.
1342 -- see 5.1 Preprocessing, UTS #10.
1344 If specified, the coderef is used to preprocess
1345 before the formation of sort keys.
1347 ex. dropping English articles, such as "a" or "the".
1348 Then, "the pen" is before "a pencil".
1352 $str =~ s/\b(?:an?|the)\s+//gi;
1356 C<preprocess> is performed before C<normalization> (if defined).
1360 -- see 3.1.3 Rearrangement, UTS #10.
1362 Characters that are not coded in logical order and to be rearranged.
1365 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1367 If you want to disallow any rearrangement,
1368 pass C<undef> or C<[]> (a reference to an empty list)
1369 as the value for this key.
1371 B<According to the version 9 of UCA, this parameter shall not be used;
1372 but it is not warned at present.>
1376 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1378 You can use another collation element table if desired.
1379 The table file must be put into a directory
1380 where F<Unicode/Collate.pm> is installed; e.g. into
1381 F<perl/lib/Unicode/Collate/> if you have F<perl/lib/Unicode/Collate.pm>.
1383 By default, the filename F<allkeys.txt> is used.
1385 If C<undef> is passed explicitly as the value for this key,
1386 no file is read (but you can define collation elements via C<entry>).
1388 A typical way to define a collation element table
1389 without any file of table:
1391 $onlyABC = Unicode::Collate->new(
1393 entry => << 'ENTRIES',
1394 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1395 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1396 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1397 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1398 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1399 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1407 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1409 Undefines the collation element as if it were unassigned in the table.
1410 This reduces the size of the table.
1411 If an unassigned character appears in the string to be collated,
1412 the sort key is made from its codepoint
1413 as a single-character collation element,
1414 as it is greater than any other assigned collation elements
1415 (in the codepoint order among the unassigned characters).
1416 But, it'd be better to ignore characters
1417 unfamiliar to you and maybe never used.
1419 =item katakana_before_hiragana
1421 =item upper_before_lower
1423 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1425 By default, lowercase is before uppercase
1426 and hiragana is before katakana.
1428 If the tag is made true, this is reversed.
1430 B<NOTE>: These tags simplemindedly assume
1431 any lowercase/uppercase or hiragana/katakana distinctions
1432 must occur in level 3, and their weights at level 3
1433 must be same as those mentioned in 7.3.1, UTS #10.
1434 If you define your collation elements which violate this requirement,
1435 these tags don't work validly.
1441 -- see 3.2.2 Variable Weighting, UTS #10.
1443 (the title in UCA version 8: Alternate Weighting)
1445 This key allows to variable weighting for variable collation elements,
1446 which are marked with an ASTERISK in the table
1447 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1449 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1451 These names are case-insensitive.
1452 By default (if specification is omitted), 'shifted' is adopted.
1454 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1455 considered at the 4th level.
1457 'Non-Ignorable' Variable elements are not reset to ignorable.
1459 'Shifted' Variable elements are made ignorable at levels 1 through 3
1460 their level 4 weight is replaced by the old level 1 weight.
1461 Level 4 weight for Non-Variable elements is 0xFFFF.
1463 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1466 For backward compatibility, C<alternate> can be used as an alias
1471 =head2 Methods for Collation
1475 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1477 Sorts a list of strings.
1479 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1481 Returns 1 (when C<$a> is greater than C<$b>)
1482 or 0 (when C<$a> is equal to C<$b>)
1483 or -1 (when C<$a> is lesser than C<$b>).
1485 =item C<$result = $Collator-E<gt>eq($a, $b)>
1487 =item C<$result = $Collator-E<gt>ne($a, $b)>
1489 =item C<$result = $Collator-E<gt>lt($a, $b)>
1491 =item C<$result = $Collator-E<gt>le($a, $b)>
1493 =item C<$result = $Collator-E<gt>gt($a, $b)>
1495 =item C<$result = $Collator-E<gt>ge($a, $b)>
1497 They works like the same name operators as theirs.
1499 eq : whether $a is equal to $b.
1500 ne : whether $a is not equal to $b.
1501 lt : whether $a is lesser than $b.
1502 le : whether $a is lesser than $b or equal to $b.
1503 gt : whether $a is greater than $b.
1504 ge : whether $a is greater than $b or equal to $b.
1506 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1508 -- see 4.3 Form a sort key for each string, UTS #10.
1512 You compare the sort keys using a binary comparison
1513 and get the result of the comparison of the strings using UCA.
1515 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1519 $Collator->cmp($a, $b)
1521 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1523 use Unicode::Collate;
1524 my $c = Unicode::Collate->new();
1525 print $c->viewSortKey("Perl"),"\n";
1528 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1529 # Level 1 Level 2 Level 3 Level 4
1531 (If C<UCA_Version> is 8, the output is slightly different.)
1535 =head2 Methods for Searching
1537 B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1538 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1539 C<subst>, C<gsubst>) is croaked,
1540 as the position and the length might differ
1541 from those on the specified string.
1542 (And C<rearrange> and C<hangul_terminator> tags are neglected.)
1544 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1545 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1546 but they are not aware of any pattern, but only a literal substring.
1550 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1552 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1554 If C<$substring> matches a part of C<$string>, returns
1555 the position of the first occurrence of the matching part in scalar context;
1556 in list context, returns a two-element list of
1557 the position and the length of the matching part.
1559 If C<$substring> does not match any part of C<$string>,
1560 returns C<-1> in scalar context and
1561 an empty list in list context.
1565 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1566 # (normalization => undef) is REQUIRED.
1567 my $str = "Ich muß studieren Perl.";
1570 if (my($pos,$len) = $Collator->index($str, $sub)) {
1571 $match = substr($str, $pos, $len);
1574 and get C<"muß"> in C<$match> since C<"muß">
1575 is primary equal to C<"MÜSS">.
1577 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1579 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1581 If C<$substring> matches a part of C<$string>, in scalar context, returns
1582 B<a reference to> the first occurrence of the matching part
1583 (C<$match_ref> is always true if matches,
1584 since every reference is B<true>);
1585 in list context, returns the first occurrence of the matching part.
1587 If C<$substring> does not match any part of C<$string>,
1588 returns C<undef> in scalar context and
1589 an empty list in list context.
1593 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1594 print "matches [$$match_ref].\n";
1596 print "doesn't match.\n";
1601 if (($match) = $Collator->match($str, $sub)) { # list context
1602 print "matches [$match].\n";
1604 print "doesn't match.\n";
1607 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1609 If C<$substring> matches a part of C<$string>, returns
1610 all the matching parts (or matching count in scalar context).
1612 If C<$substring> does not match any part of C<$string>,
1613 returns an empty list.
1615 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1617 If C<$substring> matches a part of C<$string>,
1618 the first occurrence of the matching part is replaced by C<$replacement>
1619 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1621 C<$replacement> can be a C<CODEREF>,
1622 taking the matching part as an argument,
1623 and returning a string to replace the matching part
1624 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1626 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1628 If C<$substring> matches a part of C<$string>,
1629 all the occurrences of the matching part is replaced by C<$replacement>
1630 (C<$string> is modified) and return C<$count>.
1632 C<$replacement> can be a C<CODEREF>,
1633 taking the matching part as an argument,
1634 and returning a string to replace the matching part
1635 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1639 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1640 # (normalization => undef) is REQUIRED.
1641 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1642 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1644 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1645 # i.e., all the camels are made bold-faced.
1649 =head2 Other Methods
1653 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1655 Change the value of specified keys and returns the changed part.
1657 $Collator = Unicode::Collate->new(level => 4);
1659 $Collator->eq("perl", "PERL"); # false
1661 %old = $Collator->change(level => 2); # returns (level => 4).
1663 $Collator->eq("perl", "PERL"); # true
1665 $Collator->change(%old); # returns (level => 2).
1667 $Collator->eq("perl", "PERL"); # false
1669 Not all C<(key,value)>s are allowed to be changed.
1670 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1672 In the scalar context, returns the modified collator
1673 (but it is B<not> a clone from the original).
1675 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1677 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1679 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1681 =item C<$version = $Collator-E<gt>version()>
1683 Returns the version number (a string) of the Unicode Standard
1684 which the C<table> file used by the collator object is based on.
1685 If the table does not include a version line (starting with C<@version>),
1686 returns C<"unknown">.
1688 =item C<UCA_Version()>
1690 Returns the tracking version number of UTS #10 this module consults.
1692 =item C<Base_Unicode_Version()>
1694 Returns the version number of UTS #10 this module consults.
1704 Use of the C<normalization> parameter requires
1705 the B<Unicode::Normalize> module.
1707 If you need not it (say, in the case when you need not
1708 handle any combining characters),
1709 assign C<normalization =E<gt> undef> explicitly.
1711 -- see 6.5 Avoiding Normalization, UTS #10.
1713 =head2 Conformance Test
1715 The Conformance Test for the UCA is available
1716 under L<http://www.unicode.org/Public/UCA/>.
1718 For F<CollationTest_SHIFTED.txt>,
1719 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1720 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1721 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1723 B<Unicode::Normalize is required to try The Conformance Test.>
1727 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
1729 http://homepage1.nifty.com/nomenclator/perl/
1731 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
1733 This library is free software; you can redistribute it
1734 and/or modify it under the same terms as Perl itself.
1740 =item Unicode Collation Algorithm - UTS #10
1742 L<http://www.unicode.org/reports/tr10/>
1744 =item The Default Unicode Collation Element Table (DUCET)
1746 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1748 =item The conformance test for the UCA
1750 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1752 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1754 =item Hangul Syllable Type
1756 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1758 =item Unicode Normalization Forms - UAX #15
1760 L<http://www.unicode.org/reports/tr15/>
1762 =item L<Unicode::Normalize>