1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
17 our $VERSION = '0.40';
18 our $PACKAGE = __PACKAGE__;
20 my @Path = qw(Unicode Collate);
21 my $KeyFile = "allkeys.txt";
24 use constant TRUE => 1;
25 use constant FALSE => "";
26 use constant NOMATCHPOS => -1;
28 # A coderef to get combining class imported from Unicode::Normalize
29 # (i.e. \&Unicode::Normalize::getCombinClass).
30 # This is also used as a HAS_UNICODE_NORMALIZE flag.
34 use constant MinLevel => 1;
35 use constant MaxLevel => 4;
37 # Minimum weights at level 2 and 3, respectively
38 use constant Min2Wt => 0x20;
39 use constant Min3Wt => 0x02;
41 # Shifted weight at 4th level
42 use constant Shift4Wt => 0xFFFF;
44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45 # PROBLEM: The Default Unicode Collation Element Table
46 # has weights over 0xFFFF at the 4th level.
47 # The tie-breaking in the variable weights
48 # other than "shift" (as well as "shift-trimmed") is unreliable.
49 use constant VCE_TEMPLATE => 'Cn4';
51 # A sort key: 16-bit weights
52 # See also the PROBLEM on VCE_TEMPLATE above.
53 use constant KEY_TEMPLATE => 'n*';
55 # Level separator in a sort key:
56 # i.e. pack(KEY_TEMPLATE, 0)
57 use constant LEVEL_SEP => "\0\0";
59 # As Unicode code point separator for hash keys.
60 # A joined code point string (denoted by JCPS below)
61 # like "65;768" is used for internal processing
62 # instead of Perl's Unicode string like "\x41\x{300}",
63 # as the native code point is different from the Unicode code point
65 # This character must not be included in any stringified
66 # representation of an integer.
67 use constant CODE_SEP => ';';
69 # boolean values of variable weights
70 use constant NON_VAR => 0; # Non-Variable character
71 use constant VAR => 1; # Variable character
73 # specific code points
74 use constant Hangul_LBase => 0x1100;
75 use constant Hangul_LIni => 0x1100;
76 use constant Hangul_LFin => 0x1159;
77 use constant Hangul_LFill => 0x115F;
78 use constant Hangul_VBase => 0x1161;
79 use constant Hangul_VIni => 0x1160;
80 use constant Hangul_VFin => 0x11A2;
81 use constant Hangul_TBase => 0x11A7;
82 use constant Hangul_TIni => 0x11A8;
83 use constant Hangul_TFin => 0x11F9;
84 use constant Hangul_TCount => 28;
85 use constant Hangul_NCount => 588;
86 use constant Hangul_SBase => 0xAC00;
87 use constant Hangul_SIni => 0xAC00;
88 use constant Hangul_SFin => 0xD7A3;
89 use constant CJK_UidIni => 0x4E00;
90 use constant CJK_UidFin => 0x9FA5;
91 use constant CJK_ExtAIni => 0x3400;
92 use constant CJK_ExtAFin => 0x4DB5;
93 use constant CJK_ExtBIni => 0x20000;
94 use constant CJK_ExtBFin => 0x2A6D6;
95 use constant BMP_Max => 0xFFFF;
97 # Logical_Order_Exception in PropList.txt
98 # TODO: synchronization with change of PropList.txt.
99 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
101 sub UCA_Version { "11" }
103 sub Base_Unicode_Version { "4.0" }
108 return pack('U*', @_);
112 return unpack('U*', pack('U*').shift);
119 blanked non-ignorable shifted shift-trimmed
120 / } = (); # keys lowercased
123 alternate backwards level normalization rearrange
124 katakana_before_hiragana upper_before_lower
125 overrideHangul overrideCJK preprocess UCA_Version
126 hangul_terminator variable
130 entry mapping table maxlength
131 ignoreChar ignoreName undefChar undefName variableTable
132 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
133 derivCode normCode rearrangeHash
136 # The hash key 'ignored' is deleted at v 0.21.
137 # The hash key 'isShift' is deleted at v 0.23.
138 # The hash key 'combining' is deleted at v 0.24.
139 # The hash key 'entries' is deleted at v 0.30.
140 # The hash key 'L3_ignorable' is deleted at v 0.40.
144 return $self->{versionTable} || 'unknown';
147 my (%ChangeOK, %ChangeNG);
148 @ChangeOK{ @ChangeOK } = ();
149 @ChangeNG{ @ChangeNG } = ();
155 if (exists $hash{variable} && exists $hash{alternate}) {
156 delete $hash{alternate};
158 elsif (!exists $hash{variable} && exists $hash{alternate}) {
159 $hash{variable} = $hash{alternate};
161 foreach my $k (keys %hash) {
162 if (exists $ChangeOK{$k}) {
163 $old{$k} = $self->{$k};
164 $self->{$k} = $hash{$k};
166 elsif (exists $ChangeNG{$k}) {
167 croak "change of $k via change() is not allowed!";
171 $self->checkCollator;
172 return wantarray ? %old : $self;
177 my $key = shift; # 'level' or 'backwards'
178 MinLevel <= $level or croak sprintf
179 "Illegal level %d (in value for key '%s') lower than %d.",
180 $level, $key, MinLevel;
181 $level <= MaxLevel or croak sprintf
182 "Unsupported level %d (in value for key '%s') higher than %d.",
183 $level, $key, MaxLevel;
189 11 => \&_derivCE_9, # 11 == 9
194 _checkLevel($self->{level}, "level");
196 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
197 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
199 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
200 $self->{alternateTable} || 'shifted';
201 $self->{variable} = $self->{alternate} = lc($self->{variable});
202 exists $VariableOK{ $self->{variable} }
203 or croak "$PACKAGE unknown variable tag name: $self->{variable}";
205 if (! defined $self->{backwards}) {
206 $self->{backwardsFlag} = 0;
208 elsif (! ref $self->{backwards}) {
209 _checkLevel($self->{backwards}, "backwards");
210 $self->{backwardsFlag} = 1 << $self->{backwards};
214 $self->{backwardsFlag} = 0;
215 for my $b (@{ $self->{backwards} }) {
216 _checkLevel($b, "backwards");
219 for my $v (sort keys %level) {
220 $self->{backwardsFlag} += 1 << $v;
224 defined $self->{rearrange} or $self->{rearrange} = [];
225 ref $self->{rearrange}
226 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
228 # keys of $self->{rearrangeHash} are $self->{rearrange}.
229 $self->{rearrangeHash} = undef;
231 if (@{ $self->{rearrange} }) {
232 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
235 $self->{normCode} = undef;
237 if (defined $self->{normalization}) {
238 eval { require Unicode::Normalize };
239 $@ and croak "Unicode::Normalize is required to normalize strings";
241 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
243 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
244 $self->{normCode} = \&Unicode::Normalize::NFD;
246 elsif ($self->{normalization} ne 'prenormalized') {
247 my $norm = $self->{normalization};
248 $self->{normCode} = sub {
249 Unicode::Normalize::normalize($norm, shift);
251 eval { $self->{normCode}->("") }; # try
252 $@ and croak "$PACKAGE unknown normalization form name: $norm";
261 my $self = bless { @_ }, $class;
263 # If undef is passed explicitly, no file is read.
264 $self->{table} = $KeyFile if ! exists $self->{table};
265 $self->read_table if defined $self->{table};
267 if ($self->{entry}) {
268 while ($self->{entry} =~ /([^\n]+)/g) {
269 $self->parseEntry($1);
273 $self->{level} ||= MaxLevel;
274 $self->{UCA_Version} ||= UCA_Version();
276 $self->{overrideHangul} = FALSE
277 if ! exists $self->{overrideHangul};
278 $self->{overrideCJK} = FALSE
279 if ! exists $self->{overrideCJK};
280 $self->{normalization} = 'NFD'
281 if ! exists $self->{normalization};
282 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
283 if ! exists $self->{rearrange};
284 $self->{backwards} = $self->{backwardsTable}
285 if ! exists $self->{backwards};
287 $self->checkCollator;
296 foreach my $d (@INC) {
297 $f = File::Spec->catfile($d, @Path, $self->{table});
298 last if open($fh, $f);
302 or croak "$PACKAGE: $self->{table} is not found in @INC";
306 unless (s/^\s*\@//) {
307 $self->parseEntry($_);
311 if (/^version\s*(\S*)/) {
312 $self->{versionTable} ||= $1;
314 elsif (/^variable\s+(\S*)/) { # since UTS #10-9
315 $self->{variableTable} ||= $1;
317 elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
318 $self->{alternateTable} ||= $1;
320 elsif (/^backwards\s+(\S*)/) {
321 push @{ $self->{backwardsTable} }, $1;
323 elsif (/^forwards\s+(\S*)/) { # parhaps no use
324 push @{ $self->{forwardsTable} }, $1;
326 elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
327 push @{ $self->{rearrangeTable} }, _getHexArray($1);
335 ## get $line, parse it, and write an entry in $self
341 my($name, $entry, @uv, @key);
343 return if $line !~ /^\s*[0-9A-Fa-f]/;
345 # removes comment and gets name
347 if $line =~ s/[#%]\s*(.*)//;
348 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
351 my($e, $k) = split /;/, $line;
352 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
355 @uv = _getHexArray($e);
358 $entry = join(CODE_SEP, @uv); # in JCPS
360 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
361 my $ele = pack_U(@uv);
363 # regarded as if it were not entried in the table
365 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
367 # replaced as completely ignorable
368 $k = '[.0000.0000.0000.0000]'
369 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
372 # replaced as completely ignorable
373 $k = '[.0000.0000.0000.0000]'
374 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
376 my $is_L3_ignorable = TRUE;
378 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
379 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
380 my @wt = _getHexArray($arr);
381 push @key, pack(VCE_TEMPLATE, $var, @wt);
382 $is_L3_ignorable = FALSE
383 if $wt[0] + $wt[1] + $wt[2] != 0;
384 # if $arr !~ /[1-9A-Fa-f]/; NG
385 # Conformance Test shows L3-ignorable is completely ignorable.
386 # For expansion, an entry $is_L3_ignorable
387 # if and only if "all" CEs are [.0000.0000.0000].
390 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
393 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
394 and $self->{maxlength}{$uv[0]} = @uv;
400 ## VCE = _varCE(variable term, VCE)
406 if ($vbl eq 'non-ignorable') {
409 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
412 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
413 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
415 elsif ($vbl eq 'blanked') {
419 return pack(VCE_TEMPLATE, $var, @wt[0..2],
420 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
427 $self->visualizeSortKey($self->getSortKey(@_));
433 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
435 if ($self->{UCA_Version} <= 8) {
436 $view =~ s/ ?0000 ?/|/g;
438 $view =~ s/\b0000\b/|/g;
445 ## arrayref of JCPS = splitEnt(string to be collated)
446 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
453 my $code = $self->{preprocess};
454 my $norm = $self->{normCode};
455 my $map = $self->{mapping};
456 my $max = $self->{maxlength};
457 my $reH = $self->{rearrangeHash};
458 my $ver9 = $self->{UCA_Version} >= 9;
463 $code and croak "Preprocess breaks character positions. "
464 . "Don't use with index(), match(), etc.";
465 $norm and croak "Normalization breaks character positions. "
466 . "Don't use with index(), match(), etc.";
471 $str = &$code($str) if ref $code;
472 $str = &$norm($str) if ref $norm;
475 # get array of Unicode code point of string.
476 my @src = unpack_U($str);
479 # Character positions are not kept if rearranged,
480 # then neglected if $wLen is true.
481 if ($reH && ! $wLen) {
482 for (my $i = 0; $i < @src; $i++) {
483 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
484 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
490 # To remove a character marked as a completely ignorable.
491 for (my $i = 0; $i < @src; $i++) {
493 if _isIllegal($src[$i]) || ($ver9 &&
494 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
497 for (my $i = 0; $i < @src; $i++) {
499 next if ! defined $jcps;
502 if ($max->{$jcps}) { # contract
503 my $temp_jcps = $jcps;
505 my $maxLen = $max->{$jcps};
507 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
508 next if ! defined $src[$p];
509 $temp_jcps .= CODE_SEP . $src[$p];
511 if ($map->{$temp_jcps}) {
517 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
518 # This process requires Unicode::Normalize.
519 # If "normalization" is undef, here should be skipped *always*
520 # (in spite of bool value of $CVgetCombinClass),
521 # since canonical ordering cannot be expected.
522 # Blocked combining character should not be contracted.
524 if ($self->{normalization})
525 # $self->{normCode} is false in the case of "prenormalized".
530 for (my $p = $i + 1; $p < @src; $p++) {
531 next if ! defined $src[$p];
532 $curCC = $CVgetCombinClass->($src[$p]);
534 my $tail = CODE_SEP . $src[$p];
535 if ($preCC != $curCC && $map->{$jcps.$tail}) {
546 for (; $i + 1 < @src; $i++) {
547 last if defined $src[$i + 1];
551 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
558 ## list of VCE = getWt(JCPS)
564 my $vbl = $self->{variable};
565 my $map = $self->{mapping};
566 my $der = $self->{derivCode};
568 return if !defined $u;
569 return map(_varCE($vbl, $_), @{ $map->{$u} })
572 # JCPS must not be a contraction, then it's a code point.
573 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
574 my $hang = $self->{overrideHangul};
577 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
579 elsif (!defined $hang) {
580 @hangulCE = $der->($u);
583 my $max = $self->{maxlength};
584 my @decH = _decompHangul($u);
587 my $contract = join(CODE_SEP, @decH);
588 @decH = ($contract) if $map->{$contract};
589 } else { # must be <@decH == 3>
590 if ($max->{$decH[0]}) {
591 my $contract = join(CODE_SEP, @decH);
592 if ($map->{$contract}) {
595 $contract = join(CODE_SEP, @decH[0,1]);
596 $map->{$contract} and @decH = ($contract, $decH[2]);
598 # even if V's ignorable, LT contraction is not supported.
599 # If such a situatution were required, NFD should be used.
601 if (@decH == 3 && $max->{$decH[1]}) {
602 my $contract = join(CODE_SEP, @decH[1,2]);
603 $map->{$contract} and @decH = ($decH[0], $contract);
608 $map->{$_} ? @{ $map->{$_} } : $der->($_);
611 return map _varCE($vbl, $_), @hangulCE;
613 elsif (CJK_UidIni <= $u && $u <= CJK_UidFin ||
614 CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
615 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
616 my $cjk = $self->{overrideCJK};
617 return map _varCE($vbl, $_),
619 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
620 : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
621 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
625 return map _varCE($vbl, $_), $der->($u);
631 ## string sortkey = getSortKey(string arg)
636 my $lev = $self->{level};
637 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
638 my $ver9 = $self->{UCA_Version} >= 9;
639 my $v2i = $ver9 && $self->{variable} ne 'non-ignorable';
641 my @buf; # weight arrays
642 if ($self->{hangul_terminator}) {
644 foreach my $jcps (@$rEnt) {
645 # weird things like VL, TL-contraction are not considered!
647 foreach my $u (split /;/, $jcps) {
648 $curHST .= getHST($u);
650 if ($preHST && !$curHST || # hangul before non-hangul
651 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
652 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
653 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
655 push @buf, $self->getWtHangulTerm();
659 push @buf, $self->getWt($jcps);
661 $preHST # end at hangul
662 and push @buf, $self->getWtHangulTerm();
665 foreach my $jcps (@$rEnt) {
666 push @buf, $self->getWt($jcps);
671 my @ret = ([],[],[],[]);
672 my $last_is_variable;
674 foreach my $vwt (@buf) {
675 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
678 $last_is_variable = TRUE;
680 elsif (!$wt[0]) { # ignorable
681 next if $last_is_variable;
684 $last_is_variable = FALSE;
687 foreach my $v (0..$lev-1) {
688 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
692 # modification of tertiary weights
693 if ($self->{upper_before_lower}) {
694 foreach (@{ $ret[2] }) {
695 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
696 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
697 elsif ($_ == 0x1C) { $_ += 1 } # square upper
698 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
701 if ($self->{katakana_before_hiragana}) {
702 foreach (@{ $ret[2] }) {
703 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
704 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
708 if ($self->{backwardsFlag}) {
709 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
710 if ($self->{backwardsFlag} & (1 << $v)) {
711 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
716 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
721 ## int compare = cmp(string a, string b)
723 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
724 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
725 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
726 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
727 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
728 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
729 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
732 ## list[strings] sorted = sort(list[strings] arg)
738 sort{ $a->[0] cmp $b->[0] }
739 map [ $obj->getSortKey($_), $_ ], @_;
746 (CJK_UidIni <= $u && $u <= CJK_UidFin)
748 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
749 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
753 my $aaaa = $base + ($u >> 15);
754 my $bbbb = ($u & 0x7FFF) | 0x8000;
756 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
757 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
762 my $aaaa = 0xFF80 + ($code >> 15);
763 my $bbbb = ($code & 0x7FFF) | 0x8000;
765 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
766 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
770 sub getWtHangulTerm {
772 return _varCE($self->{variable},
773 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
778 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
780 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
783 # $code *must* be in Hangul syllable.
784 # Check it before you enter here.
788 my $SIndex = $code - Hangul_SBase;
789 my $LIndex = int( $SIndex / Hangul_NCount);
790 my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
791 my $TIndex = $SIndex % Hangul_TCount;
793 Hangul_LBase + $LIndex,
794 Hangul_VBase + $VIndex,
795 $TIndex ? (Hangul_TBase + $TIndex) : (),
801 return ! defined $code # removed
802 || ($code < 0 || 0x10FFFF < $code) # out of range
803 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
804 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
805 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
809 # Hangul Syllable Type
813 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
814 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
815 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
816 Hangul_SIni <= $u && $u <= Hangul_SFin ?
817 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
822 ## bool _nonIgnorAtLevel(arrayref weights, int level)
824 sub _nonIgnorAtLevel($$)
827 return if ! defined $wt;
829 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
834 ## arrayref of arrayref[weights] source,
835 ## arrayref of arrayref[weights] substr,
837 ## * comparison of graphemes vs graphemes.
838 ## @$source >= @$substr must be true (check it before call this);
846 for my $g (0..@$substr-1){
847 # Do the $g'th graphemes have the same number of AV weigths?
848 return if @{ $source->[$g] } != @{ $substr->[$g] };
850 for my $w (0..@{ $substr->[$g] }-1) {
851 for my $v (0..$lev-1) {
852 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
860 ## (int position, int length)
861 ## int position = index(string, substring, position, [undoc'ed grobal])
863 ## With "grobal" (only for the list context),
864 ## returns list of arrayref[position, length].
870 my $len = length($str);
871 my $subE = $self->splitEnt(shift);
872 my $pos = @_ ? shift : 0;
873 $pos = 0 if $pos < 0;
876 my $lev = $self->{level};
877 my $ver9 = $self->{UCA_Version} >= 9;
878 my $v2i = $self->{variable} ne 'non-ignorable';
881 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
883 ? map([$_, 0], $temp..$len)
884 : wantarray ? ($temp,0) : $temp;
887 and return wantarray ? () : NOMATCHPOS;
888 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
890 or return wantarray ? () : NOMATCHPOS;
892 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
894 my $last_is_variable;
895 for my $vwt (map $self->getWt($_), @$subE) {
896 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
897 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
901 $last_is_variable = TRUE;
903 elsif (!$wt[0]) { # ignorable
904 $to_be_pushed = FALSE if $last_is_variable;
907 $last_is_variable = FALSE;
911 if (@subWt && !$var && !$wt[0]) {
912 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
914 push @subWt, [ \@wt ];
919 my $end = @$strE - 1;
921 $last_is_variable = FALSE; # reuse
922 for (my $i = 0; $i <= $end; ) { # no $i++
926 while ($i <= $end && $found_base == 0) {
927 for my $vwt ($self->getWt($strE->[$i][0])) {
928 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
929 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
933 $last_is_variable = TRUE;
935 elsif (!$wt[0]) { # ignorable
936 $to_be_pushed = FALSE if $last_is_variable;
939 $last_is_variable = FALSE;
943 if (@strWt && !$var && !$wt[0]) {
944 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
945 $finPos[-1] = $strE->[$i][2];
946 } elsif ($to_be_pushed) {
947 push @strWt, [ \@wt ];
948 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
949 $finPos[-1] = NOMATCHPOS if $found_base;
950 push @finPos, $strE->[$i][2];
959 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
960 if ($iniPos[0] != NOMATCHPOS &&
961 $finPos[$#subWt] != NOMATCHPOS &&
962 _eqArray(\@strWt, \@subWt, $lev)) {
963 my $temp = $iniPos[0] + $pos;
966 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
967 splice @strWt, 0, $#subWt;
968 splice @iniPos, 0, $#subWt;
969 splice @finPos, 0, $#subWt;
973 ? ($temp, $finPos[$#subWt] - $iniPos[0])
985 : wantarray ? () : NOMATCHPOS;
989 ## scalarref to matching part = match(string, substring)
994 if (my($pos,$len) = $self->index($_[0], $_[1])) {
995 my $temp = substr($_[0], $pos, $len);
996 return wantarray ? $temp : \$temp;
997 # An lvalue ref \substr should be avoided,
998 # since its value is affected by modification of its referent.
1006 ## arrayref matching parts = gmatch(string, substring)
1013 return map substr($str, $_->[0], $_->[1]),
1014 $self->index($str, $sub, 0, 'g');
1018 ## bool subst'ed = subst(string, substring, replace)
1023 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1025 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1027 my $mat = substr($_[0], $pos, $len);
1028 substr($_[0], $pos, $len, $code->($mat));
1030 substr($_[0], $pos, $len, $_[2]);
1040 ## int count = gsubst(string, substring, replace)
1045 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1048 # Replacement is carried out from the end, then use reverse.
1049 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1051 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1052 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1054 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1066 Unicode::Collate - Unicode Collation Algorithm
1070 use Unicode::Collate;
1073 $Collator = Unicode::Collate->new(%tailoring);
1076 @sorted = $Collator->sort(@not_sorted);
1079 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1081 # If %tailoring is false (i.e. empty),
1082 # $Collator should do the default collation.
1086 This module is an implementation
1087 of Unicode Technical Standard #10 (UTS #10)
1088 "Unicode Collation Algorithm."
1090 =head2 Constructor and Tailoring
1092 The C<new> method returns a collator object.
1094 $Collator = Unicode::Collate->new(
1095 UCA_Version => $UCA_Version,
1096 alternate => $alternate, # deprecated: use of 'variable' is recommended.
1097 backwards => $levelNumber, # or \@levelNumbers
1099 hangul_terminator => $term_primary_weight,
1100 ignoreName => qr/$ignoreName/,
1101 ignoreChar => qr/$ignoreChar/,
1102 katakana_before_hiragana => $bool,
1103 level => $collationLevel,
1104 normalization => $normalization_form,
1105 overrideCJK => \&overrideCJK,
1106 overrideHangul => \&overrideHangul,
1107 preprocess => \&preprocess,
1108 rearrange => \@charList,
1110 undefName => qr/$undefName/,
1111 undefChar => qr/$undefChar/,
1112 upper_before_lower => $bool,
1113 variable => $variable,
1120 If the tracking version number of the older UCA is given,
1121 the older behavior of that tracking version is emulated on collating.
1122 If omitted, the return value of C<UCA_Version()> is used.
1124 The supported tracking version: 8, 9, or 11.
1126 B<This parameter may be removed in the future version,
1127 as switching the algorithm would affect the performance.>
1131 -- see 3.1.2 French Accents, UTS #10.
1133 backwards => $levelNumber or \@levelNumbers
1135 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1136 If omitted, forwards at all the levels.
1140 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1142 If the same character (or a sequence of characters) exists
1143 in the collation element table through C<table>,
1144 mapping to collation elements is overrided.
1145 If it does not exist, the mapping is defined additionally.
1147 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1148 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1149 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1150 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1151 006C 006C ; [.0F4C.0020.0002.006C] # ll
1152 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1153 004C 004C ; [.0F4C.0020.0008.004C] # LL
1154 00F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1155 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1156 00D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1157 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1160 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1161 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1162 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1165 B<NOTE:> The code point in the UCA file format (before C<';'>)
1166 B<must> be a Unicode code point (defined as hexadecimal),
1167 but not a native code point.
1168 So C<0063> must always denote C<U+0063>,
1169 but not a character of C<"\x63">.
1171 Weighting may vary depending on collation element table.
1172 So ensure the weights defined in C<entry> will be consistent with
1173 those in the collation element table loaded via C<table>.
1175 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1176 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1177 (as a value between C<0E60> and C<0E6D>)
1178 makes ordering as C<C E<lt> CH E<lt> D>.
1179 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1180 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1181 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1182 and C<c-curl> (C<U+0255>) with C<0E69>.
1183 Then primary weight C<0E6A> for C<CH> makes C<CH>
1184 ordered between C<c-curl> and C<D>.
1186 =item hangul_terminator
1188 -- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
1190 If a true value is given (non-zero but should be positive),
1191 it will be added as a terminator primary weight to the end of
1192 every standard Hangul syllable. Secondary and any higher weights
1193 for terminator are set to zero.
1194 If the value is false or C<hangul_terminator> key does not exist,
1195 insertion of terminator weights will not be performed.
1197 Boundaries of Hangul syllables are determined
1198 according to conjoining Jamo behavior in F<the Unicode Standard>
1199 and F<HangulSyllableType.txt>.
1201 B<Implementation Note:>
1202 (1) For expansion mapping (Unicode character mapped
1203 to a sequence of collation elements), a terminator will not be added
1204 between collation elements, even if Hangul syllable boundary exists there.
1205 Addition of terminator is restricted to the next position
1206 to the last collation element.
1208 (2) Non-conjoining Hangul letters
1209 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1210 automatically terminated with a terminator primary weight.
1211 These characters may need terminator included in a collation element
1218 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1220 Makes the entry in the table completely ignorable;
1221 i.e. as if the weights were zero at all level.
1223 E.g. when 'a' and 'e' are ignorable,
1224 'element' is equal to 'lament' (or 'lmnt').
1228 -- see 4.3 Form a sort key for each string, UTS #10.
1230 Set the maximum level.
1231 Any higher levels than the specified one are ignored.
1233 Level 1: alphabetic ordering
1234 Level 2: diacritic ordering
1235 Level 3: case ordering
1236 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1240 If omitted, the maximum is the 4th.
1244 -- see 4.1 Normalize each input string, UTS #10.
1246 If specified, strings are normalized before preparation of sort keys
1247 (the normalization is executed after preprocess).
1249 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1250 as C<$normalization_form>.
1251 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1252 See C<Unicode::Normalize::normalize()> for detail.
1253 If omitted, C<'NFD'> is used.
1255 C<normalization> is performed after C<preprocess> (if defined).
1257 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1258 though they are not concerned with C<Unicode::Normalize::normalize()>.
1260 If C<undef> (not a string C<"undef">) is passed explicitly
1261 as the value for this key,
1262 any normalization is not carried out (this may make tailoring easier
1263 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1264 only contiguous contractions are resolved;
1265 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1266 C<A-cedilla-ring> would be primary equal to C<A>.
1268 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1269 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1271 In the case of C<(normalization =E<gt> "prenormalized")>,
1272 any normalization is not performed, but
1273 non-contiguous contractions with combining characters are performed.
1275 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1276 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1277 If source strings are finely prenormalized,
1278 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1280 Except C<(normalization =E<gt> undef)>,
1281 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1285 -- see 7.1 Derived Collation Elements, UTS #10.
1287 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1288 (but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than
1289 C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
1290 C<U+20000> to C<U+2A6D6>].
1292 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1294 ex. CJK Unified Ideographs in the JIS code point order.
1296 overrideCJK => sub {
1297 my $u = shift; # get a Unicode codepoint
1298 my $b = pack('n', $u); # to UTF-16BE
1299 my $s = your_unicode_to_sjis_converter($b); # convert
1300 my $n = unpack('n', $s); # convert sjis to short
1301 [ $n, 0x20, 0x2, $u ]; # return the collation element
1304 ex. ignores all CJK Unified Ideographs.
1306 overrideCJK => sub {()}, # CODEREF returning empty list
1308 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1309 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1311 If C<undef> is passed explicitly as the value for this key,
1312 weights for CJK Unified Ideographs are treated as undefined.
1313 But assignment of weight for CJK Unified Ideographs
1314 in table or C<entry> is still valid.
1316 =item overrideHangul
1318 -- see 7.1 Derived Collation Elements, UTS #10.
1320 By default, Hangul Syllables are decomposed into Hangul Jamo,
1321 even if C<(normalization =E<gt> undef)>.
1322 But the mapping of Hangul Syllables may be overrided.
1324 This tag works like C<overrideCJK>, so see there for examples.
1326 If you want to override the mapping of Hangul Syllables,
1327 NFD, NFKD, and FCD are not appropriate,
1328 since they will decompose Hangul Syllables before overriding.
1330 If C<undef> is passed explicitly as the value for this key,
1331 weight for Hangul Syllables is treated as undefined
1332 without decomposition into Hangul Jamo.
1333 But definition of weight for Hangul Syllables
1334 in table or C<entry> is still valid.
1338 -- see 5.1 Preprocessing, UTS #10.
1340 If specified, the coderef is used to preprocess
1341 before the formation of sort keys.
1343 ex. dropping English articles, such as "a" or "the".
1344 Then, "the pen" is before "a pencil".
1348 $str =~ s/\b(?:an?|the)\s+//gi;
1352 C<preprocess> is performed before C<normalization> (if defined).
1356 -- see 3.1.3 Rearrangement, UTS #10.
1358 Characters that are not coded in logical order and to be rearranged.
1361 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1363 If you want to disallow any rearrangement,
1364 pass C<undef> or C<[]> (a reference to an empty list)
1365 as the value for this key.
1367 B<According to the version 9 of UCA, this parameter shall not be used;
1368 but it is not warned at present.>
1372 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1374 You can use another collation element table if desired.
1376 The table file should locate in the F<Unicode/Collate> directory
1377 on C<@INC>. Say, if the filename is F<Foo.txt>
1378 the table file is searched as F<Unicode/Collate/Foo.txt> in <@INC>.
1380 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1382 If C<undef> is passed explicitly as the value for this key,
1383 no file is read (but you can define collation elements via C<entry>).
1385 A typical way to define a collation element table
1386 without any file of table:
1388 $onlyABC = Unicode::Collate->new(
1390 entry => << 'ENTRIES',
1391 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1392 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1393 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1394 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1395 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1396 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1404 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1406 Undefines the collation element as if it were unassigned in the table.
1407 This reduces the size of the table.
1408 If an unassigned character appears in the string to be collated,
1409 the sort key is made from its codepoint
1410 as a single-character collation element,
1411 as it is greater than any other assigned collation elements
1412 (in the codepoint order among the unassigned characters).
1413 But, it'd be better to ignore characters
1414 unfamiliar to you and maybe never used.
1416 ex. Collation weights for beyond-BMP characters are not stored in object:
1418 undefChar => qr/[^\0-\x{fffd}]/,
1420 =item katakana_before_hiragana
1422 =item upper_before_lower
1424 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1426 By default, lowercase is before uppercase
1427 and hiragana is before katakana.
1429 If the tag is made true, this is reversed.
1431 B<NOTE>: These tags simplemindedly assume
1432 any lowercase/uppercase or hiragana/katakana distinctions
1433 must occur in level 3, and their weights at level 3
1434 must be same as those mentioned in 7.3.1, UTS #10.
1435 If you define your collation elements which violate this requirement,
1436 these tags don't work validly.
1442 -- see 3.2.2 Variable Weighting, UTS #10.
1444 (the title in UCA version 8: Alternate Weighting)
1446 This key allows to variable weighting for variable collation elements,
1447 which are marked with an ASTERISK in the table
1448 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1450 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1452 These names are case-insensitive.
1453 By default (if specification is omitted), 'shifted' is adopted.
1455 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1456 considered at the 4th level.
1458 'Non-Ignorable' Variable elements are not reset to ignorable.
1460 'Shifted' Variable elements are made ignorable at levels 1 through 3
1461 their level 4 weight is replaced by the old level 1 weight.
1462 Level 4 weight for Non-Variable elements is 0xFFFF.
1464 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1467 For backward compatibility, C<alternate> can be used as an alias
1472 =head2 Methods for Collation
1476 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1478 Sorts a list of strings.
1480 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1482 Returns 1 (when C<$a> is greater than C<$b>)
1483 or 0 (when C<$a> is equal to C<$b>)
1484 or -1 (when C<$a> is lesser than C<$b>).
1486 =item C<$result = $Collator-E<gt>eq($a, $b)>
1488 =item C<$result = $Collator-E<gt>ne($a, $b)>
1490 =item C<$result = $Collator-E<gt>lt($a, $b)>
1492 =item C<$result = $Collator-E<gt>le($a, $b)>
1494 =item C<$result = $Collator-E<gt>gt($a, $b)>
1496 =item C<$result = $Collator-E<gt>ge($a, $b)>
1498 They works like the same name operators as theirs.
1500 eq : whether $a is equal to $b.
1501 ne : whether $a is not equal to $b.
1502 lt : whether $a is lesser than $b.
1503 le : whether $a is lesser than $b or equal to $b.
1504 gt : whether $a is greater than $b.
1505 ge : whether $a is greater than $b or equal to $b.
1507 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1509 -- see 4.3 Form a sort key for each string, UTS #10.
1513 You compare the sort keys using a binary comparison
1514 and get the result of the comparison of the strings using UCA.
1516 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1520 $Collator->cmp($a, $b)
1522 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1524 use Unicode::Collate;
1525 my $c = Unicode::Collate->new();
1526 print $c->viewSortKey("Perl"),"\n";
1529 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1530 # Level 1 Level 2 Level 3 Level 4
1532 (If C<UCA_Version> is 8, the output is slightly different.)
1536 =head2 Methods for Searching
1538 B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1539 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1540 C<subst>, C<gsubst>) is croaked,
1541 as the position and the length might differ
1542 from those on the specified string.
1543 (And C<rearrange> and C<hangul_terminator> tags are neglected.)
1545 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1546 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1547 but they are not aware of any pattern, but only a literal substring.
1551 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1553 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1555 If C<$substring> matches a part of C<$string>, returns
1556 the position of the first occurrence of the matching part in scalar context;
1557 in list context, returns a two-element list of
1558 the position and the length of the matching part.
1560 If C<$substring> does not match any part of C<$string>,
1561 returns C<-1> in scalar context and
1562 an empty list in list context.
1566 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1567 # (normalization => undef) is REQUIRED.
1568 my $str = "Ich muß studieren Perl.";
1571 if (my($pos,$len) = $Collator->index($str, $sub)) {
1572 $match = substr($str, $pos, $len);
1575 and get C<"muß"> in C<$match> since C<"muß">
1576 is primary equal to C<"MÜSS">.
1578 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1580 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1582 If C<$substring> matches a part of C<$string>, in scalar context, returns
1583 B<a reference to> the first occurrence of the matching part
1584 (C<$match_ref> is always true if matches,
1585 since every reference is B<true>);
1586 in list context, returns the first occurrence of the matching part.
1588 If C<$substring> does not match any part of C<$string>,
1589 returns C<undef> in scalar context and
1590 an empty list in list context.
1594 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1595 print "matches [$$match_ref].\n";
1597 print "doesn't match.\n";
1602 if (($match) = $Collator->match($str, $sub)) { # list context
1603 print "matches [$match].\n";
1605 print "doesn't match.\n";
1608 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1610 If C<$substring> matches a part of C<$string>, returns
1611 all the matching parts (or matching count in scalar context).
1613 If C<$substring> does not match any part of C<$string>,
1614 returns an empty list.
1616 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1618 If C<$substring> matches a part of C<$string>,
1619 the first occurrence of the matching part is replaced by C<$replacement>
1620 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1622 C<$replacement> can be a C<CODEREF>,
1623 taking the matching part as an argument,
1624 and returning a string to replace the matching part
1625 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1627 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1629 If C<$substring> matches a part of C<$string>,
1630 all the occurrences of the matching part is replaced by C<$replacement>
1631 (C<$string> is modified) and return C<$count>.
1633 C<$replacement> can be a C<CODEREF>,
1634 taking the matching part as an argument,
1635 and returning a string to replace the matching part
1636 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1640 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1641 # (normalization => undef) is REQUIRED.
1642 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1643 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1645 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1646 # i.e., all the camels are made bold-faced.
1650 =head2 Other Methods
1654 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1656 Change the value of specified keys and returns the changed part.
1658 $Collator = Unicode::Collate->new(level => 4);
1660 $Collator->eq("perl", "PERL"); # false
1662 %old = $Collator->change(level => 2); # returns (level => 4).
1664 $Collator->eq("perl", "PERL"); # true
1666 $Collator->change(%old); # returns (level => 2).
1668 $Collator->eq("perl", "PERL"); # false
1670 Not all C<(key,value)>s are allowed to be changed.
1671 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1673 In the scalar context, returns the modified collator
1674 (but it is B<not> a clone from the original).
1676 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1678 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1680 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1682 =item C<$version = $Collator-E<gt>version()>
1684 Returns the version number (a string) of the Unicode Standard
1685 which the C<table> file used by the collator object is based on.
1686 If the table does not include a version line (starting with C<@version>),
1687 returns C<"unknown">.
1689 =item C<UCA_Version()>
1691 Returns the tracking version number of UTS #10 this module consults.
1693 =item C<Base_Unicode_Version()>
1695 Returns the version number of UTS #10 this module consults.
1705 Use of the C<normalization> parameter requires
1706 the B<Unicode::Normalize> module.
1708 If you need not it (say, in the case when you need not
1709 handle any combining characters),
1710 assign C<normalization =E<gt> undef> explicitly.
1712 -- see 6.5 Avoiding Normalization, UTS #10.
1714 =head2 Conformance Test
1716 The Conformance Test for the UCA is available
1717 under L<http://www.unicode.org/Public/UCA/>.
1719 For F<CollationTest_SHIFTED.txt>,
1720 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1721 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1722 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1724 B<Unicode::Normalize is required to try The Conformance Test.>
1728 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
1730 http://homepage1.nifty.com/nomenclator/perl/
1732 Copyright(C) 2001-2004, SADAHIRO Tomoyuki. Japan. All rights reserved.
1734 This library is free software; you can redistribute it
1735 and/or modify it under the same terms as Perl itself.
1741 =item Unicode Collation Algorithm - UTS #10
1743 L<http://www.unicode.org/reports/tr10/>
1745 =item The Default Unicode Collation Element Table (DUCET)
1747 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1749 =item The conformance test for the UCA
1751 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1753 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1755 =item Hangul Syllable Type
1757 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1759 =item Unicode Normalization Forms - UAX #15
1761 L<http://www.unicode.org/reports/tr15/>
1763 =item L<Unicode::Normalize>