1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
17 our $VERSION = '0.50';
18 our $PACKAGE = __PACKAGE__;
20 my @Path = qw(Unicode Collate);
21 my $KeyFile = "allkeys.txt";
24 use constant TRUE => 1;
25 use constant FALSE => "";
26 use constant NOMATCHPOS => -1;
28 # A coderef to get combining class imported from Unicode::Normalize
29 # (i.e. \&Unicode::Normalize::getCombinClass).
30 # This is also used as a HAS_UNICODE_NORMALIZE flag.
34 use constant MinLevel => 1;
35 use constant MaxLevel => 4;
37 # Minimum weights at level 2 and 3, respectively
38 use constant Min2Wt => 0x20;
39 use constant Min3Wt => 0x02;
41 # Shifted weight at 4th level
42 use constant Shift4Wt => 0xFFFF;
44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45 # PROBLEM: The Default Unicode Collation Element Table
46 # has weights over 0xFFFF at the 4th level.
47 # The tie-breaking in the variable weights
48 # other than "shift" (as well as "shift-trimmed") is unreliable.
49 use constant VCE_TEMPLATE => 'Cn4';
51 # A sort key: 16-bit weights
52 # See also the PROBLEM on VCE_TEMPLATE above.
53 use constant KEY_TEMPLATE => 'n*';
55 # Level separator in a sort key:
56 # i.e. pack(KEY_TEMPLATE, 0)
57 use constant LEVEL_SEP => "\0\0";
59 # As Unicode code point separator for hash keys.
60 # A joined code point string (denoted by JCPS below)
61 # like "65;768" is used for internal processing
62 # instead of Perl's Unicode string like "\x41\x{300}",
63 # as the native code point is different from the Unicode code point
65 # This character must not be included in any stringified
66 # representation of an integer.
67 use constant CODE_SEP => ';';
69 # boolean values of variable weights
70 use constant NON_VAR => 0; # Non-Variable character
71 use constant VAR => 1; # Variable character
73 # specific code points
74 use constant Hangul_LBase => 0x1100;
75 use constant Hangul_LIni => 0x1100;
76 use constant Hangul_LFin => 0x1159;
77 use constant Hangul_LFill => 0x115F;
78 use constant Hangul_VBase => 0x1161;
79 use constant Hangul_VIni => 0x1160; # from Vowel Filler
80 use constant Hangul_VFin => 0x11A2;
81 use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
82 use constant Hangul_TIni => 0x11A8;
83 use constant Hangul_TFin => 0x11F9;
84 use constant Hangul_TCount => 28;
85 use constant Hangul_NCount => 588;
86 use constant Hangul_SBase => 0xAC00;
87 use constant Hangul_SIni => 0xAC00;
88 use constant Hangul_SFin => 0xD7A3;
89 use constant CJK_UidIni => 0x4E00;
90 use constant CJK_UidFin => 0x9FA5;
91 use constant CJK_UidF41 => 0x9FBB;
92 use constant CJK_ExtAIni => 0x3400;
93 use constant CJK_ExtAFin => 0x4DB5;
94 use constant CJK_ExtBIni => 0x20000;
95 use constant CJK_ExtBFin => 0x2A6D6;
96 use constant BMP_Max => 0xFFFF;
98 # Logical_Order_Exception in PropList.txt
99 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
101 sub UCA_Version { "14" }
103 sub Base_Unicode_Version { "4.1.0" }
108 return pack('U*', @_);
112 return unpack('U*', shift(@_).pack('U*'));
119 blanked non-ignorable shifted shift-trimmed
120 / } = (); # keys lowercased
123 alternate backwards level normalization rearrange
124 katakana_before_hiragana upper_before_lower
125 overrideHangul overrideCJK preprocess UCA_Version
126 hangul_terminator variable
130 entry mapping table maxlength
131 ignoreChar ignoreName undefChar undefName variableTable
132 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
133 derivCode normCode rearrangeHash
136 # The hash key 'ignored' is deleted at v 0.21.
137 # The hash key 'isShift' is deleted at v 0.23.
138 # The hash key 'combining' is deleted at v 0.24.
139 # The hash key 'entries' is deleted at v 0.30.
140 # The hash key 'L3_ignorable' is deleted at v 0.40.
144 return $self->{versionTable} || 'unknown';
147 my (%ChangeOK, %ChangeNG);
148 @ChangeOK{ @ChangeOK } = ();
149 @ChangeNG{ @ChangeNG } = ();
155 if (exists $hash{variable} && exists $hash{alternate}) {
156 delete $hash{alternate};
158 elsif (!exists $hash{variable} && exists $hash{alternate}) {
159 $hash{variable} = $hash{alternate};
161 foreach my $k (keys %hash) {
162 if (exists $ChangeOK{$k}) {
163 $old{$k} = $self->{$k};
164 $self->{$k} = $hash{$k};
166 elsif (exists $ChangeNG{$k}) {
167 croak "change of $k via change() is not allowed!";
171 $self->checkCollator();
172 return wantarray ? %old : $self;
177 my $key = shift; # 'level' or 'backwards'
178 MinLevel <= $level or croak sprintf
179 "Illegal level %d (in value for key '%s') lower than %d.",
180 $level, $key, MinLevel;
181 $level <= MaxLevel or croak sprintf
182 "Unsupported level %d (in value for key '%s') higher than %d.",
183 $level, $key, MaxLevel;
189 11 => \&_derivCE_9, # 11 == 9
195 _checkLevel($self->{level}, "level");
197 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
198 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
200 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
201 $self->{alternateTable} || 'shifted';
202 $self->{variable} = $self->{alternate} = lc($self->{variable});
203 exists $VariableOK{ $self->{variable} }
204 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
206 if (! defined $self->{backwards}) {
207 $self->{backwardsFlag} = 0;
209 elsif (! ref $self->{backwards}) {
210 _checkLevel($self->{backwards}, "backwards");
211 $self->{backwardsFlag} = 1 << $self->{backwards};
215 $self->{backwardsFlag} = 0;
216 for my $b (@{ $self->{backwards} }) {
217 _checkLevel($b, "backwards");
220 for my $v (sort keys %level) {
221 $self->{backwardsFlag} += 1 << $v;
225 defined $self->{rearrange} or $self->{rearrange} = [];
226 ref $self->{rearrange}
227 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
229 # keys of $self->{rearrangeHash} are $self->{rearrange}.
230 $self->{rearrangeHash} = undef;
232 if (@{ $self->{rearrange} }) {
233 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
236 $self->{normCode} = undef;
238 if (defined $self->{normalization}) {
239 eval { require Unicode::Normalize };
240 $@ and croak "Unicode::Normalize is required to normalize strings";
242 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
244 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
245 $self->{normCode} = \&Unicode::Normalize::NFD;
247 elsif ($self->{normalization} ne 'prenormalized') {
248 my $norm = $self->{normalization};
249 $self->{normCode} = sub {
250 Unicode::Normalize::normalize($norm, shift);
252 eval { $self->{normCode}->("") }; # try
253 $@ and croak "$PACKAGE unknown normalization form name: $norm";
262 my $self = bless { @_ }, $class;
264 # If undef is passed explicitly, no file is read.
265 $self->{table} = $KeyFile if ! exists $self->{table};
266 $self->read_table() if defined $self->{table};
268 if ($self->{entry}) {
269 while ($self->{entry} =~ /([^\n]+)/g) {
270 $self->parseEntry($1);
274 $self->{level} ||= MaxLevel;
275 $self->{UCA_Version} ||= UCA_Version();
277 $self->{overrideHangul} = FALSE
278 if ! exists $self->{overrideHangul};
279 $self->{overrideCJK} = FALSE
280 if ! exists $self->{overrideCJK};
281 $self->{normalization} = 'NFD'
282 if ! exists $self->{normalization};
283 $self->{rearrange} = $self->{rearrangeTable} ||
284 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
285 if ! exists $self->{rearrange};
286 $self->{backwards} = $self->{backwardsTable}
287 if ! exists $self->{backwards};
289 $self->checkCollator();
298 foreach my $d (@INC) {
299 $f = File::Spec->catfile($d, @Path, $self->{table});
300 last if open($fh, $f);
304 or croak "$PACKAGE: $self->{table} is not found in @INC";
308 unless (s/^\s*\@//) {
309 $self->parseEntry($_);
313 if (/^version\s*(\S*)/) {
314 $self->{versionTable} ||= $1;
316 elsif (/^variable\s+(\S*)/) { # since UTS #10-9
317 $self->{variableTable} ||= $1;
319 elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
320 $self->{alternateTable} ||= $1;
322 elsif (/^backwards\s+(\S*)/) {
323 push @{ $self->{backwardsTable} }, $1;
325 elsif (/^forwards\s+(\S*)/) { # parhaps no use
326 push @{ $self->{forwardsTable} }, $1;
328 elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
329 push @{ $self->{rearrangeTable} }, _getHexArray($1);
337 ## get $line, parse it, and write an entry in $self
343 my($name, $entry, @uv, @key);
345 return if $line !~ /^\s*[0-9A-Fa-f]/;
347 # removes comment and gets name
349 if $line =~ s/[#%]\s*(.*)//;
350 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
353 my($e, $k) = split /;/, $line;
354 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
357 @uv = _getHexArray($e);
360 $entry = join(CODE_SEP, @uv); # in JCPS
362 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
363 my $ele = pack_U(@uv);
365 # regarded as if it were not entried in the table
367 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
369 # replaced as completely ignorable
370 $k = '[.0000.0000.0000.0000]'
371 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
374 # replaced as completely ignorable
375 $k = '[.0000.0000.0000.0000]'
376 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
378 my $is_L3_ignorable = TRUE;
380 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
381 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
382 my @wt = _getHexArray($arr);
383 push @key, pack(VCE_TEMPLATE, $var, @wt);
384 $is_L3_ignorable = FALSE
385 if $wt[0] || $wt[1] || $wt[2];
386 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
387 # is completely ignorable.
388 # For expansion, an entry $is_L3_ignorable
389 # if and only if "all" CEs are [.0000.0000.0000].
392 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
395 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
396 and $self->{maxlength}{$uv[0]} = @uv;
402 ## VCE = _varCE(variable term, VCE)
408 if ($vbl eq 'non-ignorable') {
411 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
414 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
415 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
417 elsif ($vbl eq 'blanked') {
421 return pack(VCE_TEMPLATE, $var, @wt[0..2],
422 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
429 $self->visualizeSortKey($self->getSortKey(@_));
435 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
437 if ($self->{UCA_Version} <= 8) {
438 $view =~ s/ ?0000 ?/|/g;
440 $view =~ s/\b0000\b/|/g;
447 ## arrayref of JCPS = splitEnt(string to be collated)
448 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
455 my $code = $self->{preprocess};
456 my $norm = $self->{normCode};
457 my $map = $self->{mapping};
458 my $max = $self->{maxlength};
459 my $reH = $self->{rearrangeHash};
460 my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
465 $code and croak "Preprocess breaks character positions. "
466 . "Don't use with index(), match(), etc.";
467 $norm and croak "Normalization breaks character positions. "
468 . "Don't use with index(), match(), etc.";
473 $str = &$code($str) if ref $code;
474 $str = &$norm($str) if ref $norm;
477 # get array of Unicode code point of string.
478 my @src = unpack_U($str);
481 # Character positions are not kept if rearranged,
482 # then neglected if $wLen is true.
483 if ($reH && ! $wLen) {
484 for (my $i = 0; $i < @src; $i++) {
485 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
486 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
492 # remove a code point marked as a completely ignorable.
493 for (my $i = 0; $i < @src; $i++) {
495 if _isIllegal($src[$i]) || ($ver9 &&
496 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
499 for (my $i = 0; $i < @src; $i++) {
502 # skip removed code point
503 if (! defined $jcps) {
505 $buf[-1][2] = $i + 1;
514 my $temp_jcps = $jcps;
516 my $maxLen = $max->{$jcps};
518 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
519 next if ! defined $src[$p];
520 $temp_jcps .= CODE_SEP . $src[$p];
522 if ($map->{$temp_jcps}) {
528 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
529 # This process requires Unicode::Normalize.
530 # If "normalization" is undef, here should be skipped *always*
531 # (in spite of bool value of $CVgetCombinClass),
532 # since canonical ordering cannot be expected.
533 # Blocked combining character should not be contracted.
535 if ($self->{normalization})
536 # $self->{normCode} is false in the case of "prenormalized".
541 for (my $p = $i + 1; $p < @src; $p++) {
542 next if ! defined $src[$p];
543 $curCC = $CVgetCombinClass->($src[$p]);
545 my $tail = CODE_SEP . $src[$p];
546 if ($preCC != $curCC && $map->{$jcps.$tail}) {
556 # skip completely ignorable
557 if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
559 $buf[-1][2] = $i + 1;
564 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
571 ## list of VCE = getWt(JCPS)
577 my $vbl = $self->{variable};
578 my $map = $self->{mapping};
579 my $der = $self->{derivCode};
581 return if !defined $u;
582 return map(_varCE($vbl, $_), @{ $map->{$u} })
585 # JCPS must not be a contraction, then it's a code point.
586 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
587 my $hang = $self->{overrideHangul};
590 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
592 elsif (!defined $hang) {
593 @hangulCE = $der->($u);
596 my $max = $self->{maxlength};
597 my @decH = _decompHangul($u);
600 my $contract = join(CODE_SEP, @decH);
601 @decH = ($contract) if $map->{$contract};
602 } else { # must be <@decH == 3>
603 if ($max->{$decH[0]}) {
604 my $contract = join(CODE_SEP, @decH);
605 if ($map->{$contract}) {
608 $contract = join(CODE_SEP, @decH[0,1]);
609 $map->{$contract} and @decH = ($contract, $decH[2]);
611 # even if V's ignorable, LT contraction is not supported.
612 # If such a situatution were required, NFD should be used.
614 if (@decH == 3 && $max->{$decH[1]}) {
615 my $contract = join(CODE_SEP, @decH[1,2]);
616 $map->{$contract} and @decH = ($decH[0], $contract);
621 $map->{$_} ? @{ $map->{$_} } : $der->($_);
624 return map _varCE($vbl, $_), @hangulCE;
626 elsif (_isUIdeo($u, $self->{UCA_Version})) {
627 my $cjk = $self->{overrideCJK};
628 return map _varCE($vbl, $_),
630 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
631 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
636 return map _varCE($vbl, $_), $der->($u);
642 ## string sortkey = getSortKey(string arg)
647 my $lev = $self->{level};
648 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
649 my $v2i = $self->{UCA_Version} >= 9 &&
650 $self->{variable} ne 'non-ignorable';
652 my @buf; # weight arrays
653 if ($self->{hangul_terminator}) {
655 foreach my $jcps (@$rEnt) {
656 # weird things like VL, TL-contraction are not considered!
658 foreach my $u (split /;/, $jcps) {
659 $curHST .= getHST($u);
661 if ($preHST && !$curHST || # hangul before non-hangul
662 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
663 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
664 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
666 push @buf, $self->getWtHangulTerm();
670 push @buf, $self->getWt($jcps);
672 $preHST # end at hangul
673 and push @buf, $self->getWtHangulTerm();
676 foreach my $jcps (@$rEnt) {
677 push @buf, $self->getWt($jcps);
682 my @ret = ([],[],[],[]);
683 my $last_is_variable;
685 foreach my $vwt (@buf) {
686 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
688 # "Ignorable (L1, L2) after Variable" since track. v. 9
691 $last_is_variable = TRUE;
693 elsif (!$wt[0]) { # ignorable
694 next if $last_is_variable;
697 $last_is_variable = FALSE;
700 foreach my $v (0..$lev-1) {
701 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
705 # modification of tertiary weights
706 if ($self->{upper_before_lower}) {
707 foreach (@{ $ret[2] }) {
708 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
709 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
710 elsif ($_ == 0x1C) { $_ += 1 } # square upper
711 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
714 if ($self->{katakana_before_hiragana}) {
715 foreach (@{ $ret[2] }) {
716 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
717 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
721 if ($self->{backwardsFlag}) {
722 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
723 if ($self->{backwardsFlag} & (1 << $v)) {
724 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
729 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
734 ## int compare = cmp(string a, string b)
736 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
737 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
738 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
739 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
740 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
741 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
742 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
745 ## list[strings] sorted = sort(list[strings] arg)
751 sort{ $a->[0] cmp $b->[0] }
752 map [ $obj->getSortKey($_), $_ ], @_;
759 (CJK_UidIni <= $u && $u <= CJK_UidF41)
761 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
762 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
766 my $aaaa = $base + ($u >> 15);
767 my $bbbb = ($u & 0x7FFF) | 0x8000;
769 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
770 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
776 (CJK_UidIni <= $u && $u <= CJK_UidFin)
778 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
779 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
783 my $aaaa = $base + ($u >> 15);
784 my $bbbb = ($u & 0x7FFF) | 0x8000;
786 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
787 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
792 my $aaaa = 0xFF80 + ($code >> 15);
793 my $bbbb = ($code & 0x7FFF) | 0x8000;
795 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
796 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
801 return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
805 my ($u, $uca_vers) = @_;
808 ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
810 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
812 (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
817 sub getWtHangulTerm {
819 return _varCE($self->{variable},
820 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
825 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
827 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
830 # $code *must* be in Hangul syllable.
831 # Check it before you enter here.
835 my $si = $code - Hangul_SBase;
836 my $li = int( $si / Hangul_NCount);
837 my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
838 my $ti = $si % Hangul_TCount;
842 $ti ? (Hangul_TBase + $ti) : (),
848 return ! defined $code # removed
849 || ($code < 0 || 0x10FFFF < $code) # out of range
850 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
851 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
852 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
856 # Hangul Syllable Type
860 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
861 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
862 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
863 Hangul_SIni <= $u && $u <= Hangul_SFin ?
864 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
869 ## bool _nonIgnorAtLevel(arrayref weights, int level)
871 sub _nonIgnorAtLevel($$)
874 return if ! defined $wt;
876 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
881 ## arrayref of arrayref[weights] source,
882 ## arrayref of arrayref[weights] substr,
884 ## * comparison of graphemes vs graphemes.
885 ## @$source >= @$substr must be true (check it before call this);
893 for my $g (0..@$substr-1){
894 # Do the $g'th graphemes have the same number of AV weigths?
895 return if @{ $source->[$g] } != @{ $substr->[$g] };
897 for my $w (0..@{ $substr->[$g] }-1) {
898 for my $v (0..$lev-1) {
899 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
907 ## (int position, int length)
908 ## int position = index(string, substring, position, [undoc'ed grobal])
910 ## With "grobal" (only for the list context),
911 ## returns list of arrayref[position, length].
917 my $len = length($str);
918 my $subE = $self->splitEnt(shift);
919 my $pos = @_ ? shift : 0;
920 $pos = 0 if $pos < 0;
923 my $lev = $self->{level};
924 my $v2i = $self->{UCA_Version} >= 9 &&
925 $self->{variable} ne 'non-ignorable';
928 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
930 ? map([$_, 0], $temp..$len)
931 : wantarray ? ($temp,0) : $temp;
934 and return wantarray ? () : NOMATCHPOS;
935 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
937 or return wantarray ? () : NOMATCHPOS;
939 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
941 my $last_is_variable;
942 for my $vwt (map $self->getWt($_), @$subE) {
943 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
944 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
946 # "Ignorable (L1, L2) after Variable" since track. v. 9
949 $last_is_variable = TRUE;
951 elsif (!$wt[0]) { # ignorable
952 $to_be_pushed = FALSE if $last_is_variable;
955 $last_is_variable = FALSE;
959 if (@subWt && !$var && !$wt[0]) {
960 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
962 push @subWt, [ \@wt ];
967 my $end = @$strE - 1;
969 $last_is_variable = FALSE; # reuse
970 for (my $i = 0; $i <= $end; ) { # no $i++
974 while ($i <= $end && $found_base == 0) {
975 for my $vwt ($self->getWt($strE->[$i][0])) {
976 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
977 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
979 # "Ignorable (L1, L2) after Variable" since track. v. 9
982 $last_is_variable = TRUE;
984 elsif (!$wt[0]) { # ignorable
985 $to_be_pushed = FALSE if $last_is_variable;
988 $last_is_variable = FALSE;
992 if (@strWt && !$var && !$wt[0]) {
993 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
994 $finPos[-1] = $strE->[$i][2];
995 } elsif ($to_be_pushed) {
996 push @strWt, [ \@wt ];
997 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
998 $finPos[-1] = NOMATCHPOS if $found_base;
999 push @finPos, $strE->[$i][2];
1008 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1009 if ($iniPos[0] != NOMATCHPOS &&
1010 $finPos[$#subWt] != NOMATCHPOS &&
1011 _eqArray(\@strWt, \@subWt, $lev)) {
1012 my $temp = $iniPos[0] + $pos;
1015 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1016 splice @strWt, 0, $#subWt;
1017 splice @iniPos, 0, $#subWt;
1018 splice @finPos, 0, $#subWt;
1022 ? ($temp, $finPos[$#subWt] - $iniPos[0])
1034 : wantarray ? () : NOMATCHPOS;
1038 ## scalarref to matching part = match(string, substring)
1043 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1044 my $temp = substr($_[0], $pos, $len);
1045 return wantarray ? $temp : \$temp;
1046 # An lvalue ref \substr should be avoided,
1047 # since its value is affected by modification of its referent.
1055 ## arrayref matching parts = gmatch(string, substring)
1062 return map substr($str, $_->[0], $_->[1]),
1063 $self->index($str, $sub, 0, 'g');
1067 ## bool subst'ed = subst(string, substring, replace)
1072 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1074 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1076 my $mat = substr($_[0], $pos, $len);
1077 substr($_[0], $pos, $len, $code->($mat));
1079 substr($_[0], $pos, $len, $_[2]);
1089 ## int count = gsubst(string, substring, replace)
1094 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1097 # Replacement is carried out from the end, then use reverse.
1098 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1100 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1101 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1103 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1115 Unicode::Collate - Unicode Collation Algorithm
1119 use Unicode::Collate;
1122 $Collator = Unicode::Collate->new(%tailoring);
1125 @sorted = $Collator->sort(@not_sorted);
1128 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1130 # If %tailoring is false (i.e. empty),
1131 # $Collator should do the default collation.
1135 This module is an implementation of Unicode Technical Standard #10
1136 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1138 =head2 Constructor and Tailoring
1140 The C<new> method returns a collator object.
1142 $Collator = Unicode::Collate->new(
1143 UCA_Version => $UCA_Version,
1144 alternate => $alternate, # deprecated: use of 'variable' is recommended.
1145 backwards => $levelNumber, # or \@levelNumbers
1147 hangul_terminator => $term_primary_weight,
1148 ignoreName => qr/$ignoreName/,
1149 ignoreChar => qr/$ignoreChar/,
1150 katakana_before_hiragana => $bool,
1151 level => $collationLevel,
1152 normalization => $normalization_form,
1153 overrideCJK => \&overrideCJK,
1154 overrideHangul => \&overrideHangul,
1155 preprocess => \&preprocess,
1156 rearrange => \@charList,
1158 undefName => qr/$undefName/,
1159 undefChar => qr/$undefChar/,
1160 upper_before_lower => $bool,
1161 variable => $variable,
1168 If the tracking version number of UCA is given,
1169 behavior of that tracking version is emulated on collating.
1170 If omitted, the return value of C<UCA_Version()> is used.
1171 C<UCA_Version()> should return the latest tracking version supported.
1173 The supported tracking version: 8, 9, 11, or 14.
1175 UCA tracking version Unicode version
1177 9 3.1 with Corrigendum 3
1181 Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1185 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1187 For backward compatibility, C<alternate> (old name) can be used
1188 as an alias for C<variable>.
1192 -- see 3.1.2 French Accents, UTS #10.
1194 backwards => $levelNumber or \@levelNumbers
1196 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1197 If omitted, forwards at all the levels.
1201 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1203 If the same character (or a sequence of characters) exists
1204 in the collation element table through C<table>,
1205 mapping to collation elements is overrided.
1206 If it does not exist, the mapping is defined additionally.
1208 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1209 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1210 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1211 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1212 006C 006C ; [.0F4C.0020.0002.006C] # ll
1213 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1214 004C 004C ; [.0F4C.0020.0008.004C] # LL
1215 00F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1216 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1217 00D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1218 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1221 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1222 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1223 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1226 B<NOTE:> The code point in the UCA file format (before C<';'>)
1227 B<must> be a Unicode code point (defined as hexadecimal),
1228 but not a native code point.
1229 So C<0063> must always denote C<U+0063>,
1230 but not a character of C<"\x63">.
1232 Weighting may vary depending on collation element table.
1233 So ensure the weights defined in C<entry> will be consistent with
1234 those in the collation element table loaded via C<table>.
1236 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1237 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1238 (as a value between C<0E60> and C<0E6D>)
1239 makes ordering as C<C E<lt> CH E<lt> D>.
1240 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1241 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1242 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1243 and C<c-curl> (C<U+0255>) with C<0E69>.
1244 Then primary weight C<0E6A> for C<CH> makes C<CH>
1245 ordered between C<c-curl> and C<D>.
1247 =item hangul_terminator
1249 -- see 7.1.4 Trailing Weights, UTS #10.
1251 If a true value is given (non-zero but should be positive),
1252 it will be added as a terminator primary weight to the end of
1253 every standard Hangul syllable. Secondary and any higher weights
1254 for terminator are set to zero.
1255 If the value is false or C<hangul_terminator> key does not exist,
1256 insertion of terminator weights will not be performed.
1258 Boundaries of Hangul syllables are determined
1259 according to conjoining Jamo behavior in F<the Unicode Standard>
1260 and F<HangulSyllableType.txt>.
1262 B<Implementation Note:>
1263 (1) For expansion mapping (Unicode character mapped
1264 to a sequence of collation elements), a terminator will not be added
1265 between collation elements, even if Hangul syllable boundary exists there.
1266 Addition of terminator is restricted to the next position
1267 to the last collation element.
1269 (2) Non-conjoining Hangul letters
1270 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1271 automatically terminated with a terminator primary weight.
1272 These characters may need terminator included in a collation element
1279 -- see 3.2.2 Variable Weighting, UTS #10.
1281 Makes the entry in the table completely ignorable;
1282 i.e. as if the weights were zero at all level.
1284 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1285 will be ignored. Through C<ignoreName>, any character whose name
1286 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1289 E.g. when 'a' and 'e' are ignorable,
1290 'element' is equal to 'lament' (or 'lmnt').
1292 =item katakana_before_hiragana
1294 -- see 7.3.1 Tertiary Weight Table, UTS #10.
1296 By default, hiragana is before katakana.
1297 If the parameter is made true, this is reversed.
1299 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1300 distinctions must occur in level 3, and their weights at level 3 must be
1301 same as those mentioned in 7.3.1, UTS #10.
1302 If you define your collation elements which violate this requirement,
1303 this parameter does not work validly.
1307 -- see 4.3 Form Sort Key, UTS #10.
1309 Set the maximum level.
1310 Any higher levels than the specified one are ignored.
1312 Level 1: alphabetic ordering
1313 Level 2: diacritic ordering
1314 Level 3: case ordering
1315 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1319 If omitted, the maximum is the 4th.
1323 -- see 4.1 Normalize, UTS #10.
1325 If specified, strings are normalized before preparation of sort keys
1326 (the normalization is executed after preprocess).
1328 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1329 as C<$normalization_form>.
1330 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1331 See C<Unicode::Normalize::normalize()> for detail.
1332 If omitted, C<'NFD'> is used.
1334 C<normalization> is performed after C<preprocess> (if defined).
1336 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1337 though they are not concerned with C<Unicode::Normalize::normalize()>.
1339 If C<undef> (not a string C<"undef">) is passed explicitly
1340 as the value for this key,
1341 any normalization is not carried out (this may make tailoring easier
1342 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1343 only contiguous contractions are resolved;
1344 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1345 C<A-cedilla-ring> would be primary equal to C<A>.
1347 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1348 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1350 In the case of C<(normalization =E<gt> "prenormalized")>,
1351 any normalization is not performed, but
1352 non-contiguous contractions with combining characters are performed.
1354 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1355 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1356 If source strings are finely prenormalized,
1357 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1359 Except C<(normalization =E<gt> undef)>,
1360 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1364 -- see 7.1 Derived Collation Elements, UTS #10.
1366 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1367 but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1368 C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1369 are lesser than C<CJK Unified Ideographs Extension> (its range is
1370 C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
1372 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1374 ex. CJK Unified Ideographs in the JIS code point order.
1376 overrideCJK => sub {
1377 my $u = shift; # get a Unicode codepoint
1378 my $b = pack('n', $u); # to UTF-16BE
1379 my $s = your_unicode_to_sjis_converter($b); # convert
1380 my $n = unpack('n', $s); # convert sjis to short
1381 [ $n, 0x20, 0x2, $u ]; # return the collation element
1384 ex. ignores all CJK Unified Ideographs.
1386 overrideCJK => sub {()}, # CODEREF returning empty list
1388 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1389 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1391 If C<undef> is passed explicitly as the value for this key,
1392 weights for CJK Unified Ideographs are treated as undefined.
1393 But assignment of weight for CJK Unified Ideographs
1394 in table or C<entry> is still valid.
1396 =item overrideHangul
1398 -- see 7.1 Derived Collation Elements, UTS #10.
1400 By default, Hangul Syllables are decomposed into Hangul Jamo,
1401 even if C<(normalization =E<gt> undef)>.
1402 But the mapping of Hangul Syllables may be overrided.
1404 This parameter works like C<overrideCJK>, so see there for examples.
1406 If you want to override the mapping of Hangul Syllables,
1407 NFD, NFKD, and FCD are not appropriate,
1408 since they will decompose Hangul Syllables before overriding.
1410 If C<undef> is passed explicitly as the value for this key,
1411 weight for Hangul Syllables is treated as undefined
1412 without decomposition into Hangul Jamo.
1413 But definition of weight for Hangul Syllables
1414 in table or C<entry> is still valid.
1418 -- see 5.1 Preprocessing, UTS #10.
1420 If specified, the coderef is used to preprocess
1421 before the formation of sort keys.
1423 ex. dropping English articles, such as "a" or "the".
1424 Then, "the pen" is before "a pencil".
1428 $str =~ s/\b(?:an?|the)\s+//gi;
1432 C<preprocess> is performed before C<normalization> (if defined).
1436 -- see 3.1.3 Rearrangement, UTS #10.
1438 Characters that are not coded in logical order and to be rearranged.
1439 If C<UCA_Version> is equal to or lesser than 11, default is:
1441 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1443 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1444 (a reference to empty list) as the value for this key.
1446 If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
1448 B<According to the version 9 of UCA, this parameter shall not be used;
1449 but it is not warned at present.>
1453 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1455 You can use another collation element table if desired.
1457 The table file should locate in the F<Unicode/Collate> directory
1458 on C<@INC>. Say, if the filename is F<Foo.txt>,
1459 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1461 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1463 If C<undef> is passed explicitly as the value for this key,
1464 no file is read (but you can define collation elements via C<entry>).
1466 A typical way to define a collation element table
1467 without any file of table:
1469 $onlyABC = Unicode::Collate->new(
1471 entry => << 'ENTRIES',
1472 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1473 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1474 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1475 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1476 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1477 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1481 If C<ignoreName> or C<undefName> is used, character names should be
1482 specified as a comment (following C<#>) on each line.
1488 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1490 Undefines the collation element as if it were unassigned in the table.
1491 This reduces the size of the table.
1492 If an unassigned character appears in the string to be collated,
1493 the sort key is made from its codepoint
1494 as a single-character collation element,
1495 as it is greater than any other assigned collation elements
1496 (in the codepoint order among the unassigned characters).
1497 But, it'd be better to ignore characters
1498 unfamiliar to you and maybe never used.
1500 Through C<undefChar>, any character matching C<qr/$undefChar/>
1501 will be undefined. Through C<undefName>, any character whose name
1502 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1505 ex. Collation weights for beyond-BMP characters are not stored in object:
1507 undefChar => qr/[^\0-\x{fffd}]/,
1509 =item upper_before_lower
1511 -- see 6.6 Case Comparisons, UTS #10.
1513 By default, lowercase is before uppercase.
1514 If the parameter is made true, this is reversed.
1516 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1517 distinctions must occur in level 3, and their weights at level 3 must be
1518 same as those mentioned in 7.3.1, UTS #10.
1519 If you define your collation elements which differs from this requirement,
1520 this parameter doesn't work validly.
1524 -- see 3.2.2 Variable Weighting, UTS #10.
1526 This key allows to variable weighting for variable collation elements,
1527 which are marked with an ASTERISK in the table
1528 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1530 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1532 These names are case-insensitive.
1533 By default (if specification is omitted), 'shifted' is adopted.
1535 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1536 considered at the 4th level.
1538 'Non-Ignorable' Variable elements are not reset to ignorable.
1540 'Shifted' Variable elements are made ignorable at levels 1 through 3
1541 their level 4 weight is replaced by the old level 1 weight.
1542 Level 4 weight for Non-Variable elements is 0xFFFF.
1544 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1549 =head2 Methods for Collation
1553 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1555 Sorts a list of strings.
1557 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1559 Returns 1 (when C<$a> is greater than C<$b>)
1560 or 0 (when C<$a> is equal to C<$b>)
1561 or -1 (when C<$a> is lesser than C<$b>).
1563 =item C<$result = $Collator-E<gt>eq($a, $b)>
1565 =item C<$result = $Collator-E<gt>ne($a, $b)>
1567 =item C<$result = $Collator-E<gt>lt($a, $b)>
1569 =item C<$result = $Collator-E<gt>le($a, $b)>
1571 =item C<$result = $Collator-E<gt>gt($a, $b)>
1573 =item C<$result = $Collator-E<gt>ge($a, $b)>
1575 They works like the same name operators as theirs.
1577 eq : whether $a is equal to $b.
1578 ne : whether $a is not equal to $b.
1579 lt : whether $a is lesser than $b.
1580 le : whether $a is lesser than $b or equal to $b.
1581 gt : whether $a is greater than $b.
1582 ge : whether $a is greater than $b or equal to $b.
1584 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1586 -- see 4.3 Form Sort Key, UTS #10.
1590 You compare the sort keys using a binary comparison
1591 and get the result of the comparison of the strings using UCA.
1593 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1597 $Collator->cmp($a, $b)
1599 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1601 Converts a sorting key into its representation form.
1602 If C<UCA_Version> is 8, the output is slightly different.
1604 use Unicode::Collate;
1605 my $c = Unicode::Collate->new();
1606 print $c->viewSortKey("Perl"),"\n";
1609 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1610 # Level 1 Level 2 Level 3 Level 4
1614 =head2 Methods for Searching
1616 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1617 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1618 C<subst>, C<gsubst>) is croaked,
1619 as the position and the length might differ
1620 from those on the specified string.
1621 (And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1623 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1624 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1625 but they are not aware of any pattern, but only a literal substring.
1629 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1631 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1633 If C<$substring> matches a part of C<$string>, returns
1634 the position of the first occurrence of the matching part in scalar context;
1635 in list context, returns a two-element list of
1636 the position and the length of the matching part.
1638 If C<$substring> does not match any part of C<$string>,
1639 returns C<-1> in scalar context and
1640 an empty list in list context.
1644 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1645 # (normalization => undef) is REQUIRED.
1646 my $str = "Ich muß studieren Perl.";
1649 if (my($pos,$len) = $Collator->index($str, $sub)) {
1650 $match = substr($str, $pos, $len);
1653 and get C<"muß"> in C<$match> since C<"muß">
1654 is primary equal to C<"MÜSS">.
1656 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1658 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1660 If C<$substring> matches a part of C<$string>, in scalar context, returns
1661 B<a reference to> the first occurrence of the matching part
1662 (C<$match_ref> is always true if matches,
1663 since every reference is B<true>);
1664 in list context, returns the first occurrence of the matching part.
1666 If C<$substring> does not match any part of C<$string>,
1667 returns C<undef> in scalar context and
1668 an empty list in list context.
1672 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1673 print "matches [$$match_ref].\n";
1675 print "doesn't match.\n";
1680 if (($match) = $Collator->match($str, $sub)) { # list context
1681 print "matches [$match].\n";
1683 print "doesn't match.\n";
1686 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1688 If C<$substring> matches a part of C<$string>, returns
1689 all the matching parts (or matching count in scalar context).
1691 If C<$substring> does not match any part of C<$string>,
1692 returns an empty list.
1694 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1696 If C<$substring> matches a part of C<$string>,
1697 the first occurrence of the matching part is replaced by C<$replacement>
1698 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1700 C<$replacement> can be a C<CODEREF>,
1701 taking the matching part as an argument,
1702 and returning a string to replace the matching part
1703 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1705 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1707 If C<$substring> matches a part of C<$string>,
1708 all the occurrences of the matching part is replaced by C<$replacement>
1709 (C<$string> is modified) and return C<$count>.
1711 C<$replacement> can be a C<CODEREF>,
1712 taking the matching part as an argument,
1713 and returning a string to replace the matching part
1714 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1718 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1719 # (normalization => undef) is REQUIRED.
1720 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1721 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1723 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1724 # i.e., all the camels are made bold-faced.
1728 =head2 Other Methods
1732 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1734 Change the value of specified keys and returns the changed part.
1736 $Collator = Unicode::Collate->new(level => 4);
1738 $Collator->eq("perl", "PERL"); # false
1740 %old = $Collator->change(level => 2); # returns (level => 4).
1742 $Collator->eq("perl", "PERL"); # true
1744 $Collator->change(%old); # returns (level => 2).
1746 $Collator->eq("perl", "PERL"); # false
1748 Not all C<(key,value)>s are allowed to be changed.
1749 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1751 In the scalar context, returns the modified collator
1752 (but it is B<not> a clone from the original).
1754 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1756 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1758 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1760 =item C<$version = $Collator-E<gt>version()>
1762 Returns the version number (a string) of the Unicode Standard
1763 which the C<table> file used by the collator object is based on.
1764 If the table does not include a version line (starting with C<@version>),
1765 returns C<"unknown">.
1767 =item C<UCA_Version()>
1769 Returns the tracking version number of UTS #10 this module consults.
1771 =item C<Base_Unicode_Version()>
1773 Returns the version number of UTS #10 this module consults.
1779 No method will be exported.
1787 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1788 module (see L<Unicode::Normalize>).
1790 If you need not it (say, in the case when you need not
1791 handle any combining characters),
1792 assign C<normalization =E<gt> undef> explicitly.
1794 -- see 6.5 Avoiding Normalization, UTS #10.
1796 =item Conformance Test
1798 The Conformance Test for the UCA is available
1799 under L<http://www.unicode.org/Public/UCA/>.
1801 For F<CollationTest_SHIFTED.txt>,
1802 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1803 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1804 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1806 B<Unicode::Normalize is required to try The Conformance Test.>
1812 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
1814 Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved.
1816 This module is free software; you can redistribute it
1817 and/or modify it under the same terms as Perl itself.
1823 =item Unicode Collation Algorithm - UTS #10
1825 L<http://www.unicode.org/reports/tr10/>
1827 =item The Default Unicode Collation Element Table (DUCET)
1829 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1831 =item The conformance test for the UCA
1833 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1835 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1837 =item Hangul Syllable Type
1839 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1841 =item Unicode Normalization Forms - UAX #15
1843 L<http://www.unicode.org/reports/tr15/>