1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
17 our $VERSION = '0.51';
18 our $PACKAGE = __PACKAGE__;
20 my @Path = qw(Unicode Collate);
21 my $KeyFile = "allkeys.txt";
24 use constant TRUE => 1;
25 use constant FALSE => "";
26 use constant NOMATCHPOS => -1;
28 # A coderef to get combining class imported from Unicode::Normalize
29 # (i.e. \&Unicode::Normalize::getCombinClass).
30 # This is also used as a HAS_UNICODE_NORMALIZE flag.
34 use constant MinLevel => 1;
35 use constant MaxLevel => 4;
37 # Minimum weights at level 2 and 3, respectively
38 use constant Min2Wt => 0x20;
39 use constant Min3Wt => 0x02;
41 # Shifted weight at 4th level
42 use constant Shift4Wt => 0xFFFF;
44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45 # PROBLEM: The Default Unicode Collation Element Table
46 # has weights over 0xFFFF at the 4th level.
47 # The tie-breaking in the variable weights
48 # other than "shift" (as well as "shift-trimmed") is unreliable.
49 use constant VCE_TEMPLATE => 'Cn4';
51 # A sort key: 16-bit weights
52 # See also the PROBLEM on VCE_TEMPLATE above.
53 use constant KEY_TEMPLATE => 'n*';
55 # Level separator in a sort key:
56 # i.e. pack(KEY_TEMPLATE, 0)
57 use constant LEVEL_SEP => "\0\0";
59 # As Unicode code point separator for hash keys.
60 # A joined code point string (denoted by JCPS below)
61 # like "65;768" is used for internal processing
62 # instead of Perl's Unicode string like "\x41\x{300}",
63 # as the native code point is different from the Unicode code point
65 # This character must not be included in any stringified
66 # representation of an integer.
67 use constant CODE_SEP => ';';
69 # boolean values of variable weights
70 use constant NON_VAR => 0; # Non-Variable character
71 use constant VAR => 1; # Variable character
73 # specific code points
74 use constant Hangul_LBase => 0x1100;
75 use constant Hangul_LIni => 0x1100;
76 use constant Hangul_LFin => 0x1159;
77 use constant Hangul_LFill => 0x115F;
78 use constant Hangul_VBase => 0x1161;
79 use constant Hangul_VIni => 0x1160; # from Vowel Filler
80 use constant Hangul_VFin => 0x11A2;
81 use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
82 use constant Hangul_TIni => 0x11A8;
83 use constant Hangul_TFin => 0x11F9;
84 use constant Hangul_TCount => 28;
85 use constant Hangul_NCount => 588;
86 use constant Hangul_SBase => 0xAC00;
87 use constant Hangul_SIni => 0xAC00;
88 use constant Hangul_SFin => 0xD7A3;
89 use constant CJK_UidIni => 0x4E00;
90 use constant CJK_UidFin => 0x9FA5;
91 use constant CJK_UidF41 => 0x9FBB;
92 use constant CJK_ExtAIni => 0x3400;
93 use constant CJK_ExtAFin => 0x4DB5;
94 use constant CJK_ExtBIni => 0x20000;
95 use constant CJK_ExtBFin => 0x2A6D6;
96 use constant BMP_Max => 0xFFFF;
98 # Logical_Order_Exception in PropList.txt
99 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
101 sub UCA_Version { "14" }
103 sub Base_Unicode_Version { "4.1.0" }
108 return pack('U*', @_);
112 return unpack('U*', shift(@_).pack('U*'));
119 blanked non-ignorable shifted shift-trimmed
120 / } = (); # keys lowercased
123 alternate backwards level normalization rearrange
124 katakana_before_hiragana upper_before_lower
125 overrideHangul overrideCJK preprocess UCA_Version
126 hangul_terminator variable
130 entry mapping table maxlength
131 ignoreChar ignoreName undefChar undefName variableTable
132 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
133 derivCode normCode rearrangeHash
136 # The hash key 'ignored' is deleted at v 0.21.
137 # The hash key 'isShift' is deleted at v 0.23.
138 # The hash key 'combining' is deleted at v 0.24.
139 # The hash key 'entries' is deleted at v 0.30.
140 # The hash key 'L3_ignorable' is deleted at v 0.40.
144 return $self->{versionTable} || 'unknown';
147 my (%ChangeOK, %ChangeNG);
148 @ChangeOK{ @ChangeOK } = ();
149 @ChangeNG{ @ChangeNG } = ();
155 if (exists $hash{variable} && exists $hash{alternate}) {
156 delete $hash{alternate};
158 elsif (!exists $hash{variable} && exists $hash{alternate}) {
159 $hash{variable} = $hash{alternate};
161 foreach my $k (keys %hash) {
162 if (exists $ChangeOK{$k}) {
163 $old{$k} = $self->{$k};
164 $self->{$k} = $hash{$k};
166 elsif (exists $ChangeNG{$k}) {
167 croak "change of $k via change() is not allowed!";
171 $self->checkCollator();
172 return wantarray ? %old : $self;
177 my $key = shift; # 'level' or 'backwards'
178 MinLevel <= $level or croak sprintf
179 "Illegal level %d (in value for key '%s') lower than %d.",
180 $level, $key, MinLevel;
181 $level <= MaxLevel or croak sprintf
182 "Unsupported level %d (in value for key '%s') higher than %d.",
183 $level, $key, MaxLevel;
189 11 => \&_derivCE_9, # 11 == 9
195 _checkLevel($self->{level}, "level");
197 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
198 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
200 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
201 $self->{alternateTable} || 'shifted';
202 $self->{variable} = $self->{alternate} = lc($self->{variable});
203 exists $VariableOK{ $self->{variable} }
204 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
206 if (! defined $self->{backwards}) {
207 $self->{backwardsFlag} = 0;
209 elsif (! ref $self->{backwards}) {
210 _checkLevel($self->{backwards}, "backwards");
211 $self->{backwardsFlag} = 1 << $self->{backwards};
215 $self->{backwardsFlag} = 0;
216 for my $b (@{ $self->{backwards} }) {
217 _checkLevel($b, "backwards");
220 for my $v (sort keys %level) {
221 $self->{backwardsFlag} += 1 << $v;
225 defined $self->{rearrange} or $self->{rearrange} = [];
226 ref $self->{rearrange}
227 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
229 # keys of $self->{rearrangeHash} are $self->{rearrange}.
230 $self->{rearrangeHash} = undef;
232 if (@{ $self->{rearrange} }) {
233 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
236 $self->{normCode} = undef;
238 if (defined $self->{normalization}) {
239 eval { require Unicode::Normalize };
240 $@ and croak "Unicode::Normalize is required to normalize strings";
242 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
244 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
245 $self->{normCode} = \&Unicode::Normalize::NFD;
247 elsif ($self->{normalization} ne 'prenormalized') {
248 my $norm = $self->{normalization};
249 $self->{normCode} = sub {
250 Unicode::Normalize::normalize($norm, shift);
252 eval { $self->{normCode}->("") }; # try
253 $@ and croak "$PACKAGE unknown normalization form name: $norm";
262 my $self = bless { @_ }, $class;
264 # If undef is passed explicitly, no file is read.
265 $self->{table} = $KeyFile if ! exists $self->{table};
266 $self->read_table() if defined $self->{table};
268 if ($self->{entry}) {
269 while ($self->{entry} =~ /([^\n]+)/g) {
270 $self->parseEntry($1);
274 $self->{level} ||= MaxLevel;
275 $self->{UCA_Version} ||= UCA_Version();
277 $self->{overrideHangul} = FALSE
278 if ! exists $self->{overrideHangul};
279 $self->{overrideCJK} = FALSE
280 if ! exists $self->{overrideCJK};
281 $self->{normalization} = 'NFD'
282 if ! exists $self->{normalization};
283 $self->{rearrange} = $self->{rearrangeTable} ||
284 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
285 if ! exists $self->{rearrange};
286 $self->{backwards} = $self->{backwardsTable}
287 if ! exists $self->{backwards};
289 $self->checkCollator();
298 foreach my $d (@INC) {
299 $f = File::Spec->catfile($d, @Path, $self->{table});
300 last if open($fh, $f);
304 $f = File::Spec->catfile(@Path, $self->{table});
305 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
310 unless (s/^\s*\@//) {
311 $self->parseEntry($_);
315 if (/^version\s*(\S*)/) {
316 $self->{versionTable} ||= $1;
318 elsif (/^variable\s+(\S*)/) { # since UTS #10-9
319 $self->{variableTable} ||= $1;
321 elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
322 $self->{alternateTable} ||= $1;
324 elsif (/^backwards\s+(\S*)/) {
325 push @{ $self->{backwardsTable} }, $1;
327 elsif (/^forwards\s+(\S*)/) { # parhaps no use
328 push @{ $self->{forwardsTable} }, $1;
330 elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
331 push @{ $self->{rearrangeTable} }, _getHexArray($1);
339 ## get $line, parse it, and write an entry in $self
345 my($name, $entry, @uv, @key);
347 return if $line !~ /^\s*[0-9A-Fa-f]/;
349 # removes comment and gets name
351 if $line =~ s/[#%]\s*(.*)//;
352 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
355 my($e, $k) = split /;/, $line;
356 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
359 @uv = _getHexArray($e);
362 $entry = join(CODE_SEP, @uv); # in JCPS
364 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
365 my $ele = pack_U(@uv);
367 # regarded as if it were not entried in the table
369 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
371 # replaced as completely ignorable
372 $k = '[.0000.0000.0000.0000]'
373 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
376 # replaced as completely ignorable
377 $k = '[.0000.0000.0000.0000]'
378 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
380 my $is_L3_ignorable = TRUE;
382 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
383 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
384 my @wt = _getHexArray($arr);
385 push @key, pack(VCE_TEMPLATE, $var, @wt);
386 $is_L3_ignorable = FALSE
387 if $wt[0] || $wt[1] || $wt[2];
388 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
389 # is completely ignorable.
390 # For expansion, an entry $is_L3_ignorable
391 # if and only if "all" CEs are [.0000.0000.0000].
394 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
397 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
398 and $self->{maxlength}{$uv[0]} = @uv;
404 ## VCE = _varCE(variable term, VCE)
410 if ($vbl eq 'non-ignorable') {
413 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
416 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
417 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
419 elsif ($vbl eq 'blanked') {
423 return pack(VCE_TEMPLATE, $var, @wt[0..2],
424 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
431 $self->visualizeSortKey($self->getSortKey(@_));
437 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
439 if ($self->{UCA_Version} <= 8) {
440 $view =~ s/ ?0000 ?/|/g;
442 $view =~ s/\b0000\b/|/g;
449 ## arrayref of JCPS = splitEnt(string to be collated)
450 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
457 my $code = $self->{preprocess};
458 my $norm = $self->{normCode};
459 my $map = $self->{mapping};
460 my $max = $self->{maxlength};
461 my $reH = $self->{rearrangeHash};
462 my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
467 $code and croak "Preprocess breaks character positions. "
468 . "Don't use with index(), match(), etc.";
469 $norm and croak "Normalization breaks character positions. "
470 . "Don't use with index(), match(), etc.";
475 $str = &$code($str) if ref $code;
476 $str = &$norm($str) if ref $norm;
479 # get array of Unicode code point of string.
480 my @src = unpack_U($str);
483 # Character positions are not kept if rearranged,
484 # then neglected if $wLen is true.
485 if ($reH && ! $wLen) {
486 for (my $i = 0; $i < @src; $i++) {
487 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
488 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
494 # remove a code point marked as a completely ignorable.
495 for (my $i = 0; $i < @src; $i++) {
497 if _isIllegal($src[$i]) || ($ver9 &&
498 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
501 for (my $i = 0; $i < @src; $i++) {
504 # skip removed code point
505 if (! defined $jcps) {
507 $buf[-1][2] = $i + 1;
516 my $temp_jcps = $jcps;
518 my $maxLen = $max->{$jcps};
520 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
521 next if ! defined $src[$p];
522 $temp_jcps .= CODE_SEP . $src[$p];
524 if ($map->{$temp_jcps}) {
530 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
531 # This process requires Unicode::Normalize.
532 # If "normalization" is undef, here should be skipped *always*
533 # (in spite of bool value of $CVgetCombinClass),
534 # since canonical ordering cannot be expected.
535 # Blocked combining character should not be contracted.
537 if ($self->{normalization})
538 # $self->{normCode} is false in the case of "prenormalized".
543 for (my $p = $i + 1; $p < @src; $p++) {
544 next if ! defined $src[$p];
545 $curCC = $CVgetCombinClass->($src[$p]);
547 my $tail = CODE_SEP . $src[$p];
548 if ($preCC != $curCC && $map->{$jcps.$tail}) {
558 # skip completely ignorable
559 if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
561 $buf[-1][2] = $i + 1;
566 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
573 ## list of VCE = getWt(JCPS)
579 my $vbl = $self->{variable};
580 my $map = $self->{mapping};
581 my $der = $self->{derivCode};
583 return if !defined $u;
584 return map(_varCE($vbl, $_), @{ $map->{$u} })
587 # JCPS must not be a contraction, then it's a code point.
588 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
589 my $hang = $self->{overrideHangul};
592 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
594 elsif (!defined $hang) {
595 @hangulCE = $der->($u);
598 my $max = $self->{maxlength};
599 my @decH = _decompHangul($u);
602 my $contract = join(CODE_SEP, @decH);
603 @decH = ($contract) if $map->{$contract};
604 } else { # must be <@decH == 3>
605 if ($max->{$decH[0]}) {
606 my $contract = join(CODE_SEP, @decH);
607 if ($map->{$contract}) {
610 $contract = join(CODE_SEP, @decH[0,1]);
611 $map->{$contract} and @decH = ($contract, $decH[2]);
613 # even if V's ignorable, LT contraction is not supported.
614 # If such a situatution were required, NFD should be used.
616 if (@decH == 3 && $max->{$decH[1]}) {
617 my $contract = join(CODE_SEP, @decH[1,2]);
618 $map->{$contract} and @decH = ($decH[0], $contract);
623 $map->{$_} ? @{ $map->{$_} } : $der->($_);
626 return map _varCE($vbl, $_), @hangulCE;
628 elsif (_isUIdeo($u, $self->{UCA_Version})) {
629 my $cjk = $self->{overrideCJK};
630 return map _varCE($vbl, $_),
632 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
633 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
638 return map _varCE($vbl, $_), $der->($u);
644 ## string sortkey = getSortKey(string arg)
649 my $lev = $self->{level};
650 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
651 my $v2i = $self->{UCA_Version} >= 9 &&
652 $self->{variable} ne 'non-ignorable';
654 my @buf; # weight arrays
655 if ($self->{hangul_terminator}) {
657 foreach my $jcps (@$rEnt) {
658 # weird things like VL, TL-contraction are not considered!
660 foreach my $u (split /;/, $jcps) {
661 $curHST .= getHST($u);
663 if ($preHST && !$curHST || # hangul before non-hangul
664 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
665 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
666 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
668 push @buf, $self->getWtHangulTerm();
672 push @buf, $self->getWt($jcps);
674 $preHST # end at hangul
675 and push @buf, $self->getWtHangulTerm();
678 foreach my $jcps (@$rEnt) {
679 push @buf, $self->getWt($jcps);
684 my @ret = ([],[],[],[]);
685 my $last_is_variable;
687 foreach my $vwt (@buf) {
688 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
690 # "Ignorable (L1, L2) after Variable" since track. v. 9
693 $last_is_variable = TRUE;
695 elsif (!$wt[0]) { # ignorable
696 next if $last_is_variable;
699 $last_is_variable = FALSE;
702 foreach my $v (0..$lev-1) {
703 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
707 # modification of tertiary weights
708 if ($self->{upper_before_lower}) {
709 foreach (@{ $ret[2] }) {
710 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
711 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
712 elsif ($_ == 0x1C) { $_ += 1 } # square upper
713 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
716 if ($self->{katakana_before_hiragana}) {
717 foreach (@{ $ret[2] }) {
718 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
719 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
723 if ($self->{backwardsFlag}) {
724 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
725 if ($self->{backwardsFlag} & (1 << $v)) {
726 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
731 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
736 ## int compare = cmp(string a, string b)
738 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
739 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
740 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
741 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
742 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
743 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
744 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
747 ## list[strings] sorted = sort(list[strings] arg)
753 sort{ $a->[0] cmp $b->[0] }
754 map [ $obj->getSortKey($_), $_ ], @_;
761 (CJK_UidIni <= $u && $u <= CJK_UidF41)
763 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
764 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
768 my $aaaa = $base + ($u >> 15);
769 my $bbbb = ($u & 0x7FFF) | 0x8000;
771 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
772 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
778 (CJK_UidIni <= $u && $u <= CJK_UidFin)
780 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
781 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
785 my $aaaa = $base + ($u >> 15);
786 my $bbbb = ($u & 0x7FFF) | 0x8000;
788 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
789 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
794 my $aaaa = 0xFF80 + ($code >> 15);
795 my $bbbb = ($code & 0x7FFF) | 0x8000;
797 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
798 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
803 return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
807 my ($u, $uca_vers) = @_;
810 ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
812 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
814 (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
819 sub getWtHangulTerm {
821 return _varCE($self->{variable},
822 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
827 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
829 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
832 # $code *must* be in Hangul syllable.
833 # Check it before you enter here.
837 my $si = $code - Hangul_SBase;
838 my $li = int( $si / Hangul_NCount);
839 my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
840 my $ti = $si % Hangul_TCount;
844 $ti ? (Hangul_TBase + $ti) : (),
850 return ! defined $code # removed
851 || ($code < 0 || 0x10FFFF < $code) # out of range
852 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
853 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
854 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
858 # Hangul Syllable Type
862 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
863 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
864 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
865 Hangul_SIni <= $u && $u <= Hangul_SFin ?
866 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
871 ## bool _nonIgnorAtLevel(arrayref weights, int level)
873 sub _nonIgnorAtLevel($$)
876 return if ! defined $wt;
878 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
883 ## arrayref of arrayref[weights] source,
884 ## arrayref of arrayref[weights] substr,
886 ## * comparison of graphemes vs graphemes.
887 ## @$source >= @$substr must be true (check it before call this);
895 for my $g (0..@$substr-1){
896 # Do the $g'th graphemes have the same number of AV weigths?
897 return if @{ $source->[$g] } != @{ $substr->[$g] };
899 for my $w (0..@{ $substr->[$g] }-1) {
900 for my $v (0..$lev-1) {
901 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
909 ## (int position, int length)
910 ## int position = index(string, substring, position, [undoc'ed grobal])
912 ## With "grobal" (only for the list context),
913 ## returns list of arrayref[position, length].
919 my $len = length($str);
920 my $subE = $self->splitEnt(shift);
921 my $pos = @_ ? shift : 0;
922 $pos = 0 if $pos < 0;
925 my $lev = $self->{level};
926 my $v2i = $self->{UCA_Version} >= 9 &&
927 $self->{variable} ne 'non-ignorable';
930 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
932 ? map([$_, 0], $temp..$len)
933 : wantarray ? ($temp,0) : $temp;
936 and return wantarray ? () : NOMATCHPOS;
937 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
939 or return wantarray ? () : NOMATCHPOS;
941 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
943 my $last_is_variable;
944 for my $vwt (map $self->getWt($_), @$subE) {
945 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
946 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
948 # "Ignorable (L1, L2) after Variable" since track. v. 9
951 $last_is_variable = TRUE;
953 elsif (!$wt[0]) { # ignorable
954 $to_be_pushed = FALSE if $last_is_variable;
957 $last_is_variable = FALSE;
961 if (@subWt && !$var && !$wt[0]) {
962 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
964 push @subWt, [ \@wt ];
969 my $end = @$strE - 1;
971 $last_is_variable = FALSE; # reuse
972 for (my $i = 0; $i <= $end; ) { # no $i++
976 while ($i <= $end && $found_base == 0) {
977 for my $vwt ($self->getWt($strE->[$i][0])) {
978 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
979 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
981 # "Ignorable (L1, L2) after Variable" since track. v. 9
984 $last_is_variable = TRUE;
986 elsif (!$wt[0]) { # ignorable
987 $to_be_pushed = FALSE if $last_is_variable;
990 $last_is_variable = FALSE;
994 if (@strWt && !$var && !$wt[0]) {
995 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
996 $finPos[-1] = $strE->[$i][2];
997 } elsif ($to_be_pushed) {
998 push @strWt, [ \@wt ];
999 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
1000 $finPos[-1] = NOMATCHPOS if $found_base;
1001 push @finPos, $strE->[$i][2];
1010 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1011 if ($iniPos[0] != NOMATCHPOS &&
1012 $finPos[$#subWt] != NOMATCHPOS &&
1013 _eqArray(\@strWt, \@subWt, $lev)) {
1014 my $temp = $iniPos[0] + $pos;
1017 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1018 splice @strWt, 0, $#subWt;
1019 splice @iniPos, 0, $#subWt;
1020 splice @finPos, 0, $#subWt;
1024 ? ($temp, $finPos[$#subWt] - $iniPos[0])
1036 : wantarray ? () : NOMATCHPOS;
1040 ## scalarref to matching part = match(string, substring)
1045 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1046 my $temp = substr($_[0], $pos, $len);
1047 return wantarray ? $temp : \$temp;
1048 # An lvalue ref \substr should be avoided,
1049 # since its value is affected by modification of its referent.
1057 ## arrayref matching parts = gmatch(string, substring)
1064 return map substr($str, $_->[0], $_->[1]),
1065 $self->index($str, $sub, 0, 'g');
1069 ## bool subst'ed = subst(string, substring, replace)
1074 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1076 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1078 my $mat = substr($_[0], $pos, $len);
1079 substr($_[0], $pos, $len, $code->($mat));
1081 substr($_[0], $pos, $len, $_[2]);
1091 ## int count = gsubst(string, substring, replace)
1096 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1099 # Replacement is carried out from the end, then use reverse.
1100 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1102 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1103 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1105 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1117 Unicode::Collate - Unicode Collation Algorithm
1121 use Unicode::Collate;
1124 $Collator = Unicode::Collate->new(%tailoring);
1127 @sorted = $Collator->sort(@not_sorted);
1130 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1132 # If %tailoring is false (i.e. empty),
1133 # $Collator should do the default collation.
1137 This module is an implementation of Unicode Technical Standard #10
1138 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1140 =head2 Constructor and Tailoring
1142 The C<new> method returns a collator object.
1144 $Collator = Unicode::Collate->new(
1145 UCA_Version => $UCA_Version,
1146 alternate => $alternate, # deprecated: use of 'variable' is recommended.
1147 backwards => $levelNumber, # or \@levelNumbers
1149 hangul_terminator => $term_primary_weight,
1150 ignoreName => qr/$ignoreName/,
1151 ignoreChar => qr/$ignoreChar/,
1152 katakana_before_hiragana => $bool,
1153 level => $collationLevel,
1154 normalization => $normalization_form,
1155 overrideCJK => \&overrideCJK,
1156 overrideHangul => \&overrideHangul,
1157 preprocess => \&preprocess,
1158 rearrange => \@charList,
1160 undefName => qr/$undefName/,
1161 undefChar => qr/$undefChar/,
1162 upper_before_lower => $bool,
1163 variable => $variable,
1170 If the tracking version number of UCA is given,
1171 behavior of that tracking version is emulated on collating.
1172 If omitted, the return value of C<UCA_Version()> is used.
1173 C<UCA_Version()> should return the latest tracking version supported.
1175 The supported tracking version: 8, 9, 11, or 14.
1177 UCA Unicode Standard DUCET (@version)
1178 ---------------------------------------------------
1179 8 3.1 3.0.1 (3.0.1d9)
1180 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1181 11 4.0 4.0.0 (4.0.0)
1182 14 4.1.0 4.1.0 (4.1.0)
1184 Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1188 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1190 For backward compatibility, C<alternate> (old name) can be used
1191 as an alias for C<variable>.
1195 -- see 3.1.2 French Accents, UTS #10.
1197 backwards => $levelNumber or \@levelNumbers
1199 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1200 If omitted, forwards at all the levels.
1204 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1206 If the same character (or a sequence of characters) exists
1207 in the collation element table through C<table>,
1208 mapping to collation elements is overrided.
1209 If it does not exist, the mapping is defined additionally.
1211 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1212 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1213 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1214 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1215 006C 006C ; [.0F4C.0020.0002.006C] # ll
1216 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1217 004C 004C ; [.0F4C.0020.0008.004C] # LL
1218 00F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1219 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1220 00D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1221 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1224 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1225 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1226 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1229 B<NOTE:> The code point in the UCA file format (before C<';'>)
1230 B<must> be a Unicode code point (defined as hexadecimal),
1231 but not a native code point.
1232 So C<0063> must always denote C<U+0063>,
1233 but not a character of C<"\x63">.
1235 Weighting may vary depending on collation element table.
1236 So ensure the weights defined in C<entry> will be consistent with
1237 those in the collation element table loaded via C<table>.
1239 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1240 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1241 (as a value between C<0E60> and C<0E6D>)
1242 makes ordering as C<C E<lt> CH E<lt> D>.
1243 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1244 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1245 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1246 and C<c-curl> (C<U+0255>) with C<0E69>.
1247 Then primary weight C<0E6A> for C<CH> makes C<CH>
1248 ordered between C<c-curl> and C<D>.
1250 =item hangul_terminator
1252 -- see 7.1.4 Trailing Weights, UTS #10.
1254 If a true value is given (non-zero but should be positive),
1255 it will be added as a terminator primary weight to the end of
1256 every standard Hangul syllable. Secondary and any higher weights
1257 for terminator are set to zero.
1258 If the value is false or C<hangul_terminator> key does not exist,
1259 insertion of terminator weights will not be performed.
1261 Boundaries of Hangul syllables are determined
1262 according to conjoining Jamo behavior in F<the Unicode Standard>
1263 and F<HangulSyllableType.txt>.
1265 B<Implementation Note:>
1266 (1) For expansion mapping (Unicode character mapped
1267 to a sequence of collation elements), a terminator will not be added
1268 between collation elements, even if Hangul syllable boundary exists there.
1269 Addition of terminator is restricted to the next position
1270 to the last collation element.
1272 (2) Non-conjoining Hangul letters
1273 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1274 automatically terminated with a terminator primary weight.
1275 These characters may need terminator included in a collation element
1282 -- see 3.2.2 Variable Weighting, UTS #10.
1284 Makes the entry in the table completely ignorable;
1285 i.e. as if the weights were zero at all level.
1287 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1288 will be ignored. Through C<ignoreName>, any character whose name
1289 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1292 E.g. when 'a' and 'e' are ignorable,
1293 'element' is equal to 'lament' (or 'lmnt').
1295 =item katakana_before_hiragana
1297 -- see 7.3.1 Tertiary Weight Table, UTS #10.
1299 By default, hiragana is before katakana.
1300 If the parameter is made true, this is reversed.
1302 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1303 distinctions must occur in level 3, and their weights at level 3 must be
1304 same as those mentioned in 7.3.1, UTS #10.
1305 If you define your collation elements which violate this requirement,
1306 this parameter does not work validly.
1310 -- see 4.3 Form Sort Key, UTS #10.
1312 Set the maximum level.
1313 Any higher levels than the specified one are ignored.
1315 Level 1: alphabetic ordering
1316 Level 2: diacritic ordering
1317 Level 3: case ordering
1318 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1322 If omitted, the maximum is the 4th.
1326 -- see 4.1 Normalize, UTS #10.
1328 If specified, strings are normalized before preparation of sort keys
1329 (the normalization is executed after preprocess).
1331 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1332 as C<$normalization_form>.
1333 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1334 See C<Unicode::Normalize::normalize()> for detail.
1335 If omitted, C<'NFD'> is used.
1337 C<normalization> is performed after C<preprocess> (if defined).
1339 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1340 though they are not concerned with C<Unicode::Normalize::normalize()>.
1342 If C<undef> (not a string C<"undef">) is passed explicitly
1343 as the value for this key,
1344 any normalization is not carried out (this may make tailoring easier
1345 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1346 only contiguous contractions are resolved;
1347 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1348 C<A-cedilla-ring> would be primary equal to C<A>.
1350 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1351 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1353 In the case of C<(normalization =E<gt> "prenormalized")>,
1354 any normalization is not performed, but
1355 non-contiguous contractions with combining characters are performed.
1357 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1358 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1359 If source strings are finely prenormalized,
1360 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1362 Except C<(normalization =E<gt> undef)>,
1363 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1367 -- see 7.1 Derived Collation Elements, UTS #10.
1369 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1370 but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1371 C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1372 are lesser than C<CJK Unified Ideographs Extension> (its range is
1373 C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
1375 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1377 ex. CJK Unified Ideographs in the JIS code point order.
1379 overrideCJK => sub {
1380 my $u = shift; # get a Unicode codepoint
1381 my $b = pack('n', $u); # to UTF-16BE
1382 my $s = your_unicode_to_sjis_converter($b); # convert
1383 my $n = unpack('n', $s); # convert sjis to short
1384 [ $n, 0x20, 0x2, $u ]; # return the collation element
1387 ex. ignores all CJK Unified Ideographs.
1389 overrideCJK => sub {()}, # CODEREF returning empty list
1391 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1392 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1394 If C<undef> is passed explicitly as the value for this key,
1395 weights for CJK Unified Ideographs are treated as undefined.
1396 But assignment of weight for CJK Unified Ideographs
1397 in table or C<entry> is still valid.
1399 =item overrideHangul
1401 -- see 7.1 Derived Collation Elements, UTS #10.
1403 By default, Hangul Syllables are decomposed into Hangul Jamo,
1404 even if C<(normalization =E<gt> undef)>.
1405 But the mapping of Hangul Syllables may be overrided.
1407 This parameter works like C<overrideCJK>, so see there for examples.
1409 If you want to override the mapping of Hangul Syllables,
1410 NFD, NFKD, and FCD are not appropriate,
1411 since they will decompose Hangul Syllables before overriding.
1413 If C<undef> is passed explicitly as the value for this key,
1414 weight for Hangul Syllables is treated as undefined
1415 without decomposition into Hangul Jamo.
1416 But definition of weight for Hangul Syllables
1417 in table or C<entry> is still valid.
1421 -- see 5.1 Preprocessing, UTS #10.
1423 If specified, the coderef is used to preprocess
1424 before the formation of sort keys.
1426 ex. dropping English articles, such as "a" or "the".
1427 Then, "the pen" is before "a pencil".
1431 $str =~ s/\b(?:an?|the)\s+//gi;
1435 C<preprocess> is performed before C<normalization> (if defined).
1439 -- see 3.1.3 Rearrangement, UTS #10.
1441 Characters that are not coded in logical order and to be rearranged.
1442 If C<UCA_Version> is equal to or lesser than 11, default is:
1444 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1446 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1447 (a reference to empty list) as the value for this key.
1449 If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
1451 B<According to the version 9 of UCA, this parameter shall not be used;
1452 but it is not warned at present.>
1456 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1458 You can use another collation element table if desired.
1460 The table file should locate in the F<Unicode/Collate> directory
1461 on C<@INC>. Say, if the filename is F<Foo.txt>,
1462 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1464 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1465 If you will prepare your own table file, any name other than F<allkeys.txt>
1466 may be better to avoid namespace conflict.
1468 If C<undef> is passed explicitly as the value for this key,
1469 no file is read (but you can define collation elements via C<entry>).
1471 A typical way to define a collation element table
1472 without any file of table:
1474 $onlyABC = Unicode::Collate->new(
1476 entry => << 'ENTRIES',
1477 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1478 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1479 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1480 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1481 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1482 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1486 If C<ignoreName> or C<undefName> is used, character names should be
1487 specified as a comment (following C<#>) on each line.
1493 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1495 Undefines the collation element as if it were unassigned in the table.
1496 This reduces the size of the table.
1497 If an unassigned character appears in the string to be collated,
1498 the sort key is made from its codepoint
1499 as a single-character collation element,
1500 as it is greater than any other assigned collation elements
1501 (in the codepoint order among the unassigned characters).
1502 But, it'd be better to ignore characters
1503 unfamiliar to you and maybe never used.
1505 Through C<undefChar>, any character matching C<qr/$undefChar/>
1506 will be undefined. Through C<undefName>, any character whose name
1507 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1510 ex. Collation weights for beyond-BMP characters are not stored in object:
1512 undefChar => qr/[^\0-\x{fffd}]/,
1514 =item upper_before_lower
1516 -- see 6.6 Case Comparisons, UTS #10.
1518 By default, lowercase is before uppercase.
1519 If the parameter is made true, this is reversed.
1521 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1522 distinctions must occur in level 3, and their weights at level 3 must be
1523 same as those mentioned in 7.3.1, UTS #10.
1524 If you define your collation elements which differs from this requirement,
1525 this parameter doesn't work validly.
1529 -- see 3.2.2 Variable Weighting, UTS #10.
1531 This key allows to variable weighting for variable collation elements,
1532 which are marked with an ASTERISK in the table
1533 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1535 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1537 These names are case-insensitive.
1538 By default (if specification is omitted), 'shifted' is adopted.
1540 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1541 considered at the 4th level.
1543 'Non-Ignorable' Variable elements are not reset to ignorable.
1545 'Shifted' Variable elements are made ignorable at levels 1 through 3
1546 their level 4 weight is replaced by the old level 1 weight.
1547 Level 4 weight for Non-Variable elements is 0xFFFF.
1549 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1554 =head2 Methods for Collation
1558 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1560 Sorts a list of strings.
1562 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1564 Returns 1 (when C<$a> is greater than C<$b>)
1565 or 0 (when C<$a> is equal to C<$b>)
1566 or -1 (when C<$a> is lesser than C<$b>).
1568 =item C<$result = $Collator-E<gt>eq($a, $b)>
1570 =item C<$result = $Collator-E<gt>ne($a, $b)>
1572 =item C<$result = $Collator-E<gt>lt($a, $b)>
1574 =item C<$result = $Collator-E<gt>le($a, $b)>
1576 =item C<$result = $Collator-E<gt>gt($a, $b)>
1578 =item C<$result = $Collator-E<gt>ge($a, $b)>
1580 They works like the same name operators as theirs.
1582 eq : whether $a is equal to $b.
1583 ne : whether $a is not equal to $b.
1584 lt : whether $a is lesser than $b.
1585 le : whether $a is lesser than $b or equal to $b.
1586 gt : whether $a is greater than $b.
1587 ge : whether $a is greater than $b or equal to $b.
1589 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1591 -- see 4.3 Form Sort Key, UTS #10.
1595 You compare the sort keys using a binary comparison
1596 and get the result of the comparison of the strings using UCA.
1598 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1602 $Collator->cmp($a, $b)
1604 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1606 Converts a sorting key into its representation form.
1607 If C<UCA_Version> is 8, the output is slightly different.
1609 use Unicode::Collate;
1610 my $c = Unicode::Collate->new();
1611 print $c->viewSortKey("Perl"),"\n";
1614 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1615 # Level 1 Level 2 Level 3 Level 4
1619 =head2 Methods for Searching
1621 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1622 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1623 C<subst>, C<gsubst>) is croaked,
1624 as the position and the length might differ
1625 from those on the specified string.
1626 (And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1628 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1629 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1630 but they are not aware of any pattern, but only a literal substring.
1634 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1636 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1638 If C<$substring> matches a part of C<$string>, returns
1639 the position of the first occurrence of the matching part in scalar context;
1640 in list context, returns a two-element list of
1641 the position and the length of the matching part.
1643 If C<$substring> does not match any part of C<$string>,
1644 returns C<-1> in scalar context and
1645 an empty list in list context.
1649 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1650 # (normalization => undef) is REQUIRED.
1651 my $str = "Ich muß studieren Perl.";
1654 if (my($pos,$len) = $Collator->index($str, $sub)) {
1655 $match = substr($str, $pos, $len);
1658 and get C<"muß"> in C<$match> since C<"muß">
1659 is primary equal to C<"MÜSS">.
1661 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1663 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1665 If C<$substring> matches a part of C<$string>, in scalar context, returns
1666 B<a reference to> the first occurrence of the matching part
1667 (C<$match_ref> is always true if matches,
1668 since every reference is B<true>);
1669 in list context, returns the first occurrence of the matching part.
1671 If C<$substring> does not match any part of C<$string>,
1672 returns C<undef> in scalar context and
1673 an empty list in list context.
1677 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1678 print "matches [$$match_ref].\n";
1680 print "doesn't match.\n";
1685 if (($match) = $Collator->match($str, $sub)) { # list context
1686 print "matches [$match].\n";
1688 print "doesn't match.\n";
1691 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1693 If C<$substring> matches a part of C<$string>, returns
1694 all the matching parts (or matching count in scalar context).
1696 If C<$substring> does not match any part of C<$string>,
1697 returns an empty list.
1699 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1701 If C<$substring> matches a part of C<$string>,
1702 the first occurrence of the matching part is replaced by C<$replacement>
1703 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1705 C<$replacement> can be a C<CODEREF>,
1706 taking the matching part as an argument,
1707 and returning a string to replace the matching part
1708 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1710 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1712 If C<$substring> matches a part of C<$string>,
1713 all the occurrences of the matching part is replaced by C<$replacement>
1714 (C<$string> is modified) and return C<$count>.
1716 C<$replacement> can be a C<CODEREF>,
1717 taking the matching part as an argument,
1718 and returning a string to replace the matching part
1719 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1723 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1724 # (normalization => undef) is REQUIRED.
1725 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1726 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1728 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1729 # i.e., all the camels are made bold-faced.
1733 =head2 Other Methods
1737 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1739 Change the value of specified keys and returns the changed part.
1741 $Collator = Unicode::Collate->new(level => 4);
1743 $Collator->eq("perl", "PERL"); # false
1745 %old = $Collator->change(level => 2); # returns (level => 4).
1747 $Collator->eq("perl", "PERL"); # true
1749 $Collator->change(%old); # returns (level => 2).
1751 $Collator->eq("perl", "PERL"); # false
1753 Not all C<(key,value)>s are allowed to be changed.
1754 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1756 In the scalar context, returns the modified collator
1757 (but it is B<not> a clone from the original).
1759 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1761 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1763 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1765 =item C<$version = $Collator-E<gt>version()>
1767 Returns the version number (a string) of the Unicode Standard
1768 which the C<table> file used by the collator object is based on.
1769 If the table does not include a version line (starting with C<@version>),
1770 returns C<"unknown">.
1772 =item C<UCA_Version()>
1774 Returns the tracking version number of UTS #10 this module consults.
1776 =item C<Base_Unicode_Version()>
1778 Returns the version number of UTS #10 this module consults.
1784 No method will be exported.
1788 Though this module can be used without any C<table> file,
1789 to use this module easily, it is recommended to install a table file
1790 in the UCA format, by copying it under the directory
1791 <a place in @INC>/Unicode/Collate.
1793 The most preferable one is "The Default Unicode Collation Element Table",
1794 available from the Unicode Consortium's website:
1796 http://www.unicode.org/Public/UCA/
1798 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1800 If DUCET is not installed, it is recommended to copy the file
1801 from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1802 to <a place in @INC>/Unicode/Collate/allkeys.txt
1811 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1812 module (see L<Unicode::Normalize>).
1814 If you need not it (say, in the case when you need not
1815 handle any combining characters),
1816 assign C<normalization =E<gt> undef> explicitly.
1818 -- see 6.5 Avoiding Normalization, UTS #10.
1820 =item Conformance Test
1822 The Conformance Test for the UCA is available
1823 under L<http://www.unicode.org/Public/UCA/>.
1825 For F<CollationTest_SHIFTED.txt>,
1826 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1827 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1828 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1830 B<Unicode::Normalize is required to try The Conformance Test.>
1834 =head1 AUTHOR, COPYRIGHT AND LICENSE
1836 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1837 <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2005,
1838 SADAHIRO Tomoyuki. Japan. All rights reserved.
1840 This module is free software; you can redistribute it and/or
1841 modify it under the same terms as Perl itself.
1843 The file Unicode/Collate/allkeys.txt was copied directly
1844 from http://www.unicode.org/Public/UCA/4.1.0/allkeys.txt (aka DUCET).
1845 This file is Copyright (c) 1991-2005 Unicode, Inc. All rights reserved.
1846 Distributed under the Terms of Use in http://www.unicode.org/copyright.html
1852 =item Unicode Collation Algorithm - UTS #10
1854 L<http://www.unicode.org/reports/tr10/>
1856 =item The Default Unicode Collation Element Table (DUCET)
1858 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1860 =item The conformance test for the UCA
1862 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1864 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1866 =item Hangul Syllable Type
1868 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1870 =item Unicode Normalization Forms - UAX #15
1872 L<http://www.unicode.org/reports/tr15/>