1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
17 our $VERSION = '0.52';
18 our $PACKAGE = __PACKAGE__;
20 my @Path = qw(Unicode Collate);
21 my $KeyFile = "allkeys.txt";
24 use constant TRUE => 1;
25 use constant FALSE => "";
26 use constant NOMATCHPOS => -1;
28 # A coderef to get combining class imported from Unicode::Normalize
29 # (i.e. \&Unicode::Normalize::getCombinClass).
30 # This is also used as a HAS_UNICODE_NORMALIZE flag.
34 use constant MinLevel => 1;
35 use constant MaxLevel => 4;
37 # Minimum weights at level 2 and 3, respectively
38 use constant Min2Wt => 0x20;
39 use constant Min3Wt => 0x02;
41 # Shifted weight at 4th level
42 use constant Shift4Wt => 0xFFFF;
44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45 # PROBLEM: The Default Unicode Collation Element Table
46 # has weights over 0xFFFF at the 4th level.
47 # The tie-breaking in the variable weights
48 # other than "shift" (as well as "shift-trimmed") is unreliable.
49 use constant VCE_TEMPLATE => 'Cn4';
51 # A sort key: 16-bit weights
52 # See also the PROBLEM on VCE_TEMPLATE above.
53 use constant KEY_TEMPLATE => 'n*';
55 # Level separator in a sort key:
56 # i.e. pack(KEY_TEMPLATE, 0)
57 use constant LEVEL_SEP => "\0\0";
59 # As Unicode code point separator for hash keys.
60 # A joined code point string (denoted by JCPS below)
61 # like "65;768" is used for internal processing
62 # instead of Perl's Unicode string like "\x41\x{300}",
63 # as the native code point is different from the Unicode code point
65 # This character must not be included in any stringified
66 # representation of an integer.
67 use constant CODE_SEP => ';';
69 # boolean values of variable weights
70 use constant NON_VAR => 0; # Non-Variable character
71 use constant VAR => 1; # Variable character
73 # specific code points
74 use constant Hangul_LBase => 0x1100;
75 use constant Hangul_LIni => 0x1100;
76 use constant Hangul_LFin => 0x1159;
77 use constant Hangul_LFill => 0x115F;
78 use constant Hangul_VBase => 0x1161;
79 use constant Hangul_VIni => 0x1160; # from Vowel Filler
80 use constant Hangul_VFin => 0x11A2;
81 use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
82 use constant Hangul_TIni => 0x11A8;
83 use constant Hangul_TFin => 0x11F9;
84 use constant Hangul_TCount => 28;
85 use constant Hangul_NCount => 588;
86 use constant Hangul_SBase => 0xAC00;
87 use constant Hangul_SIni => 0xAC00;
88 use constant Hangul_SFin => 0xD7A3;
89 use constant CJK_UidIni => 0x4E00;
90 use constant CJK_UidFin => 0x9FA5;
91 use constant CJK_UidF41 => 0x9FBB;
92 use constant CJK_ExtAIni => 0x3400;
93 use constant CJK_ExtAFin => 0x4DB5;
94 use constant CJK_ExtBIni => 0x20000;
95 use constant CJK_ExtBFin => 0x2A6D6;
96 use constant BMP_Max => 0xFFFF;
98 # Logical_Order_Exception in PropList.txt
99 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
101 sub UCA_Version { "14" }
103 sub Base_Unicode_Version { "4.1.0" }
108 return pack('U*', @_);
112 return unpack('U*', shift(@_).pack('U*'));
119 blanked non-ignorable shifted shift-trimmed
120 / } = (); # keys lowercased
123 alternate backwards level normalization rearrange
124 katakana_before_hiragana upper_before_lower
125 overrideHangul overrideCJK preprocess UCA_Version
126 hangul_terminator variable
130 entry mapping table maxlength
131 ignoreChar ignoreName undefChar undefName variableTable
132 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
133 derivCode normCode rearrangeHash
136 # The hash key 'ignored' is deleted at v 0.21.
137 # The hash key 'isShift' is deleted at v 0.23.
138 # The hash key 'combining' is deleted at v 0.24.
139 # The hash key 'entries' is deleted at v 0.30.
140 # The hash key 'L3_ignorable' is deleted at v 0.40.
144 return $self->{versionTable} || 'unknown';
147 my (%ChangeOK, %ChangeNG);
148 @ChangeOK{ @ChangeOK } = ();
149 @ChangeNG{ @ChangeNG } = ();
155 if (exists $hash{variable} && exists $hash{alternate}) {
156 delete $hash{alternate};
158 elsif (!exists $hash{variable} && exists $hash{alternate}) {
159 $hash{variable} = $hash{alternate};
161 foreach my $k (keys %hash) {
162 if (exists $ChangeOK{$k}) {
163 $old{$k} = $self->{$k};
164 $self->{$k} = $hash{$k};
166 elsif (exists $ChangeNG{$k}) {
167 croak "change of $k via change() is not allowed!";
171 $self->checkCollator();
172 return wantarray ? %old : $self;
177 my $key = shift; # 'level' or 'backwards'
178 MinLevel <= $level or croak sprintf
179 "Illegal level %d (in value for key '%s') lower than %d.",
180 $level, $key, MinLevel;
181 $level <= MaxLevel or croak sprintf
182 "Unsupported level %d (in value for key '%s') higher than %d.",
183 $level, $key, MaxLevel;
189 11 => \&_derivCE_9, # 11 == 9
195 _checkLevel($self->{level}, "level");
197 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
198 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
200 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
201 $self->{alternateTable} || 'shifted';
202 $self->{variable} = $self->{alternate} = lc($self->{variable});
203 exists $VariableOK{ $self->{variable} }
204 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
206 if (! defined $self->{backwards}) {
207 $self->{backwardsFlag} = 0;
209 elsif (! ref $self->{backwards}) {
210 _checkLevel($self->{backwards}, "backwards");
211 $self->{backwardsFlag} = 1 << $self->{backwards};
215 $self->{backwardsFlag} = 0;
216 for my $b (@{ $self->{backwards} }) {
217 _checkLevel($b, "backwards");
220 for my $v (sort keys %level) {
221 $self->{backwardsFlag} += 1 << $v;
225 defined $self->{rearrange} or $self->{rearrange} = [];
226 ref $self->{rearrange}
227 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
229 # keys of $self->{rearrangeHash} are $self->{rearrange}.
230 $self->{rearrangeHash} = undef;
232 if (@{ $self->{rearrange} }) {
233 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
236 $self->{normCode} = undef;
238 if (defined $self->{normalization}) {
239 eval { require Unicode::Normalize };
240 $@ and croak "Unicode::Normalize is required to normalize strings";
242 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
244 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
245 $self->{normCode} = \&Unicode::Normalize::NFD;
247 elsif ($self->{normalization} ne 'prenormalized') {
248 my $norm = $self->{normalization};
249 $self->{normCode} = sub {
250 Unicode::Normalize::normalize($norm, shift);
252 eval { $self->{normCode}->("") }; # try
253 $@ and croak "$PACKAGE unknown normalization form name: $norm";
262 my $self = bless { @_ }, $class;
264 # If undef is passed explicitly, no file is read.
265 $self->{table} = $KeyFile if ! exists $self->{table};
266 $self->read_table() if defined $self->{table};
268 if ($self->{entry}) {
269 while ($self->{entry} =~ /([^\n]+)/g) {
270 $self->parseEntry($1);
274 $self->{level} ||= MaxLevel;
275 $self->{UCA_Version} ||= UCA_Version();
277 $self->{overrideHangul} = FALSE
278 if ! exists $self->{overrideHangul};
279 $self->{overrideCJK} = FALSE
280 if ! exists $self->{overrideCJK};
281 $self->{normalization} = 'NFD'
282 if ! exists $self->{normalization};
283 $self->{rearrange} = $self->{rearrangeTable} ||
284 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
285 if ! exists $self->{rearrange};
286 $self->{backwards} = $self->{backwardsTable}
287 if ! exists $self->{backwards};
289 $self->checkCollator();
298 foreach my $d (@INC) {
299 $f = File::Spec->catfile($d, @Path, $self->{table});
300 last if open($fh, $f);
304 $f = File::Spec->catfile(@Path, $self->{table});
305 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
308 while (my $line = <$fh>) {
309 next if $line =~ /^\s*#/;
310 unless ($line =~ s/^\s*\@//) {
311 $self->parseEntry($line);
316 if ($line =~ /^version\s*(\S*)/) {
317 $self->{versionTable} ||= $1;
319 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
320 $self->{variableTable} ||= $1;
322 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
323 $self->{alternateTable} ||= $1;
325 elsif ($line =~ /^backwards\s+(\S*)/) {
326 push @{ $self->{backwardsTable} }, $1;
328 elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
329 push @{ $self->{forwardsTable} }, $1;
331 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
332 push @{ $self->{rearrangeTable} }, _getHexArray($1);
340 ## get $line, parse it, and write an entry in $self
346 my($name, $entry, @uv, @key);
348 return if $line !~ /^\s*[0-9A-Fa-f]/;
350 # removes comment and gets name
352 if $line =~ s/[#%]\s*(.*)//;
353 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
356 my($e, $k) = split /;/, $line;
357 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
360 @uv = _getHexArray($e);
363 $entry = join(CODE_SEP, @uv); # in JCPS
365 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
366 my $ele = pack_U(@uv);
368 # regarded as if it were not entried in the table
370 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
372 # replaced as completely ignorable
373 $k = '[.0000.0000.0000.0000]'
374 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
377 # replaced as completely ignorable
378 $k = '[.0000.0000.0000.0000]'
379 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
381 my $is_L3_ignorable = TRUE;
383 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
384 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
385 my @wt = _getHexArray($arr);
386 push @key, pack(VCE_TEMPLATE, $var, @wt);
387 $is_L3_ignorable = FALSE
388 if $wt[0] || $wt[1] || $wt[2];
389 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
390 # is completely ignorable.
391 # For expansion, an entry $is_L3_ignorable
392 # if and only if "all" CEs are [.0000.0000.0000].
395 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
398 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
399 and $self->{maxlength}{$uv[0]} = @uv;
405 ## VCE = _varCE(variable term, VCE)
411 if ($vbl eq 'non-ignorable') {
414 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
417 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
418 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
420 elsif ($vbl eq 'blanked') {
424 return pack(VCE_TEMPLATE, $var, @wt[0..2],
425 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
432 $self->visualizeSortKey($self->getSortKey(@_));
438 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
440 if ($self->{UCA_Version} <= 8) {
441 $view =~ s/ ?0000 ?/|/g;
443 $view =~ s/\b0000\b/|/g;
450 ## arrayref of JCPS = splitEnt(string to be collated)
451 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
458 my $code = $self->{preprocess};
459 my $norm = $self->{normCode};
460 my $map = $self->{mapping};
461 my $max = $self->{maxlength};
462 my $reH = $self->{rearrangeHash};
463 my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
468 $code and croak "Preprocess breaks character positions. "
469 . "Don't use with index(), match(), etc.";
470 $norm and croak "Normalization breaks character positions. "
471 . "Don't use with index(), match(), etc.";
476 $str = &$code($str) if ref $code;
477 $str = &$norm($str) if ref $norm;
480 # get array of Unicode code point of string.
481 my @src = unpack_U($str);
484 # Character positions are not kept if rearranged,
485 # then neglected if $wLen is true.
486 if ($reH && ! $wLen) {
487 for (my $i = 0; $i < @src; $i++) {
488 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
489 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
495 # remove a code point marked as a completely ignorable.
496 for (my $i = 0; $i < @src; $i++) {
498 if _isIllegal($src[$i]) || ($ver9 &&
499 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
502 for (my $i = 0; $i < @src; $i++) {
505 # skip removed code point
506 if (! defined $jcps) {
508 $buf[-1][2] = $i + 1;
517 my $temp_jcps = $jcps;
519 my $maxLen = $max->{$jcps};
521 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
522 next if ! defined $src[$p];
523 $temp_jcps .= CODE_SEP . $src[$p];
525 if ($map->{$temp_jcps}) {
531 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
532 # This process requires Unicode::Normalize.
533 # If "normalization" is undef, here should be skipped *always*
534 # (in spite of bool value of $CVgetCombinClass),
535 # since canonical ordering cannot be expected.
536 # Blocked combining character should not be contracted.
538 if ($self->{normalization})
539 # $self->{normCode} is false in the case of "prenormalized".
544 for (my $p = $i + 1; $p < @src; $p++) {
545 next if ! defined $src[$p];
546 $curCC = $CVgetCombinClass->($src[$p]);
548 my $tail = CODE_SEP . $src[$p];
549 if ($preCC != $curCC && $map->{$jcps.$tail}) {
559 # skip completely ignorable
560 if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
562 $buf[-1][2] = $i + 1;
567 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
574 ## list of VCE = getWt(JCPS)
580 my $vbl = $self->{variable};
581 my $map = $self->{mapping};
582 my $der = $self->{derivCode};
584 return if !defined $u;
585 return map(_varCE($vbl, $_), @{ $map->{$u} })
588 # JCPS must not be a contraction, then it's a code point.
589 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
590 my $hang = $self->{overrideHangul};
593 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
595 elsif (!defined $hang) {
596 @hangulCE = $der->($u);
599 my $max = $self->{maxlength};
600 my @decH = _decompHangul($u);
603 my $contract = join(CODE_SEP, @decH);
604 @decH = ($contract) if $map->{$contract};
605 } else { # must be <@decH == 3>
606 if ($max->{$decH[0]}) {
607 my $contract = join(CODE_SEP, @decH);
608 if ($map->{$contract}) {
611 $contract = join(CODE_SEP, @decH[0,1]);
612 $map->{$contract} and @decH = ($contract, $decH[2]);
614 # even if V's ignorable, LT contraction is not supported.
615 # If such a situatution were required, NFD should be used.
617 if (@decH == 3 && $max->{$decH[1]}) {
618 my $contract = join(CODE_SEP, @decH[1,2]);
619 $map->{$contract} and @decH = ($decH[0], $contract);
624 $map->{$_} ? @{ $map->{$_} } : $der->($_);
627 return map _varCE($vbl, $_), @hangulCE;
629 elsif (_isUIdeo($u, $self->{UCA_Version})) {
630 my $cjk = $self->{overrideCJK};
631 return map _varCE($vbl, $_),
633 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
634 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
639 return map _varCE($vbl, $_), $der->($u);
645 ## string sortkey = getSortKey(string arg)
650 my $lev = $self->{level};
651 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
652 my $v2i = $self->{UCA_Version} >= 9 &&
653 $self->{variable} ne 'non-ignorable';
655 my @buf; # weight arrays
656 if ($self->{hangul_terminator}) {
658 foreach my $jcps (@$rEnt) {
659 # weird things like VL, TL-contraction are not considered!
661 foreach my $u (split /;/, $jcps) {
662 $curHST .= getHST($u);
664 if ($preHST && !$curHST || # hangul before non-hangul
665 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
666 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
667 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
669 push @buf, $self->getWtHangulTerm();
673 push @buf, $self->getWt($jcps);
675 $preHST # end at hangul
676 and push @buf, $self->getWtHangulTerm();
679 foreach my $jcps (@$rEnt) {
680 push @buf, $self->getWt($jcps);
685 my @ret = ([],[],[],[]);
686 my $last_is_variable;
688 foreach my $vwt (@buf) {
689 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
691 # "Ignorable (L1, L2) after Variable" since track. v. 9
694 $last_is_variable = TRUE;
696 elsif (!$wt[0]) { # ignorable
697 next if $last_is_variable;
700 $last_is_variable = FALSE;
703 foreach my $v (0..$lev-1) {
704 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
708 # modification of tertiary weights
709 if ($self->{upper_before_lower}) {
710 foreach my $w (@{ $ret[2] }) {
711 if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
712 elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
713 elsif ($w == 0x1C) { $w += 1 } # square upper
714 elsif ($w == 0x1D) { $w -= 1 } # square lower
717 if ($self->{katakana_before_hiragana}) {
718 foreach my $w (@{ $ret[2] }) {
719 if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
720 elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
724 if ($self->{backwardsFlag}) {
725 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
726 if ($self->{backwardsFlag} & (1 << $v)) {
727 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
732 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
737 ## int compare = cmp(string a, string b)
739 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
740 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
741 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
742 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
743 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
744 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
745 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
748 ## list[strings] sorted = sort(list[strings] arg)
754 sort{ $a->[0] cmp $b->[0] }
755 map [ $obj->getSortKey($_), $_ ], @_;
762 (CJK_UidIni <= $u && $u <= CJK_UidF41)
764 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
765 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
769 my $aaaa = $base + ($u >> 15);
770 my $bbbb = ($u & 0x7FFF) | 0x8000;
772 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
773 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
779 (CJK_UidIni <= $u && $u <= CJK_UidFin)
781 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
782 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
786 my $aaaa = $base + ($u >> 15);
787 my $bbbb = ($u & 0x7FFF) | 0x8000;
789 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
790 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
795 my $aaaa = 0xFF80 + ($code >> 15);
796 my $bbbb = ($code & 0x7FFF) | 0x8000;
798 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
799 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
804 return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
808 my ($u, $uca_vers) = @_;
811 ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
813 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
815 (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
820 sub getWtHangulTerm {
822 return _varCE($self->{variable},
823 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
828 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
830 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
833 # $code *must* be in Hangul syllable.
834 # Check it before you enter here.
838 my $si = $code - Hangul_SBase;
839 my $li = int( $si / Hangul_NCount);
840 my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
841 my $ti = $si % Hangul_TCount;
845 $ti ? (Hangul_TBase + $ti) : (),
851 return ! defined $code # removed
852 || ($code < 0 || 0x10FFFF < $code) # out of range
853 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
854 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
855 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
859 # Hangul Syllable Type
863 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
864 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
865 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
866 Hangul_SIni <= $u && $u <= Hangul_SFin ?
867 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
872 ## bool _nonIgnorAtLevel(arrayref weights, int level)
874 sub _nonIgnorAtLevel($$)
877 return if ! defined $wt;
879 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
884 ## arrayref of arrayref[weights] source,
885 ## arrayref of arrayref[weights] substr,
887 ## * comparison of graphemes vs graphemes.
888 ## @$source >= @$substr must be true (check it before call this);
896 for my $g (0..@$substr-1){
897 # Do the $g'th graphemes have the same number of AV weigths?
898 return if @{ $source->[$g] } != @{ $substr->[$g] };
900 for my $w (0..@{ $substr->[$g] }-1) {
901 for my $v (0..$lev-1) {
902 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
910 ## (int position, int length)
911 ## int position = index(string, substring, position, [undoc'ed grobal])
913 ## With "grobal" (only for the list context),
914 ## returns list of arrayref[position, length].
920 my $len = length($str);
921 my $subE = $self->splitEnt(shift);
922 my $pos = @_ ? shift : 0;
923 $pos = 0 if $pos < 0;
926 my $lev = $self->{level};
927 my $v2i = $self->{UCA_Version} >= 9 &&
928 $self->{variable} ne 'non-ignorable';
931 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
933 ? map([$_, 0], $temp..$len)
934 : wantarray ? ($temp,0) : $temp;
937 and return wantarray ? () : NOMATCHPOS;
938 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
940 or return wantarray ? () : NOMATCHPOS;
942 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
944 my $last_is_variable;
945 for my $vwt (map $self->getWt($_), @$subE) {
946 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
947 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
949 # "Ignorable (L1, L2) after Variable" since track. v. 9
952 $last_is_variable = TRUE;
954 elsif (!$wt[0]) { # ignorable
955 $to_be_pushed = FALSE if $last_is_variable;
958 $last_is_variable = FALSE;
962 if (@subWt && !$var && !$wt[0]) {
963 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
965 push @subWt, [ \@wt ];
970 my $end = @$strE - 1;
972 $last_is_variable = FALSE; # reuse
973 for (my $i = 0; $i <= $end; ) { # no $i++
977 while ($i <= $end && $found_base == 0) {
978 for my $vwt ($self->getWt($strE->[$i][0])) {
979 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
980 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
982 # "Ignorable (L1, L2) after Variable" since track. v. 9
985 $last_is_variable = TRUE;
987 elsif (!$wt[0]) { # ignorable
988 $to_be_pushed = FALSE if $last_is_variable;
991 $last_is_variable = FALSE;
995 if (@strWt && !$var && !$wt[0]) {
996 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
997 $finPos[-1] = $strE->[$i][2];
998 } elsif ($to_be_pushed) {
999 push @strWt, [ \@wt ];
1000 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
1001 $finPos[-1] = NOMATCHPOS if $found_base;
1002 push @finPos, $strE->[$i][2];
1011 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1012 if ($iniPos[0] != NOMATCHPOS &&
1013 $finPos[$#subWt] != NOMATCHPOS &&
1014 _eqArray(\@strWt, \@subWt, $lev)) {
1015 my $temp = $iniPos[0] + $pos;
1018 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1019 splice @strWt, 0, $#subWt;
1020 splice @iniPos, 0, $#subWt;
1021 splice @finPos, 0, $#subWt;
1025 ? ($temp, $finPos[$#subWt] - $iniPos[0])
1037 : wantarray ? () : NOMATCHPOS;
1041 ## scalarref to matching part = match(string, substring)
1046 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1047 my $temp = substr($_[0], $pos, $len);
1048 return wantarray ? $temp : \$temp;
1049 # An lvalue ref \substr should be avoided,
1050 # since its value is affected by modification of its referent.
1058 ## arrayref matching parts = gmatch(string, substring)
1065 return map substr($str, $_->[0], $_->[1]),
1066 $self->index($str, $sub, 0, 'g');
1070 ## bool subst'ed = subst(string, substring, replace)
1075 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1077 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1079 my $mat = substr($_[0], $pos, $len);
1080 substr($_[0], $pos, $len, $code->($mat));
1082 substr($_[0], $pos, $len, $_[2]);
1092 ## int count = gsubst(string, substring, replace)
1097 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1100 # Replacement is carried out from the end, then use reverse.
1101 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1103 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1104 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1106 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1118 Unicode::Collate - Unicode Collation Algorithm
1122 use Unicode::Collate;
1125 $Collator = Unicode::Collate->new(%tailoring);
1128 @sorted = $Collator->sort(@not_sorted);
1131 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1133 # If %tailoring is false (i.e. empty),
1134 # $Collator should do the default collation.
1138 This module is an implementation of Unicode Technical Standard #10
1139 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1141 =head2 Constructor and Tailoring
1143 The C<new> method returns a collator object.
1145 $Collator = Unicode::Collate->new(
1146 UCA_Version => $UCA_Version,
1147 alternate => $alternate, # deprecated: use of 'variable' is recommended.
1148 backwards => $levelNumber, # or \@levelNumbers
1150 hangul_terminator => $term_primary_weight,
1151 ignoreName => qr/$ignoreName/,
1152 ignoreChar => qr/$ignoreChar/,
1153 katakana_before_hiragana => $bool,
1154 level => $collationLevel,
1155 normalization => $normalization_form,
1156 overrideCJK => \&overrideCJK,
1157 overrideHangul => \&overrideHangul,
1158 preprocess => \&preprocess,
1159 rearrange => \@charList,
1161 undefName => qr/$undefName/,
1162 undefChar => qr/$undefChar/,
1163 upper_before_lower => $bool,
1164 variable => $variable,
1171 If the tracking version number of UCA is given,
1172 behavior of that tracking version is emulated on collating.
1173 If omitted, the return value of C<UCA_Version()> is used.
1174 C<UCA_Version()> should return the latest tracking version supported.
1176 The supported tracking version: 8, 9, 11, or 14.
1178 UCA Unicode Standard DUCET (@version)
1179 ---------------------------------------------------
1180 8 3.1 3.0.1 (3.0.1d9)
1181 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1182 11 4.0 4.0.0 (4.0.0)
1183 14 4.1.0 4.1.0 (4.1.0)
1185 Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1189 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1191 For backward compatibility, C<alternate> (old name) can be used
1192 as an alias for C<variable>.
1196 -- see 3.1.2 French Accents, UTS #10.
1198 backwards => $levelNumber or \@levelNumbers
1200 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1201 If omitted, forwards at all the levels.
1205 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1207 If the same character (or a sequence of characters) exists
1208 in the collation element table through C<table>,
1209 mapping to collation elements is overrided.
1210 If it does not exist, the mapping is defined additionally.
1212 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1213 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1214 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1215 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1216 006C 006C ; [.0F4C.0020.0002.006C] # ll
1217 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1218 004C 004C ; [.0F4C.0020.0008.004C] # LL
1219 00F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1220 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1221 00D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1222 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1225 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1226 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1227 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1230 B<NOTE:> The code point in the UCA file format (before C<';'>)
1231 B<must> be a Unicode code point (defined as hexadecimal),
1232 but not a native code point.
1233 So C<0063> must always denote C<U+0063>,
1234 but not a character of C<"\x63">.
1236 Weighting may vary depending on collation element table.
1237 So ensure the weights defined in C<entry> will be consistent with
1238 those in the collation element table loaded via C<table>.
1240 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1241 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1242 (as a value between C<0E60> and C<0E6D>)
1243 makes ordering as C<C E<lt> CH E<lt> D>.
1244 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1245 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1246 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1247 and C<c-curl> (C<U+0255>) with C<0E69>.
1248 Then primary weight C<0E6A> for C<CH> makes C<CH>
1249 ordered between C<c-curl> and C<D>.
1251 =item hangul_terminator
1253 -- see 7.1.4 Trailing Weights, UTS #10.
1255 If a true value is given (non-zero but should be positive),
1256 it will be added as a terminator primary weight to the end of
1257 every standard Hangul syllable. Secondary and any higher weights
1258 for terminator are set to zero.
1259 If the value is false or C<hangul_terminator> key does not exist,
1260 insertion of terminator weights will not be performed.
1262 Boundaries of Hangul syllables are determined
1263 according to conjoining Jamo behavior in F<the Unicode Standard>
1264 and F<HangulSyllableType.txt>.
1266 B<Implementation Note:>
1267 (1) For expansion mapping (Unicode character mapped
1268 to a sequence of collation elements), a terminator will not be added
1269 between collation elements, even if Hangul syllable boundary exists there.
1270 Addition of terminator is restricted to the next position
1271 to the last collation element.
1273 (2) Non-conjoining Hangul letters
1274 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1275 automatically terminated with a terminator primary weight.
1276 These characters may need terminator included in a collation element
1283 -- see 3.2.2 Variable Weighting, UTS #10.
1285 Makes the entry in the table completely ignorable;
1286 i.e. as if the weights were zero at all level.
1288 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1289 will be ignored. Through C<ignoreName>, any character whose name
1290 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1293 E.g. when 'a' and 'e' are ignorable,
1294 'element' is equal to 'lament' (or 'lmnt').
1296 =item katakana_before_hiragana
1298 -- see 7.3.1 Tertiary Weight Table, UTS #10.
1300 By default, hiragana is before katakana.
1301 If the parameter is made true, this is reversed.
1303 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1304 distinctions must occur in level 3, and their weights at level 3 must be
1305 same as those mentioned in 7.3.1, UTS #10.
1306 If you define your collation elements which violate this requirement,
1307 this parameter does not work validly.
1311 -- see 4.3 Form Sort Key, UTS #10.
1313 Set the maximum level.
1314 Any higher levels than the specified one are ignored.
1316 Level 1: alphabetic ordering
1317 Level 2: diacritic ordering
1318 Level 3: case ordering
1319 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1323 If omitted, the maximum is the 4th.
1327 -- see 4.1 Normalize, UTS #10.
1329 If specified, strings are normalized before preparation of sort keys
1330 (the normalization is executed after preprocess).
1332 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1333 as C<$normalization_form>.
1334 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1335 See C<Unicode::Normalize::normalize()> for detail.
1336 If omitted, C<'NFD'> is used.
1338 C<normalization> is performed after C<preprocess> (if defined).
1340 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1341 though they are not concerned with C<Unicode::Normalize::normalize()>.
1343 If C<undef> (not a string C<"undef">) is passed explicitly
1344 as the value for this key,
1345 any normalization is not carried out (this may make tailoring easier
1346 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1347 only contiguous contractions are resolved;
1348 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1349 C<A-cedilla-ring> would be primary equal to C<A>.
1351 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1352 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1354 In the case of C<(normalization =E<gt> "prenormalized")>,
1355 any normalization is not performed, but
1356 non-contiguous contractions with combining characters are performed.
1358 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1359 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1360 If source strings are finely prenormalized,
1361 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1363 Except C<(normalization =E<gt> undef)>,
1364 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1368 -- see 7.1 Derived Collation Elements, UTS #10.
1370 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1371 but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1372 C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1373 are lesser than C<CJK Unified Ideographs Extension> (its range is
1374 C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
1376 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1378 ex. CJK Unified Ideographs in the JIS code point order.
1380 overrideCJK => sub {
1381 my $u = shift; # get a Unicode codepoint
1382 my $b = pack('n', $u); # to UTF-16BE
1383 my $s = your_unicode_to_sjis_converter($b); # convert
1384 my $n = unpack('n', $s); # convert sjis to short
1385 [ $n, 0x20, 0x2, $u ]; # return the collation element
1388 ex. ignores all CJK Unified Ideographs.
1390 overrideCJK => sub {()}, # CODEREF returning empty list
1392 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1393 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1395 If C<undef> is passed explicitly as the value for this key,
1396 weights for CJK Unified Ideographs are treated as undefined.
1397 But assignment of weight for CJK Unified Ideographs
1398 in table or C<entry> is still valid.
1400 =item overrideHangul
1402 -- see 7.1 Derived Collation Elements, UTS #10.
1404 By default, Hangul Syllables are decomposed into Hangul Jamo,
1405 even if C<(normalization =E<gt> undef)>.
1406 But the mapping of Hangul Syllables may be overrided.
1408 This parameter works like C<overrideCJK>, so see there for examples.
1410 If you want to override the mapping of Hangul Syllables,
1411 NFD, NFKD, and FCD are not appropriate,
1412 since they will decompose Hangul Syllables before overriding.
1414 If C<undef> is passed explicitly as the value for this key,
1415 weight for Hangul Syllables is treated as undefined
1416 without decomposition into Hangul Jamo.
1417 But definition of weight for Hangul Syllables
1418 in table or C<entry> is still valid.
1422 -- see 5.1 Preprocessing, UTS #10.
1424 If specified, the coderef is used to preprocess
1425 before the formation of sort keys.
1427 ex. dropping English articles, such as "a" or "the".
1428 Then, "the pen" is before "a pencil".
1432 $str =~ s/\b(?:an?|the)\s+//gi;
1436 C<preprocess> is performed before C<normalization> (if defined).
1440 -- see 3.1.3 Rearrangement, UTS #10.
1442 Characters that are not coded in logical order and to be rearranged.
1443 If C<UCA_Version> is equal to or lesser than 11, default is:
1445 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1447 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1448 (a reference to empty list) as the value for this key.
1450 If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
1452 B<According to the version 9 of UCA, this parameter shall not be used;
1453 but it is not warned at present.>
1457 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1459 You can use another collation element table if desired.
1461 The table file should locate in the F<Unicode/Collate> directory
1462 on C<@INC>. Say, if the filename is F<Foo.txt>,
1463 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1465 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1466 If you will prepare your own table file, any name other than F<allkeys.txt>
1467 may be better to avoid namespace conflict.
1469 If C<undef> is passed explicitly as the value for this key,
1470 no file is read (but you can define collation elements via C<entry>).
1472 A typical way to define a collation element table
1473 without any file of table:
1475 $onlyABC = Unicode::Collate->new(
1477 entry => << 'ENTRIES',
1478 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1479 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1480 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1481 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1482 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1483 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1487 If C<ignoreName> or C<undefName> is used, character names should be
1488 specified as a comment (following C<#>) on each line.
1494 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1496 Undefines the collation element as if it were unassigned in the table.
1497 This reduces the size of the table.
1498 If an unassigned character appears in the string to be collated,
1499 the sort key is made from its codepoint
1500 as a single-character collation element,
1501 as it is greater than any other assigned collation elements
1502 (in the codepoint order among the unassigned characters).
1503 But, it'd be better to ignore characters
1504 unfamiliar to you and maybe never used.
1506 Through C<undefChar>, any character matching C<qr/$undefChar/>
1507 will be undefined. Through C<undefName>, any character whose name
1508 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1511 ex. Collation weights for beyond-BMP characters are not stored in object:
1513 undefChar => qr/[^\0-\x{fffd}]/,
1515 =item upper_before_lower
1517 -- see 6.6 Case Comparisons, UTS #10.
1519 By default, lowercase is before uppercase.
1520 If the parameter is made true, this is reversed.
1522 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1523 distinctions must occur in level 3, and their weights at level 3 must be
1524 same as those mentioned in 7.3.1, UTS #10.
1525 If you define your collation elements which differs from this requirement,
1526 this parameter doesn't work validly.
1530 -- see 3.2.2 Variable Weighting, UTS #10.
1532 This key allows to variable weighting for variable collation elements,
1533 which are marked with an ASTERISK in the table
1534 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1536 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1538 These names are case-insensitive.
1539 By default (if specification is omitted), 'shifted' is adopted.
1541 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1542 considered at the 4th level.
1544 'Non-Ignorable' Variable elements are not reset to ignorable.
1546 'Shifted' Variable elements are made ignorable at levels 1 through 3
1547 their level 4 weight is replaced by the old level 1 weight.
1548 Level 4 weight for Non-Variable elements is 0xFFFF.
1550 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1555 =head2 Methods for Collation
1559 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1561 Sorts a list of strings.
1563 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1565 Returns 1 (when C<$a> is greater than C<$b>)
1566 or 0 (when C<$a> is equal to C<$b>)
1567 or -1 (when C<$a> is lesser than C<$b>).
1569 =item C<$result = $Collator-E<gt>eq($a, $b)>
1571 =item C<$result = $Collator-E<gt>ne($a, $b)>
1573 =item C<$result = $Collator-E<gt>lt($a, $b)>
1575 =item C<$result = $Collator-E<gt>le($a, $b)>
1577 =item C<$result = $Collator-E<gt>gt($a, $b)>
1579 =item C<$result = $Collator-E<gt>ge($a, $b)>
1581 They works like the same name operators as theirs.
1583 eq : whether $a is equal to $b.
1584 ne : whether $a is not equal to $b.
1585 lt : whether $a is lesser than $b.
1586 le : whether $a is lesser than $b or equal to $b.
1587 gt : whether $a is greater than $b.
1588 ge : whether $a is greater than $b or equal to $b.
1590 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1592 -- see 4.3 Form Sort Key, UTS #10.
1596 You compare the sort keys using a binary comparison
1597 and get the result of the comparison of the strings using UCA.
1599 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1603 $Collator->cmp($a, $b)
1605 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1607 Converts a sorting key into its representation form.
1608 If C<UCA_Version> is 8, the output is slightly different.
1610 use Unicode::Collate;
1611 my $c = Unicode::Collate->new();
1612 print $c->viewSortKey("Perl"),"\n";
1615 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1616 # Level 1 Level 2 Level 3 Level 4
1620 =head2 Methods for Searching
1622 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1623 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1624 C<subst>, C<gsubst>) is croaked,
1625 as the position and the length might differ
1626 from those on the specified string.
1627 (And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1629 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1630 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1631 but they are not aware of any pattern, but only a literal substring.
1635 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1637 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1639 If C<$substring> matches a part of C<$string>, returns
1640 the position of the first occurrence of the matching part in scalar context;
1641 in list context, returns a two-element list of
1642 the position and the length of the matching part.
1644 If C<$substring> does not match any part of C<$string>,
1645 returns C<-1> in scalar context and
1646 an empty list in list context.
1650 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1651 # (normalization => undef) is REQUIRED.
1652 my $str = "Ich muß studieren Perl.";
1655 if (my($pos,$len) = $Collator->index($str, $sub)) {
1656 $match = substr($str, $pos, $len);
1659 and get C<"muß"> in C<$match> since C<"muß">
1660 is primary equal to C<"MÜSS">.
1662 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1664 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1666 If C<$substring> matches a part of C<$string>, in scalar context, returns
1667 B<a reference to> the first occurrence of the matching part
1668 (C<$match_ref> is always true if matches,
1669 since every reference is B<true>);
1670 in list context, returns the first occurrence of the matching part.
1672 If C<$substring> does not match any part of C<$string>,
1673 returns C<undef> in scalar context and
1674 an empty list in list context.
1678 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1679 print "matches [$$match_ref].\n";
1681 print "doesn't match.\n";
1686 if (($match) = $Collator->match($str, $sub)) { # list context
1687 print "matches [$match].\n";
1689 print "doesn't match.\n";
1692 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1694 If C<$substring> matches a part of C<$string>, returns
1695 all the matching parts (or matching count in scalar context).
1697 If C<$substring> does not match any part of C<$string>,
1698 returns an empty list.
1700 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1702 If C<$substring> matches a part of C<$string>,
1703 the first occurrence of the matching part is replaced by C<$replacement>
1704 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1706 C<$replacement> can be a C<CODEREF>,
1707 taking the matching part as an argument,
1708 and returning a string to replace the matching part
1709 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1711 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1713 If C<$substring> matches a part of C<$string>,
1714 all the occurrences of the matching part is replaced by C<$replacement>
1715 (C<$string> is modified) and return C<$count>.
1717 C<$replacement> can be a C<CODEREF>,
1718 taking the matching part as an argument,
1719 and returning a string to replace the matching part
1720 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1724 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1725 # (normalization => undef) is REQUIRED.
1726 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1727 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1729 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1730 # i.e., all the camels are made bold-faced.
1734 =head2 Other Methods
1738 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1740 Change the value of specified keys and returns the changed part.
1742 $Collator = Unicode::Collate->new(level => 4);
1744 $Collator->eq("perl", "PERL"); # false
1746 %old = $Collator->change(level => 2); # returns (level => 4).
1748 $Collator->eq("perl", "PERL"); # true
1750 $Collator->change(%old); # returns (level => 2).
1752 $Collator->eq("perl", "PERL"); # false
1754 Not all C<(key,value)>s are allowed to be changed.
1755 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1757 In the scalar context, returns the modified collator
1758 (but it is B<not> a clone from the original).
1760 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1762 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1764 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1766 =item C<$version = $Collator-E<gt>version()>
1768 Returns the version number (a string) of the Unicode Standard
1769 which the C<table> file used by the collator object is based on.
1770 If the table does not include a version line (starting with C<@version>),
1771 returns C<"unknown">.
1773 =item C<UCA_Version()>
1775 Returns the tracking version number of UTS #10 this module consults.
1777 =item C<Base_Unicode_Version()>
1779 Returns the version number of UTS #10 this module consults.
1785 No method will be exported.
1789 Though this module can be used without any C<table> file,
1790 to use this module easily, it is recommended to install a table file
1791 in the UCA format, by copying it under the directory
1792 <a place in @INC>/Unicode/Collate.
1794 The most preferable one is "The Default Unicode Collation Element Table"
1795 (aka DUCET), available from the Unicode Consortium's website:
1797 http://www.unicode.org/Public/UCA/
1799 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1801 If DUCET is not installed, it is recommended to copy the file
1802 from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1803 to <a place in @INC>/Unicode/Collate/allkeys.txt
1812 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1813 module (see L<Unicode::Normalize>).
1815 If you need not it (say, in the case when you need not
1816 handle any combining characters),
1817 assign C<normalization =E<gt> undef> explicitly.
1819 -- see 6.5 Avoiding Normalization, UTS #10.
1821 =item Conformance Test
1823 The Conformance Test for the UCA is available
1824 under L<http://www.unicode.org/Public/UCA/>.
1826 For F<CollationTest_SHIFTED.txt>,
1827 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1828 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1829 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1831 B<Unicode::Normalize is required to try The Conformance Test.>
1835 =head1 AUTHOR, COPYRIGHT AND LICENSE
1837 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1838 <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2005,
1839 SADAHIRO Tomoyuki. Japan. All rights reserved.
1841 This module is free software; you can redistribute it and/or
1842 modify it under the same terms as Perl itself.
1844 The file Unicode/Collate/allkeys.txt was copied directly
1845 from L<http://www.unicode.org/Public/UCA/4.1.0/allkeys.txt>.
1846 This file is Copyright (c) 1991-2005 Unicode, Inc. All rights reserved.
1847 Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
1853 =item Unicode Collation Algorithm - UTS #10
1855 L<http://www.unicode.org/reports/tr10/>
1857 =item The Default Unicode Collation Element Table (DUCET)
1859 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1861 =item The conformance test for the UCA
1863 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1865 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1867 =item Hangul Syllable Type
1869 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1871 =item Unicode Normalization Forms - UAX #15
1873 L<http://www.unicode.org/reports/tr15/>