1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
17 our $VERSION = '0.25';
18 our $PACKAGE = __PACKAGE__;
20 our @ISA = qw(Exporter);
22 our %EXPORT_TAGS = ();
26 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27 our $KeyFile = "allkeys.txt";
31 eval { require Unicode::UCD };
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
38 foreach my $d (@INC) {
39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
41 $UNICODE_VERSION = '3.0.1';
49 use constant TRUE => 1;
50 use constant FALSE => "";
51 use constant NOMATCHPOS => -1;
53 # A coderef to get combining class imported from Unicode::Normalize
54 # (i.e. \&Unicode::Normalize::getCombinClass).
55 # This is also used as a HAS_UNICODE_NORMALIZE flag.
59 use constant MinLevel => 1;
60 use constant MaxLevel => 4;
62 # Minimum weights at level 2 and 3, respectively
63 use constant Min2Wt => 0x20;
64 use constant Min3Wt => 0x02;
66 # Shifted weight at 4th level
67 use constant Shift4Wt => 0xFFFF;
69 # Variable weight at 1st level.
70 # This is a negative value but should be regarded as zero on collation.
71 # This is for distinction of variable chars from level 3 ignorable chars.
72 use constant Var1Wt => -1;
75 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
76 # PROBLEM: The Default Unicode Collation Element Table
77 # has weights over 0xFFFF at the 4th level.
78 # The tie-breaking in the variable weights
79 # other than "shift" (as well as "shift-trimmed") is unreliable.
80 use constant VCE_TEMPLATE => 'Cn4';
82 # A sort key: 16-bit weights
83 # See also the PROBLEM on VCE_TEMPLATE above.
84 use constant KEY_TEMPLATE => 'n*';
86 # Level separator in a sort key:
87 # i.e. pack(KEY_TEMPLATE, 0)
88 use constant LEVEL_SEP => "\0\0";
90 # As Unicode code point separator for hash keys.
91 # A joined code point string (denoted by JCPS below)
92 # like "65;768" is used for internal processing
93 # instead of Perl's Unicode string like "\x41\x{300}",
94 # as the native code point is different from the Unicode code point
96 # This character must not be included in any stringified
97 # representation of an integer.
98 use constant CODE_SEP => ';';
100 # boolean values of variable weights
101 use constant NON_VAR => 0; # Non-Variable character
102 use constant VAR => 1; # Variable character
104 # Logical_Order_Exception in PropList.txt
105 # TODO: synchronization with change of PropList.txt.
106 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
108 sub UCA_Version { "9" }
110 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
115 return pack('U*', @_);
119 return unpack('U*', pack('U*').shift);
126 blanked non-ignorable shifted shift-trimmed
130 alternate backwards level normalization rearrange
131 katakana_before_hiragana upper_before_lower
132 overrideHangul overrideCJK preprocess UCA_Version
136 entry entries table maxlength
137 ignoreChar ignoreName undefChar undefName
138 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
139 derivCode normCode rearrangeHash L3_ignorable
142 # The hash key 'ignored' is deleted at v 0.21.
143 # The hash key 'isShift' is deleted at v 0.23.
144 # The hash key 'combining' is deleted at v 0.24.
146 my (%ChangeOK, %ChangeNG);
147 @ChangeOK{ @ChangeOK } = ();
148 @ChangeNG{ @ChangeNG } = ();
154 foreach my $k (keys %hash) {
155 if (exists $ChangeOK{$k}) {
156 $old{$k} = $self->{$k};
157 $self->{$k} = $hash{$k};
159 elsif (exists $ChangeNG{$k}) {
160 croak "change of $k via change() is not allowed!";
164 $self->checkCollator;
165 return wantarray ? %old : $self;
171 croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
172 $level, $key, MinLevel if MinLevel > $level;
173 croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
174 $level, $key, MaxLevel if MaxLevel < $level;
179 _checkLevel($self->{level}, "level");
182 $self->{UCA_Version} == 8 ? \&_derivCE_8 :
183 $self->{UCA_Version} == 9 ? \&_derivCE_9 :
184 croak "Illegal UCA version (passed $self->{UCA_Version}).";
186 $self->{alternate} = lc($self->{alternate});
187 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
188 unless exists $AlternateOK{ $self->{alternate} };
190 if (! defined $self->{backwards}) {
191 $self->{backwardsFlag} = 0;
193 elsif (! ref $self->{backwards}) {
194 _checkLevel($self->{backwards}, "backwards");
195 $self->{backwardsFlag} = 1 << $self->{backwards};
199 $self->{backwardsFlag} = 0;
200 for my $b (@{ $self->{backwards} }) {
201 _checkLevel($b, "backwards");
204 for my $v (sort keys %level) {
205 $self->{backwardsFlag} += 1 << $v;
209 $self->{rearrange} = []
210 if ! defined $self->{rearrange};
211 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
212 if ! ref $self->{rearrange};
214 # keys of $self->{rearrangeHash} are $self->{rearrange}.
215 $self->{rearrangeHash} = undef;
217 if (@{ $self->{rearrange} }) {
218 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
221 $self->{normCode} = undef;
223 if (defined $self->{normalization}) {
224 eval { require Unicode::Normalize };
225 croak "Unicode/Normalize.pm is required to normalize strings: $@"
228 Unicode::Normalize->import();
229 $getCombinClass = \&Unicode::Normalize::getCombinClass
230 if ! $getCombinClass;
233 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
234 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
235 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
236 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
237 croak "$PACKAGE unknown normalization form name: "
238 . $self->{normalization};
246 my $self = bless { @_ }, $class;
248 # If undef is passed explicitly, no file is read.
249 $self->{table} = $KeyFile if ! exists $self->{table};
250 $self->read_table if defined $self->{table};
252 if ($self->{entry}) {
253 $self->parseEntry($_) foreach split /\n/, $self->{entry};
256 $self->{level} ||= MaxLevel;
257 $self->{UCA_Version} ||= UCA_Version();
259 $self->{overrideHangul} = ''
260 if ! exists $self->{overrideHangul};
261 $self->{overrideCJK} = ''
262 if ! exists $self->{overrideCJK};
263 $self->{normalization} = 'D'
264 if ! exists $self->{normalization};
265 $self->{alternate} = $self->{alternateTable} || 'shifted'
266 if ! exists $self->{alternate};
267 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
268 if ! exists $self->{rearrange};
269 $self->{backwards} = $self->{backwardsTable}
270 if ! exists $self->{backwards};
272 $self->checkCollator;
279 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
281 my $filepath = File::Spec->catfile($Path, $file);
282 open my $fk, "<$filepath"
283 or croak "File does not exist at $filepath";
288 if (/^\s*\@version\s*(\S*)/) {
289 $self->{versionTable} ||= $1;
291 elsif (/^\s*\@alternate\s+(\S*)/) {
292 $self->{alternateTable} ||= $1;
294 elsif (/^\s*\@backwards\s+(\S*)/) {
295 push @{ $self->{backwardsTable} }, $1;
297 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
298 push @{ $self->{forwardsTable} }, $1;
300 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
301 push @{ $self->{rearrangeTable} }, _getHexArray($1);
305 $self->parseEntry($_);
312 ## get $line, parse it, and write an entry in $self
318 my($name, $entry, @uv, @key);
320 return if $line !~ /^\s*[0-9A-Fa-f]/;
322 # removes comment and gets name
324 if $line =~ s/[#%]\s*(.*)//;
325 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
328 my($e, $k) = split /;/, $line;
329 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
332 @uv = _getHexArray($e);
335 $entry = join(CODE_SEP, @uv); # in JCPS
337 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
338 my $ele = pack_U(@uv);
340 # regarded as if it were not entried in the table
342 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
344 # replaced as completely ignorable
345 $k = '[.0000.0000.0000.0000]'
346 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
349 # replaced as completely ignorable
350 $k = '[.0000.0000.0000.0000]'
351 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
355 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
356 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
357 my @wt = _getHexArray($arr);
358 push @key, pack(VCE_TEMPLATE, $var, @wt);
359 $is_L3_ignorable = TRUE
360 if $wt[0] + $wt[1] + $wt[2] == 0;
361 # if $arr !~ /[1-9A-Fa-f]/; NG
362 # Conformance Test shows L3-ignorable is completely ignorable.
365 $self->{entries}{$entry} = \@key;
367 $self->{L3_ignorable}{$uv[0]} = TRUE
368 if @uv == 1 && $is_L3_ignorable;
370 # Contraction is to be considered in the range of this maxlength.
371 $self->{maxlength}{$uv[0]} = scalar @uv
377 ## arrayref[weights] = altCE(VCE)
382 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
384 $self->{alternate} eq 'blanked' ?
385 $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
386 $self->{alternate} eq 'non-ignorable' ?
388 $self->{alternate} eq 'shifted' ?
389 $var ? [Var1Wt, 0, 0, $wt[0] ]
390 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
391 $self->{alternate} eq 'shift-trimmed' ?
392 $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
393 croak "$PACKAGE unknown alternate name: $self->{alternate}";
399 $self->visualizeSortKey($self->getSortKey(@_));
405 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
407 if ($self->{UCA_Version} <= 8) {
408 $view =~ s/ ?0000 ?/|/g;
410 $view =~ s/\b0000\b/|/g;
417 ## arrayref of JCPS = splitCE(string to be collated)
418 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
425 my $code = $self->{preprocess};
426 my $norm = $self->{normCode};
427 my $ent = $self->{entries};
428 my $max = $self->{maxlength};
429 my $reH = $self->{rearrangeHash};
430 my $ign = $self->{L3_ignorable};
431 my $ver9 = $self->{UCA_Version} > 8;
436 $code and croak "Preprocess breaks character positions. "
437 . "Don't use with index(), match(), etc.";
438 $norm and croak "Normalization breaks character positions. "
439 . "Don't use with index(), match(), etc.";
444 $str = &$code($str) if ref $code;
445 $str = &$norm($str) if ref $norm;
448 # get array of Unicode code point of string.
449 my @src = unpack_U($str);
452 # Character positions are not kept if rearranged,
453 # then neglected if $wLen is true.
454 if ($reH && ! $wLen) {
455 for (my $i = 0; $i < @src; $i++) {
456 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
457 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
464 # To remove a character marked as a completely ignorable.
465 for (my $i = 0; $i < @src; $i++) {
466 $src[$i] = undef if $ign->{ $src[$i] };
470 for (my $i = 0; $i < @src; $i++) {
471 next if _isNonCharacter($src[$i]);
476 if ($max->{$ce}) { # contract
479 for (my $p = $i + 1; $p < @src; $p++) {
480 next if ! defined $src[$p];
481 $temp_ce .= CODE_SEP . $src[$p];
482 if ($ent->{$temp_ce}) {
489 # with Combining Char (UTS#10, 4.2.1).
490 # requires Unicode::Normalize.
491 # Not be $wLen, as not croaked due to $norm.
492 if ($getCombinClass) {
493 for (my $p = $i + 1; $p < @src; $p++) {
494 next if ! defined $src[$p];
495 last unless $getCombinClass->($src[$p]);
496 my $tail = CODE_SEP . $src[$p];
497 if ($ent->{$ce.$tail}) {
505 for (my $p = $i + 1; $p < @src; $p++) {
506 last if defined $src[$p];
511 push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
518 ## list of arrayrefs of weights = getWt(JCPS)
524 my $ent = $self->{entries};
525 my $cjk = $self->{overrideCJK};
526 my $hang = $self->{overrideHangul};
527 my $der = $self->{derivCode};
529 return if !defined $ce;
530 return map($self->altCE($_), @{ $ent->{$ce} })
533 # CE must not be a contraction, then it's a code point.
536 if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
537 return map $self->altCE($_),
539 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u))
542 $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
546 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
547 0x4E00 <= $u && $u <= 0x9FA5 ||
548 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
549 return map $self->altCE($_),
551 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
552 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
553 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
557 return map $self->altCE($_), $der->($u);
563 ## string sortkey = getSortKey(string arg)
568 my $lev = $self->{level};
569 my $rCE = $self->splitCE(shift); # get an arrayref of JCPS
570 my $ver9 = $self->{UCA_Version} > 8;
571 my $v2i = $self->{alternate} ne 'non-ignorable';
574 my (@buf, $last_is_variable);
576 foreach my $wt (map $self->getWt($_), @$rCE) {
578 if ($wt->[0] == 0) { # ignorable
579 next if $last_is_variable;
581 $last_is_variable = ($wt->[0] == Var1Wt);
588 my @ret = ([],[],[],[]);
589 foreach my $v (0..$lev-1) {
590 foreach my $b (@buf) {
591 push @{ $ret[$v] }, $b->[$v]
596 # modification of tertiary weights
597 if ($self->{upper_before_lower}) {
598 foreach (@{ $ret[2] }) {
599 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
600 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
601 elsif ($_ == 0x1C) { $_ += 1 } # square upper
602 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
605 if ($self->{katakana_before_hiragana}) {
606 foreach (@{ $ret[2] }) {
607 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
608 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
612 if ($self->{backwardsFlag}) {
613 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
614 if ($self->{backwardsFlag} & (1 << $v)) {
615 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
620 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
625 ## int compare = cmp(string a, string b)
627 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
628 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
629 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
630 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
631 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
632 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
633 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
636 ## list[strings] sorted = sort(list[strings] arg)
642 sort{ $a->[0] cmp $b->[0] }
643 map [ $obj->getSortKey($_), $_ ], @_;
650 (0x4E00 <= $u && $u <= 0x9FA5)
652 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
656 my $aaaa = $base + ($u >> 15);
657 my $bbbb = ($u & 0x7FFF) | 0x8000;
659 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
660 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
665 my $aaaa = 0xFF80 + ($code >> 15);
666 my $bbbb = ($code & 0x7FFF) | 0x8000;
668 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
669 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
673 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
675 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
678 # $code *must* be in Hangul syllable.
679 # Check it before you enter here.
683 my $SIndex = $code - 0xAC00;
684 my $LIndex = int( $SIndex / 588);
685 my $VIndex = int(($SIndex % 588) / 28);
686 my $TIndex = $SIndex % 28;
690 $TIndex ? (0x11A7 + $TIndex) : (),
694 sub _isNonCharacter {
696 return ! defined $code # removed
697 || ($code < 0 || 0x10FFFF < $code) # out of range
698 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
699 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
700 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
706 ## bool _nonIgnorAtLevel(arrayref weights, int level)
708 sub _nonIgnorAtLevel($$)
711 return if ! defined $wt;
713 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
718 ## arrayref of arrayref[weights] source,
719 ## arrayref of arrayref[weights] substr,
721 ## * comparison of graphemes vs graphemes.
722 ## @$source >= @$substr must be true (check it before call this);
730 for my $g (0..@$substr-1){
731 # Do the $g'th graphemes have the same number of AV weigths?
732 return if @{ $source->[$g] } != @{ $substr->[$g] };
734 for my $w (0..@{ $substr->[$g] }-1) {
735 for my $v (0..$lev-1) {
736 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
744 ## (int position, int length)
745 ## int position = index(string, substring, position, [undoc'ed grobal])
747 ## With "grobal" (only for the list context),
748 ## returns list of arrayref[position, length].
754 my $len = length($str);
755 my $subCE = $self->splitCE(shift);
756 my $pos = @_ ? shift : 0;
757 $pos = 0 if $pos < 0;
760 my $lev = $self->{level};
761 my $ver9 = $self->{UCA_Version} > 8;
762 my $v2i = $self->{alternate} ne 'non-ignorable';
765 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
767 ? map([$_, 0], $temp..$len)
768 : wantarray ? ($temp,0) : $temp;
771 return wantarray ? () : NOMATCHPOS;
773 my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
775 return wantarray ? () : NOMATCHPOS;
777 my $last_is_variable;
778 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
780 $last_is_variable = FALSE;
781 for my $wt (map $self->getWt($_), @$subCE) {
782 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
786 $to_be_pushed = FALSE if $last_is_variable;
788 $last_is_variable = ($wt->[0] == Var1Wt);
792 if (@subWt && $wt->[0] == 0) {
793 push @{ $subWt[-1] }, $wt if $to_be_pushed;
795 $wt->[0] = 0 if $wt->[0] == Var1Wt;
796 push @subWt, [ $wt ];
801 my $end = @$strCE - 1;
803 $last_is_variable = FALSE;
805 for (my $i = 0; $i <= $end; ) { # no $i++
809 while ($i <= $end && $found_base == 0) {
810 for my $wt ($self->getWt($strCE->[$i][0])) {
811 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
815 $to_be_pushed = FALSE if $last_is_variable;
817 $last_is_variable = ($wt->[0] == Var1Wt);
821 if (@strWt && $wt->[0] == 0) {
822 push @{ $strWt[-1] }, $wt if $to_be_pushed;
823 $finPos[-1] = $strCE->[$i][2];
824 } elsif ($to_be_pushed) {
825 $wt->[0] = 0 if $wt->[0] == Var1Wt;
826 push @strWt, [ $wt ];
827 push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
828 $finPos[-1] = NOMATCHPOS if $found_base;
829 push @finPos, $strCE->[$i][2];
838 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
839 if ($iniPos[0] != NOMATCHPOS &&
840 $finPos[$#subWt] != NOMATCHPOS &&
841 _eqArray(\@strWt, \@subWt, $lev)) {
842 my $temp = $iniPos[0] + $pos;
845 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
846 splice @strWt, 0, $#subWt;
847 splice @iniPos, 0, $#subWt;
848 splice @finPos, 0, $#subWt;
852 ? ($temp, $finPos[$#subWt] - $iniPos[0])
864 : wantarray ? () : NOMATCHPOS;
868 ## scalarref to matching part = match(string, substring)
873 if (my($pos,$len) = $self->index($_[0], $_[1])) {
874 my $temp = substr($_[0], $pos, $len);
875 return wantarray ? $temp : \$temp;
876 # An lvalue ref \substr should be avoided,
877 # since its value is affected by modification of its referent.
885 ## arrayref matching parts = gmatch(string, substring)
892 return map substr($str, $_->[0], $_->[1]),
893 $self->index($str, $sub, 0, 'g');
897 ## bool subst'ed = subst(string, substring, replace)
902 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
904 if (my($pos,$len) = $self->index($_[0], $_[1])) {
906 my $mat = substr($_[0], $pos, $len);
907 substr($_[0], $pos, $len, $code->($mat));
909 substr($_[0], $pos, $len, $_[2]);
919 ## int count = gsubst(string, substring, replace)
924 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
927 # Replacement is carried out from the end, then use reverse.
928 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
930 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
931 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
933 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
945 Unicode::Collate - Unicode Collation Algorithm
949 use Unicode::Collate;
952 $Collator = Unicode::Collate->new(%tailoring);
955 @sorted = $Collator->sort(@not_sorted);
958 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
962 This module is an implementation
963 of Unicode Technical Standard #10 (UTS #10)
964 "Unicode Collation Algorithm."
966 =head2 Constructor and Tailoring
968 The C<new> method returns a collator object.
970 $Collator = Unicode::Collate->new(
971 UCA_Version => $UCA_Version,
972 alternate => $alternate,
973 backwards => $levelNumber, # or \@levelNumbers
975 normalization => $normalization_form,
976 ignoreName => qr/$ignoreName/,
977 ignoreChar => qr/$ignoreChar/,
978 katakana_before_hiragana => $bool,
979 level => $collationLevel,
980 overrideCJK => \&overrideCJK,
981 overrideHangul => \&overrideHangul,
982 preprocess => \&preprocess,
983 rearrange => \@charList,
985 undefName => qr/$undefName/,
986 undefChar => qr/$undefChar/,
987 upper_before_lower => $bool,
989 # if %tailoring is false (i.e. empty),
990 # $Collator should do the default collation.
996 If the version number of the older UCA is given,
997 the older behavior of that version is emulated on collating.
998 If omitted, the return value of C<UCA_Version()> is used.
1000 The supported version: 8 or 9.
1002 B<This parameter may be removed in the future version,
1003 as switching the algorithm would affect the performance.>
1007 -- see 3.2.2 Variable Weighting, UTS #10.
1009 (the title in UCA version 8: Alternate Weighting)
1011 This key allows to alternate weighting for variable collation elements,
1012 which are marked with an ASTERISK in the table
1013 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1015 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1017 These names are case-insensitive.
1018 By default (if specification is omitted), 'shifted' is adopted.
1020 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1021 considered at the 4th level.
1023 'Non-ignorable' Variable elements are not reset to ignorable.
1025 'Shifted' Variable elements are made ignorable at levels 1 through 3
1026 their level 4 weight is replaced by the old level 1 weight.
1027 Level 4 weight for Non-Variable elements is 0xFFFF.
1029 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1034 -- see 3.1.2 French Accents, UTS #10.
1036 backwards => $levelNumber or \@levelNumbers
1038 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1039 If omitted, forwards at all the levels.
1043 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1045 Overrides a default order or defines additional collation elements
1047 entry => <<'ENTRIES', # use the UCA file format
1048 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
1049 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
1050 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1053 B<NOTE:> The code point in the UCA file format (before C<';'>)
1054 B<must> be a Unicode code point, but not a native code point.
1055 So C<0063> must always denote C<U+0063>,
1056 but not a character of C<"\x63">.
1062 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1064 Makes the entry in the table completely ignorable;
1065 i.e. as if the weights were zero at all level.
1067 E.g. when 'a' and 'e' are ignorable,
1068 'element' is equal to 'lament' (or 'lmnt').
1072 -- see 4.3 Form a sort key for each string, UTS #10.
1074 Set the maximum level.
1075 Any higher levels than the specified one are ignored.
1077 Level 1: alphabetic ordering
1078 Level 2: diacritic ordering
1079 Level 3: case ordering
1080 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1084 If omitted, the maximum is the 4th.
1088 -- see 4.1 Normalize each input string, UTS #10.
1090 If specified, strings are normalized before preparation of sort keys
1091 (the normalization is executed after preprocess).
1093 As a form name, one of the following names must be used.
1095 'C' or 'NFC' for Normalization Form C
1096 'D' or 'NFD' for Normalization Form D
1097 'KC' or 'NFKC' for Normalization Form KC
1098 'KD' or 'NFKD' for Normalization Form KD
1100 If omitted, the string is put into Normalization Form D.
1102 If C<undef> is passed explicitly as the value for this key,
1103 any normalization is not carried out (this may make tailoring easier
1104 if any normalization is not desired).
1110 -- see 7.1 Derived Collation Elements, UTS #10.
1112 By default, mapping of CJK Unified Ideographs
1113 uses the Unicode codepoint order.
1114 But the mapping of CJK Unified Ideographs may be overrided.
1116 ex. CJK Unified Ideographs in the JIS code point order.
1118 overrideCJK => sub {
1119 my $u = shift; # get a Unicode codepoint
1120 my $b = pack('n', $u); # to UTF-16BE
1121 my $s = your_unicode_to_sjis_converter($b); # convert
1122 my $n = unpack('n', $s); # convert sjis to short
1123 [ $n, 0x20, 0x2, $u ]; # return the collation element
1126 ex. ignores all CJK Unified Ideographs.
1128 overrideCJK => sub {()}, # CODEREF returning empty list
1130 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1131 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1133 If C<undef> is passed explicitly as the value for this key,
1134 weights for CJK Unified Ideographs are treated as undefined.
1135 But assignment of weight for CJK Unified Ideographs
1136 in table or L<entry> is still valid.
1138 =item overrideHangul
1140 -- see 7.1 Derived Collation Elements, UTS #10.
1142 By default, Hangul Syllables are decomposed into Hangul Jamo.
1143 But the mapping of Hangul Syllables may be overrided.
1145 This tag works like L<overrideCJK>, so see there for examples.
1147 If you want to override the mapping of Hangul Syllables,
1148 the Normalization Forms D and KD are not appropriate
1149 (they will be decomposed before overriding).
1151 If C<undef> is passed explicitly as the value for this key,
1152 weight for Hangul Syllables is treated as undefined
1153 without decomposition into Hangul Jamo.
1154 But definition of weight for Hangul Syllables
1155 in table or L<entry> is still valid.
1159 -- see 5.1 Preprocessing, UTS #10.
1161 If specified, the coderef is used to preprocess
1162 before the formation of sort keys.
1164 ex. dropping English articles, such as "a" or "the".
1165 Then, "the pen" is before "a pencil".
1169 $str =~ s/\b(?:an?|the)\s+//gi;
1175 -- see 3.1.3 Rearrangement, UTS #10.
1177 Characters that are not coded in logical order and to be rearranged.
1180 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1182 If you want to disallow any rearrangement,
1183 pass C<undef> or C<[]> (a reference to an empty list)
1184 as the value for this key.
1186 B<According to the version 9 of UCA, this parameter shall not be used;
1187 but it is not warned at present.>
1191 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1193 You can use another element table if desired.
1194 The table file must be put into a directory
1195 where F<Unicode/Collate.pm> is installed.
1196 E.g. in F<perl/lib/Unicode/Collate> directory
1197 when you have F<perl/lib/Unicode/Collate.pm>.
1199 By default, the filename F<"allkeys.txt"> is used.
1201 If C<undef> is passed explicitly as the value for this key,
1202 no file is read (but you can define collation elements via L<entry>).
1204 A typical way to define a collation element table
1205 without any file of table:
1207 $onlyABC = Unicode::Collate->new(
1209 entry => << 'ENTRIES',
1210 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1211 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1212 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1213 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1214 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1215 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1223 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1225 Undefines the collation element as if it were unassigned in the table.
1226 This reduces the size of the table.
1227 If an unassigned character appears in the string to be collated,
1228 the sort key is made from its codepoint
1229 as a single-character collation element,
1230 as it is greater than any other assigned collation elements
1231 (in the codepoint order among the unassigned characters).
1232 But, it'd be better to ignore characters
1233 unfamiliar to you and maybe never used.
1235 =item katakana_before_hiragana
1237 =item upper_before_lower
1239 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1241 By default, lowercase is before uppercase
1242 and hiragana is before katakana.
1244 If the tag is made true, this is reversed.
1246 B<NOTE>: These tags simplemindedly assume
1247 any lowercase/uppercase or hiragana/katakana distinctions
1248 must occur in level 3, and their weights at level 3
1249 must be same as those mentioned in 7.3.1, UTS #10.
1250 If you define your collation elements which violate this requirement,
1251 these tags don't work validly.
1255 =head2 Methods for Collation
1259 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1261 Sorts a list of strings.
1263 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1265 Returns 1 (when C<$a> is greater than C<$b>)
1266 or 0 (when C<$a> is equal to C<$b>)
1267 or -1 (when C<$a> is lesser than C<$b>).
1269 =item C<$result = $Collator-E<gt>eq($a, $b)>
1271 =item C<$result = $Collator-E<gt>ne($a, $b)>
1273 =item C<$result = $Collator-E<gt>lt($a, $b)>
1275 =item C<$result = $Collator-E<gt>le($a, $b)>
1277 =item C<$result = $Collator-E<gt>gt($a, $b)>
1279 =item C<$result = $Collator-E<gt>ge($a, $b)>
1281 They works like the same name operators as theirs.
1283 eq : whether $a is equal to $b.
1284 ne : whether $a is not equal to $b.
1285 lt : whether $a is lesser than $b.
1286 le : whether $a is lesser than $b or equal to $b.
1287 gt : whether $a is greater than $b.
1288 ge : whether $a is greater than $b or equal to $b.
1290 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1292 -- see 4.3 Form a sort key for each string, UTS #10.
1296 You compare the sort keys using a binary comparison
1297 and get the result of the comparison of the strings using UCA.
1299 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1303 $Collator->cmp($a, $b)
1305 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1307 use Unicode::Collate;
1308 my $c = Unicode::Collate->new();
1309 print $c->viewSortKey("Perl"),"\n";
1312 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1313 # Level 1 Level 2 Level 3 Level 4
1315 (If C<UCA_Version> is 8, the output is slightly different.)
1319 =head2 Methods for Searching
1321 B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1322 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1323 C<subst>, C<gsubst>) is croaked,
1324 as the position and the length might differ
1325 from those on the specified string.
1326 (And the C<rearrange> tag is neglected.)
1328 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1329 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1330 but they are not aware of any pattern, but only a literal substring.
1334 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1336 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1338 If C<$substring> matches a part of C<$string>, returns
1339 the position of the first occurrence of the matching part in scalar context;
1340 in list context, returns a two-element list of
1341 the position and the length of the matching part.
1343 If C<$substring> does not match any part of C<$string>,
1344 returns C<-1> in scalar context and
1345 an empty list in list context.
1349 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1350 # (normalization => undef) is REQUIRED.
1351 my $str = "Ich muß studieren Perl.";
1354 if (my($pos,$len) = $Collator->index($str, $sub)) {
1355 $match = substr($str, $pos, $len);
1358 and get C<"muß"> in C<$match> since C<"muß">
1359 is primary equal to C<"MÜSS">.
1361 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1363 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1365 If C<$substring> matches a part of C<$string>, in scalar context, returns
1366 B<a reference to> the first occurrence of the matching part
1367 (C<$match_ref> is always true if matches,
1368 since every reference is B<true>);
1369 in list context, returns the first occurrence of the matching part.
1371 If C<$substring> does not match any part of C<$string>,
1372 returns C<undef> in scalar context and
1373 an empty list in list context.
1377 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1378 print "matches [$$match_ref].\n";
1380 print "doesn't match.\n";
1385 if (($match) = $Collator->match($str, $sub)) { # list context
1386 print "matches [$match].\n";
1388 print "doesn't match.\n";
1391 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1393 If C<$substring> matches a part of C<$string>, returns
1394 all the matching parts (or matching count in scalar context).
1396 If C<$substring> does not match any part of C<$string>,
1397 returns an empty list.
1399 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1401 If C<$substring> matches a part of C<$string>,
1402 the first occurrence of the matching part is replaced by C<$replacement>
1403 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1405 C<$replacement> can be a C<CODEREF>,
1406 taking the matching part as an argument,
1407 and returning a string to replace the matching part
1408 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1410 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1412 If C<$substring> matches a part of C<$string>,
1413 all the occurrences of the matching part is replaced by C<$replacement>
1414 (C<$string> is modified) and return C<$count>.
1416 C<$replacement> can be a C<CODEREF>,
1417 taking the matching part as an argument,
1418 and returning a string to replace the matching part
1419 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1423 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1424 # (normalization => undef) is REQUIRED.
1425 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1426 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1428 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1429 # i.e., all the camels are made bold-faced.
1433 =head2 Other Methods
1437 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1439 Change the value of specified keys and returns the changed part.
1441 $Collator = Unicode::Collate->new(level => 4);
1443 $Collator->eq("perl", "PERL"); # false
1445 %old = $Collator->change(level => 2); # returns (level => 4).
1447 $Collator->eq("perl", "PERL"); # true
1449 $Collator->change(%old); # returns (level => 2).
1451 $Collator->eq("perl", "PERL"); # false
1453 Not all C<(key,value)>s are allowed to be changed.
1454 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1456 In the scalar context, returns the modified collator
1457 (but it is B<not> a clone from the original).
1459 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1461 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1463 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1467 Returns the version number of UTS #10 this module consults.
1469 =item Base_Unicode_Version
1471 Returns the version number of the Unicode Standard
1472 this module is based on.
1482 Use of the C<normalization> parameter requires
1483 the B<Unicode::Normalize> module.
1485 If you need not it (say, in the case when you need not
1486 handle any combining characters),
1487 assign C<normalization =E<gt> undef> explicitly.
1489 -- see 6.5 Avoiding Normalization, UTS #10.
1491 =head2 Conformance Test
1493 The Conformance Test for the UCA is provided
1494 in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1495 and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1497 For F<CollationTest_SHIFTED.txt>,
1498 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1499 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1500 C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1502 B<Unicode::Normalize is required to try The Conformance Test.>
1506 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1508 http://homepage1.nifty.com/nomenclator/perl/
1510 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
1512 This library is free software; you can redistribute it
1513 and/or modify it under the same terms as Perl itself.
1519 =item http://www.unicode.org/reports/tr10/
1521 Unicode Collation Algorithm - UTS #10
1523 =item http://www.unicode.org/reports/tr10/allkeys.txt
1525 The Default Unicode Collation Element Table
1527 =item http://www.unicode.org/reports/tr10/CollationTest.html
1528 http://www.unicode.org/reports/tr10/CollationTest.zip
1530 The latest versions of the conformance test for the UCA
1532 =item http://www.unicode.org/reports/tr15/
1534 Unicode Normalization Forms - UAX #15
1536 =item L<Unicode::Normalize>