1 package Unicode::Collate;
5 die "Unicode::Collate not ported to EBCDIC\n";
17 # Supporting on EBCDIC platform is not tested.
19 our $IsEBCDIC = ord("A") != 0x41;
21 our $VERSION = '0.23';
22 our $PACKAGE = __PACKAGE__;
24 our @ISA = qw(Exporter);
26 our %EXPORT_TAGS = ();
30 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
31 our $KeyFile = "allkeys.txt";
35 eval { require Unicode::UCD };
38 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
40 else { # XXX, Perl 5.6.1
42 foreach my $d (@INC) {
43 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
45 $UNICODE_VERSION = '3.0.1';
53 use constant TRUE => 1;
54 use constant FALSE => "";
55 use constant NOMATCHPOS => -1;
57 # A coderef to get combining class imported from Unicode::Normalize
58 # (i.e. \&Unicode::Normalize::getCombinClass).
59 # This is also used as a HAS_UNICODE_NORMALIZE flag.
62 # Minimum weights at level 2 and 3, respectively
63 use constant Min2 => 0x20;
64 use constant Min3 => 0x02;
66 # Shifted weight at 4th level
67 use constant Shift4 => 0xFFFF;
69 # Variable weight at 1st level.
70 # This is a negative value but should be regarded as zero on collation.
71 # This is for distinction of variable chars from level 3 ignorable chars.
72 use constant Var1 => -1;
75 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
76 # PROBLEM: The Default Unicode Collation Element Table
77 # has weights over 0xFFFF at the 4th level.
78 # The tie-breaking in the variable weights
79 # other than "shift" (as well as "shift-trimmed") is unreliable.
80 use constant VCE_TEMPLATE => 'Cn4';
82 # Unicode encoding of strings to be collated
83 # TODO: 'N*' for UTF-32BE, 'V*' for UTF-32LE.
84 use constant UTF_TEMPLATE => 'U*';
86 # A sort key: 16-bit weights
87 # See also the PROBLEM on VCE_TEMPLATE above.
88 use constant KEY_TEMPLATE => 'n*';
90 # Level separator in a sort key:
91 # i.e. pack(KEY_TEMPLATE, 0)
92 use constant LEVEL_SEP => "\0\0";
94 # As Unicode code point separator for hash keys.
95 # A joined code point string (denoted by JCPS below)
96 # like "65;768" is used for internal processing
97 # instead of Perl's Unicode string like "\x41\x{300}",
98 # as the native code point is different from the Unicode code point
100 # This character must not be included in any stringified
101 # representation of an integer.
102 use constant CODE_SEP => ';';
104 # boolean values of variable weights
105 use constant NON_VAR => 0; # Non-Variable character
106 use constant VAR => 1; # Variable character
108 # Logical_Order_Exception in PropList.txt
109 # TODO: synchronization with change of PropList.txt.
110 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
112 sub UCA_Version { "9" }
114 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
118 blanked non-ignorable shifted shift-trimmed
122 alternate backwards level normalization rearrange
123 katakana_before_hiragana upper_before_lower
124 overrideHangul overrideCJK preprocess UCA_Version
128 entry entries table combining maxlength
129 ignoreChar ignoreName undefChar undefName
130 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
131 derivCode normCode rearrangeHash L3_ignorable
133 # The hash key 'ignored' is deleted at VERSION 0.21.
134 # The hash key 'isShift' are deleted at VERSION 0.23.
136 my (%ChangeOK, %ChangeNG);
137 @ChangeOK{ @ChangeOK } = ();
138 @ChangeNG{ @ChangeNG } = ();
144 foreach my $k (keys %hash) {
145 if (exists $ChangeOK{$k}) {
146 $old{$k} = $self->{$k};
147 $self->{$k} = $hash{$k};
149 elsif (exists $ChangeNG{$k}) {
150 croak "change of $k via change() is not allowed!";
154 $self->checkCollator;
155 return wantarray ? %old : $self;
160 croak "Illegal level lower than 1 (passed $self->{level})."
161 if $self->{level} < 1;
162 croak "A level higher than 4 (passed $self->{level}) is not supported."
163 if 4 < $self->{level};
166 $self->{UCA_Version} == 8 ? \&_derivCE_8 :
167 $self->{UCA_Version} == 9 ? \&_derivCE_9 :
168 croak "Illegal UCA version (passed $self->{UCA_Version}).";
170 $self->{alternate} = lc($self->{alternate});
171 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
172 unless exists $AlternateOK{ $self->{alternate} };
174 $self->{backwards} = []
175 if ! defined $self->{backwards};
176 $self->{backwards} = [ $self->{backwards} ]
177 if ! ref $self->{backwards};
179 $self->{rearrange} = []
180 if ! defined $self->{rearrange};
181 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
182 if ! ref $self->{rearrange};
184 # keys of $self->{rearrangeHash} are $self->{rearrange}.
185 $self->{rearrangeHash} = undef;
187 if (@{ $self->{rearrange} }) {
188 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
191 $self->{normCode} = undef;
193 if (defined $self->{normalization}) {
194 eval { require Unicode::Normalize };
195 croak "Unicode/Normalize.pm is required to normalize strings: $@"
198 Unicode::Normalize->import();
199 $getCombinClass = \&Unicode::Normalize::getCombinClass
200 if ! $getCombinClass;
203 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
204 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
205 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
206 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
207 croak "$PACKAGE unknown normalization form name: "
208 . $self->{normalization};
216 my $self = bless { @_ }, $class;
218 # If undef is passed explicitly, no file is read.
219 $self->{table} = $KeyFile if ! exists $self->{table};
220 $self->read_table if defined $self->{table};
222 if ($self->{entry}) {
223 $self->parseEntry($_) foreach split /\n/, $self->{entry};
226 $self->{level} ||= 4;
227 $self->{UCA_Version} ||= UCA_Version();
229 $self->{overrideHangul} = ''
230 if ! exists $self->{overrideHangul};
231 $self->{overrideCJK} = ''
232 if ! exists $self->{overrideCJK};
233 $self->{normalization} = 'D'
234 if ! exists $self->{normalization};
235 $self->{alternate} = $self->{alternateTable} || 'shifted'
236 if ! exists $self->{alternate};
237 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
238 if ! exists $self->{rearrange};
239 $self->{backwards} = $self->{backwardsTable}
240 if ! exists $self->{backwards};
242 $self->checkCollator;
249 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
251 my $filepath = File::Spec->catfile($Path, $file);
252 open my $fk, "<$filepath"
253 or croak "File does not exist at $filepath";
258 if (/^\s*\@version\s*(\S*)/) {
259 $self->{versionTable} ||= $1;
261 elsif (/^\s*\@alternate\s+(\S*)/) {
262 $self->{alternateTable} ||= $1;
264 elsif (/^\s*\@backwards\s+(\S*)/) {
265 push @{ $self->{backwardsTable} }, $1;
267 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
268 push @{ $self->{forwardsTable} }, $1;
270 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
271 push @{ $self->{rearrangeTable} }, _getHexArray($1);
275 $self->parseEntry($_);
282 ## get $line, parse it, and write an entry in $self
288 my($name, $entry, @uv, @key);
290 return if $line !~ /^\s*[0-9A-Fa-f]/;
292 # removes comment and gets name
294 if $line =~ s/[#%]\s*(.*)//;
295 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
298 my($e, $k) = split /;/, $line;
299 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
302 @uv = _getHexArray($e);
305 $entry = join(CODE_SEP, @uv); # in JCPS
307 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
308 # Do not use UTF_TEMPLATE; Perl' RE is only for utf8.
310 ? pack('U*', map utf8::unicode_to_native($_), @uv)
313 # regarded as if it were not entried in the table
315 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
317 # replaced as completely ignorable
318 $k = '[.0000.0000.0000.0000]'
319 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
322 # replaced as completely ignorable
323 $k = '[.0000.0000.0000.0000]'
324 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
326 my $combining = TRUE; # primary = 0, secondary != 0;
329 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
330 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
331 my @wt = _getHexArray($arr);
332 push @key, pack(VCE_TEMPLATE, $var, @wt);
334 unless $wt[0] == 0 && $wt[1] != 0;
335 $is_L3_ignorable = TRUE
336 if $wt[0] + $wt[1] + $wt[2] == 0;
337 # if $arr !~ /[1-9A-Fa-f]/; NG
338 # Conformance Test shows L3-ignorable is completely ignorable.
341 $self->{entries}{$entry} = \@key;
343 $self->{combining}{$entry} = TRUE
346 # The key is a string representing a numeral code point.
348 $self->{L3_ignorable}{$uv[0]} = TRUE
349 if @uv == 1 && $is_L3_ignorable;
351 # Contraction is to be considered in the range of this maxlength.
352 $self->{maxlength}{$uv[0]} = scalar @uv
357 ## arrayref[weights] = altCE(bool variable?, list[num] weights)
362 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
364 $self->{alternate} eq 'blanked' ?
365 $var ? [Var1, 0, 0, $wt[3]] : \@wt :
366 $self->{alternate} eq 'non-ignorable' ?
368 $self->{alternate} eq 'shifted' ?
369 $var ? [Var1, 0, 0, $wt[0] ]
370 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4 : 0 ] :
371 $self->{alternate} eq 'shift-trimmed' ?
372 $var ? [Var1, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
373 croak "$PACKAGE unknown alternate name: $self->{alternate}";
379 my $ver = $self->{UCA_Version};
381 my $key = $self->getSortKey(@_);
382 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, $key);
385 $view =~ s/ ?0000 ?/|/g;
387 $view =~ s/\b0000\b/|/g;
394 ## arrayref of JCPS = splitCE(string to be collated)
395 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
402 my $code = $self->{preprocess};
403 my $norm = $self->{normCode};
404 my $ent = $self->{entries};
405 my $max = $self->{maxlength};
406 my $reH = $self->{rearrangeHash};
407 my $ign = $self->{L3_ignorable};
408 my $ver9 = $self->{UCA_Version} > 8;
413 $code and croak "Preprocess breaks character positions. "
414 . "Don't use with index(), match(), etc.";
415 $norm and croak "Normalization breaks character positions. "
416 . "Don't use with index(), match(), etc.";
421 $str = &$code($str) if ref $code;
422 $str = &$norm($str) if ref $norm;
425 # get array of Unicode code point of string.
427 ? map(utf8::native_to_unicode($_), unpack UTF_TEMPLATE, $str)
428 : unpack(UTF_TEMPLATE, $str);
431 # Character positions are not kept if rearranged,
432 # then neglected if $wLen is true.
433 if ($reH && ! $wLen) {
434 for (my $i = 0; $i < @src; $i++) {
435 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
436 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
443 # To remove a character marked as a completely ignorable.
444 for (my $i = 0; $i < @src; $i++) {
445 $src[$i] = undef if $ign->{ $src[$i] };
449 for (my $i = 0; $i < @src; $i++) {
450 next if _isNonCharacter($src[$i]);
455 if ($max->{$ce}) { # contract
458 for (my $p = $i + 1; $p < @src; $p++) {
459 next if ! defined $src[$p];
460 $temp_ce .= CODE_SEP . $src[$p];
461 if ($ent->{$temp_ce}) {
468 # with Combining Char (UTS#10, 4.2.1).
469 # requires Unicode::Normalize.
470 # Not be $wLen, as not croaked due to $norm.
471 if ($getCombinClass) {
472 for (my $p = $i + 1; $p < @src; $p++) {
473 next if ! defined $src[$p];
474 last unless $getCombinClass->($src[$p]);
475 my $tail = CODE_SEP . $src[$p];
476 if ($ent->{$ce.$tail}) {
484 for (my $p = $i + 1; $p < @src; $p++) {
485 last if defined $src[$p];
490 push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
497 ## list of arrayrefs of weights = getWt(JCPS)
503 my $ent = $self->{entries};
504 my $cjk = $self->{overrideCJK};
505 my $hang = $self->{overrideHangul};
506 my $der = $self->{derivCode};
508 return if !defined $ce;
509 return map($self->altCE($_), @{ $ent->{$ce} })
512 # CE must not be a contraction, then it's a code point.
515 if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
516 return map $self->altCE($_),
518 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u))
521 $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
525 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
526 0x4E00 <= $u && $u <= 0x9FA5 ||
527 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
528 return map $self->altCE($_),
530 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
531 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
532 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2, Min3, $u)
536 return map $self->altCE($_), $der->($u);
542 ## string sortkey = getSortKey(string arg)
547 my $lev = $self->{level};
548 my $rCE = $self->splitCE(shift); # get an arrayref of JCPS
549 my $ver9 = $self->{UCA_Version} > 8;
550 my $v2i = $self->{alternate} ne 'non-ignorable';
553 my (@buf, $last_is_variable);
555 foreach my $wt (map $self->getWt($_), @$rCE) {
557 if ($wt->[0] == 0) { # ignorable
558 next if $last_is_variable;
560 $last_is_variable = ($wt->[0] == Var1);
567 my @ret = ([],[],[],[]);
568 foreach my $v (0..$lev-1) {
569 foreach my $b (@buf) {
570 push @{ $ret[$v] }, $b->[$v]
574 foreach (@{ $self->{backwards} }) {
576 @{ $ret[$v] } = reverse @{ $ret[$v] };
579 # modification of tertiary weights
580 if ($self->{upper_before_lower}) {
581 foreach (@{ $ret[2] }) {
582 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
583 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
584 elsif ($_ == 0x1C) { $_ += 1 } # square upper
585 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
588 if ($self->{katakana_before_hiragana}) {
589 foreach (@{ $ret[2] }) {
590 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
591 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
594 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
599 ## int compare = cmp(string a, string b)
601 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
602 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
603 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
604 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
605 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
606 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
607 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
610 ## list[strings] sorted = sort(list[strings] arg)
616 sort{ $a->[0] cmp $b->[0] }
617 map [ $obj->getSortKey($_), $_ ], @_;
624 (0x4E00 <= $u && $u <= 0x9FA5)
626 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
630 my $aaaa = $base + ($u >> 15);
631 my $bbbb = ($u & 0x7FFF) | 0x8000;
633 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2, Min3, $u),
634 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
639 my $aaaa = 0xFF80 + ($code >> 15);
640 my $bbbb = ($code & 0x7FFF) | 0x8000;
642 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
643 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
647 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
649 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
652 # $code *must* be in Hangul syllable.
653 # Check it before you enter here.
657 my $SIndex = $code - 0xAC00;
658 my $LIndex = int( $SIndex / 588);
659 my $VIndex = int(($SIndex % 588) / 28);
660 my $TIndex = $SIndex % 28;
664 $TIndex ? (0x11A7 + $TIndex) : (),
668 sub _isNonCharacter {
670 return ! defined $code # removed
671 || ($code < 0 || 0x10FFFF < $code) # out of range
672 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
673 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
674 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
680 ## bool _nonIgnorAtLevel(arrayref weights, int level)
682 sub _nonIgnorAtLevel($$)
685 return if ! defined $wt;
687 return grep($wt->[$_] != 0, 0..$lv-1) ? TRUE : FALSE;
692 ## arrayref of arrayref[weights] source,
693 ## arrayref of arrayref[weights] substr,
695 ## * comparison of graphemes vs graphemes.
696 ## @$source >= @$substr must be true (check it before call this);
704 for my $g (0..@$substr-1){
705 # Do the $g'th graphemes have the same number of AV weigths?
706 return if @{ $source->[$g] } != @{ $substr->[$g] };
708 for my $w (0..@{ $substr->[$g] }-1) {
709 for my $v (0..$lev-1) {
710 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
718 ## (int position, int length)
719 ## int position = index(string, substring, position, [undoc'ed grobal])
721 ## With "grobal" (only for the list context),
722 ## returns list of arrayref[position, length].
728 my $len = length($str);
729 my $subCE = $self->splitCE(shift);
730 my $pos = @_ ? shift : 0;
731 $pos = 0 if $pos < 0;
734 my $comb = $self->{combining};
735 my $lev = $self->{level};
736 my $ver9 = $self->{UCA_Version} > 8;
737 my $v2i = $self->{alternate} ne 'non-ignorable';
740 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
742 ? map([$_, 0], $temp..$len)
743 : wantarray ? ($temp,0) : $temp;
746 return wantarray ? () : NOMATCHPOS;
748 my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
750 return wantarray ? () : NOMATCHPOS;
752 my $last_is_variable;
753 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
755 $last_is_variable = FALSE;
756 for my $wt (map $self->getWt($_), @$subCE) {
757 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
761 $to_be_pushed = FALSE if $last_is_variable;
763 $last_is_variable = ($wt->[0] == Var1);
767 if (@subWt && $wt->[0] == 0) {
768 push @{ $subWt[-1] }, $wt if $to_be_pushed;
770 $wt->[0] = 0 if $wt->[0] == Var1;
771 push @subWt, [ $wt ];
776 my $end = @$strCE - 1;
778 $last_is_variable = FALSE;
780 for (my $i = 0; $i <= $end; ) { # no $i++
784 while ($i <= $end && $found_base == 0) {
785 for my $wt ($self->getWt($strCE->[$i][0])) {
786 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
790 $to_be_pushed = FALSE if $last_is_variable;
792 $last_is_variable = ($wt->[0] == Var1);
796 if (@strWt && $wt->[0] == 0) {
797 push @{ $strWt[-1] }, $wt if $to_be_pushed;
798 $finPos[-1] = $strCE->[$i][2];
799 } elsif ($to_be_pushed) {
800 $wt->[0] = 0 if $wt->[0] == Var1;
801 push @strWt, [ $wt ];
802 push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
803 $finPos[-1] = NOMATCHPOS if $found_base;
804 push @finPos, $strCE->[$i][2];
813 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
814 if ($iniPos[0] != NOMATCHPOS &&
815 $finPos[$#subWt] != NOMATCHPOS &&
816 _eqArray(\@strWt, \@subWt, $lev)) {
817 my $temp = $iniPos[0] + $pos;
820 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
821 splice @strWt, 0, $#subWt;
822 splice @iniPos, 0, $#subWt;
823 splice @finPos, 0, $#subWt;
827 ? ($temp, $finPos[$#subWt] - $iniPos[0])
839 : wantarray ? () : NOMATCHPOS;
843 ## scalarref to matching part = match(string, substring)
848 if (my($pos,$len) = $self->index($_[0], $_[1])) {
849 my $temp = substr($_[0], $pos, $len);
850 return wantarray ? $temp : \$temp;
851 # An lvalue ref \substr should be avoided,
852 # since its value is affected by modification of its referent.
860 ## arrayref matching parts = gmatch(string, substring)
867 return map substr($str, $_->[0], $_->[1]),
868 $self->index($str, $sub, 0, 'g');
872 ## bool subst'ed = subst(string, substring, replace)
877 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
879 if (my($pos,$len) = $self->index($_[0], $_[1])) {
881 my $mat = substr($_[0], $pos, $len);
882 substr($_[0], $pos, $len, $code->($mat));
884 substr($_[0], $pos, $len, $_[2]);
894 ## int count = gsubst(string, substring, replace)
899 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
902 # Replacement is carried out from the end, then use reverse.
903 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
905 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
906 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
908 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
920 Unicode::Collate - Unicode Collation Algorithm
924 use Unicode::Collate;
927 $Collator = Unicode::Collate->new(%tailoring);
930 @sorted = $Collator->sort(@not_sorted);
933 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
937 This module is an implementation
938 of Unicode Technical Standard #10 (UTS #10)
939 "Unicode Collation Algorithm."
941 =head2 Constructor and Tailoring
943 The C<new> method returns a collator object.
945 $Collator = Unicode::Collate->new(
946 UCA_Version => $UCA_Version,
947 alternate => $alternate,
948 backwards => $levelNumber, # or \@levelNumbers
950 normalization => $normalization_form,
951 ignoreName => qr/$ignoreName/,
952 ignoreChar => qr/$ignoreChar/,
953 katakana_before_hiragana => $bool,
954 level => $collationLevel,
955 overrideCJK => \&overrideCJK,
956 overrideHangul => \&overrideHangul,
957 preprocess => \&preprocess,
958 rearrange => \@charList,
960 undefName => qr/$undefName/,
961 undefChar => qr/$undefChar/,
962 upper_before_lower => $bool,
964 # if %tailoring is false (i.e. empty),
965 # $Collator should do the default collation.
971 If the version number of the older UCA is given,
972 the older behavior of that version is emulated on collating.
973 If omitted, the return value of C<UCA_Version()> is used.
975 The supported version: 8 or 9.
977 B<This parameter may be removed in the future version,
978 as switching the algorithm would affect the performance.>
982 -- see 3.2.2 Variable Weighting, UTS #10.
984 (the title in UCA version 8: Alternate Weighting)
986 This key allows to alternate weighting for variable collation elements,
987 which are marked with an ASTERISK in the table
988 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
990 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
992 These names are case-insensitive.
993 By default (if specification is omitted), 'shifted' is adopted.
995 'Blanked' Variable elements are made ignorable at levels 1 through 3;
996 considered at the 4th level.
998 'Non-ignorable' Variable elements are not reset to ignorable.
1000 'Shifted' Variable elements are made ignorable at levels 1 through 3
1001 their level 4 weight is replaced by the old level 1 weight.
1002 Level 4 weight for Non-Variable elements is 0xFFFF.
1004 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1009 -- see 3.1.2 French Accents, UTS #10.
1011 backwards => $levelNumber or \@levelNumbers
1013 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1014 If omitted, forwards at all the levels.
1018 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1020 Overrides a default order or defines additional collation elements
1022 entry => <<'ENTRIES', # use the UCA file format
1023 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
1024 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
1025 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1028 B<NOTE:> The code point in the UCA file format (before C<';'>)
1029 B<must> be a Unicode code point, but not a native code point.
1030 So C<0063> must always denote C<U+0063>,
1031 but not a character of C<"\x63">.
1037 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1039 Makes the entry in the table completely ignorable;
1040 i.e. as if the weights were zero at all level.
1042 E.g. when 'a' and 'e' are ignorable,
1043 'element' is equal to 'lament' (or 'lmnt').
1047 -- see 4.3 Form a sort key for each string, UTS #10.
1049 Set the maximum level.
1050 Any higher levels than the specified one are ignored.
1052 Level 1: alphabetic ordering
1053 Level 2: diacritic ordering
1054 Level 3: case ordering
1055 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1059 If omitted, the maximum is the 4th.
1063 -- see 4.1 Normalize each input string, UTS #10.
1065 If specified, strings are normalized before preparation of sort keys
1066 (the normalization is executed after preprocess).
1068 As a form name, one of the following names must be used.
1070 'C' or 'NFC' for Normalization Form C
1071 'D' or 'NFD' for Normalization Form D
1072 'KC' or 'NFKC' for Normalization Form KC
1073 'KD' or 'NFKD' for Normalization Form KD
1075 If omitted, the string is put into Normalization Form D.
1077 If C<undef> is passed explicitly as the value for this key,
1078 any normalization is not carried out (this may make tailoring easier
1079 if any normalization is not desired).
1085 -- see 7.1 Derived Collation Elements, UTS #10.
1087 By default, mapping of CJK Unified Ideographs
1088 uses the Unicode codepoint order.
1089 But the mapping of CJK Unified Ideographs may be overrided.
1091 ex. CJK Unified Ideographs in the JIS code point order.
1093 overrideCJK => sub {
1094 my $u = shift; # get a Unicode codepoint
1095 my $b = pack('n', $u); # to UTF-16BE
1096 my $s = your_unicode_to_sjis_converter($b); # convert
1097 my $n = unpack('n', $s); # convert sjis to short
1098 [ $n, 0x20, 0x2, $u ]; # return the collation element
1101 ex. ignores all CJK Unified Ideographs.
1103 overrideCJK => sub {()}, # CODEREF returning empty list
1105 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1106 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1108 If C<undef> is passed explicitly as the value for this key,
1109 weights for CJK Unified Ideographs are treated as undefined.
1110 But assignment of weight for CJK Unified Ideographs
1111 in table or L<entry> is still valid.
1113 =item overrideHangul
1115 -- see 7.1 Derived Collation Elements, UTS #10.
1117 By default, Hangul Syllables are decomposed into Hangul Jamo.
1118 But the mapping of Hangul Syllables may be overrided.
1120 This tag works like L<overrideCJK>, so see there for examples.
1122 If you want to override the mapping of Hangul Syllables,
1123 the Normalization Forms D and KD are not appropriate
1124 (they will be decomposed before overriding).
1126 If C<undef> is passed explicitly as the value for this key,
1127 weight for Hangul Syllables is treated as undefined
1128 without decomposition into Hangul Jamo.
1129 But definition of weight for Hangul Syllables
1130 in table or L<entry> is still valid.
1134 -- see 5.1 Preprocessing, UTS #10.
1136 If specified, the coderef is used to preprocess
1137 before the formation of sort keys.
1139 ex. dropping English articles, such as "a" or "the".
1140 Then, "the pen" is before "a pencil".
1144 $str =~ s/\b(?:an?|the)\s+//gi;
1150 -- see 3.1.3 Rearrangement, UTS #10.
1152 Characters that are not coded in logical order and to be rearranged.
1155 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1157 If you want to disallow any rearrangement,
1158 pass C<undef> or C<[]> (a reference to an empty list)
1159 as the value for this key.
1161 B<According to the version 9 of UCA, this parameter shall not be used;
1162 but it is not warned at present.>
1166 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1168 You can use another element table if desired.
1169 The table file must be in your C<lib/Unicode/Collate> directory.
1171 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
1173 If C<undef> is passed explicitly as the value for this key,
1174 no file is read (but you can define collation elements via L<entry>).
1176 A typical way to define a collation element table
1177 without any file of table:
1179 $onlyABC = Unicode::Collate->new(
1181 entry => << 'ENTRIES',
1182 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1183 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1184 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1185 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1186 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1187 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1195 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1197 Undefines the collation element as if it were unassigned in the table.
1198 This reduces the size of the table.
1199 If an unassigned character appears in the string to be collated,
1200 the sort key is made from its codepoint
1201 as a single-character collation element,
1202 as it is greater than any other assigned collation elements
1203 (in the codepoint order among the unassigned characters).
1204 But, it'd be better to ignore characters
1205 unfamiliar to you and maybe never used.
1207 =item katakana_before_hiragana
1209 =item upper_before_lower
1211 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1213 By default, lowercase is before uppercase
1214 and hiragana is before katakana.
1216 If the tag is made true, this is reversed.
1218 B<NOTE>: These tags simplemindedly assume
1219 any lowercase/uppercase or hiragana/katakana distinctions
1220 should occur in level 3, and their weights at level 3
1221 should be same as those mentioned in 7.3.1, UTS #10.
1222 If you define your collation elements which violates this,
1223 these tags don't work validly.
1227 =head2 Methods for Collation
1231 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1233 Sorts a list of strings.
1235 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1237 Returns 1 (when C<$a> is greater than C<$b>)
1238 or 0 (when C<$a> is equal to C<$b>)
1239 or -1 (when C<$a> is lesser than C<$b>).
1241 =item C<$result = $Collator-E<gt>eq($a, $b)>
1243 =item C<$result = $Collator-E<gt>ne($a, $b)>
1245 =item C<$result = $Collator-E<gt>lt($a, $b)>
1247 =item C<$result = $Collator-E<gt>le($a, $b)>
1249 =item C<$result = $Collator-E<gt>gt($a, $b)>
1251 =item C<$result = $Collator-E<gt>ge($a, $b)>
1253 They works like the same name operators as theirs.
1255 eq : whether $a is equal to $b.
1256 ne : whether $a is not equal to $b.
1257 lt : whether $a is lesser than $b.
1258 le : whether $a is lesser than $b or equal to $b.
1259 gt : whether $a is greater than $b.
1260 ge : whether $a is greater than $b or equal to $b.
1262 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1264 -- see 4.3 Form a sort key for each string, UTS #10.
1268 You compare the sort keys using a binary comparison
1269 and get the result of the comparison of the strings using UCA.
1271 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1275 $Collator->cmp($a, $b)
1277 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1279 use Unicode::Collate;
1280 my $c = Unicode::Collate->new();
1281 print $c->viewSortKey("Perl"),"\n";
1284 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1285 # Level 1 Level 2 Level 3 Level 4
1287 (If C<UCA_Version> is 8, the output is slightly different.)
1291 =head2 Methods for Searching
1293 B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1294 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1295 C<subst>, C<gsubst>) is croaked,
1296 as the position and the length might differ
1297 from those on the specified string.
1298 (And the C<rearrange> tag is neglected.)
1300 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1301 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1302 but they are not aware of any pattern, but only a literal substring.
1306 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1308 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1310 If C<$substring> matches a part of C<$string>, returns
1311 the position of the first occurrence of the matching part in scalar context;
1312 in list context, returns a two-element list of
1313 the position and the length of the matching part.
1315 If C<$substring> does not match any part of C<$string>,
1316 returns C<-1> in scalar context and
1317 an empty list in list context.
1321 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1322 # (normalization => undef) is REQUIRED.
1323 my $str = "Ich muß studieren Perl.";
1326 if (my($pos,$len) = $Collator->index($str, $sub)) {
1327 $match = substr($str, $pos, $len);
1330 and get C<"muß"> in C<$match> since C<"muß">
1331 is primary equal to C<"MÜSS">.
1333 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1335 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1337 If C<$substring> matches a part of C<$string>, in scalar context, returns
1338 B<a reference to> the first occurrence of the matching part
1339 (C<$match_ref> is always true if matches,
1340 since every reference is B<true>);
1341 in list context, returns the first occurrence of the matching part.
1343 If C<$substring> does not match any part of C<$string>,
1344 returns C<undef> in scalar context and
1345 an empty list in list context.
1349 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1350 print "matches [$$match_ref].\n";
1352 print "doesn't match.\n";
1357 if (($match) = $Collator->match($str, $sub)) { # list context
1358 print "matches [$match].\n";
1360 print "doesn't match.\n";
1363 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1365 If C<$substring> matches a part of C<$string>, returns
1366 all the matching parts (or matching count in scalar context).
1368 If C<$substring> does not match any part of C<$string>,
1369 returns an empty list.
1371 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1373 If C<$substring> matches a part of C<$string>,
1374 the first occurrence of the matching part is replaced by C<$replacement>
1375 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1377 C<$replacement> can be a C<CODEREF>,
1378 taking the matching part as an argument,
1379 and returning a string to replace the matching part
1380 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1382 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1384 If C<$substring> matches a part of C<$string>,
1385 all the occurrences of the matching part is replaced by C<$replacement>
1386 (C<$string> is modified) and return C<$count>.
1388 C<$replacement> can be a C<CODEREF>,
1389 taking the matching part as an argument,
1390 and returning a string to replace the matching part
1391 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1395 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1396 # (normalization => undef) is REQUIRED.
1397 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1398 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1400 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1401 # i.e., all the camels are made bold-faced.
1405 =head2 Other Methods
1409 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1411 Change the value of specified keys and returns the changed part.
1413 $Collator = Unicode::Collate->new(level => 4);
1415 $Collator->eq("perl", "PERL"); # false
1417 %old = $Collator->change(level => 2); # returns (level => 4).
1419 $Collator->eq("perl", "PERL"); # true
1421 $Collator->change(%old); # returns (level => 2).
1423 $Collator->eq("perl", "PERL"); # false
1425 Not all C<(key,value)>s are allowed to be changed.
1426 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1428 In the scalar context, returns the modified collator
1429 (but it is B<not> a clone from the original).
1431 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1433 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1435 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1439 Returns the version number of UTS #10 this module consults.
1441 =item Base_Unicode_Version
1443 Returns the version number of the Unicode Standard
1444 this module is based on.
1454 Unicode::Collate has not been ported to EBCDIC.
1455 IMHO, use of utf8::unicode_to_native()/utf8::native_to_unicode()
1456 at the proper postions should allow
1457 this module to work on EBCDIC platform...
1461 Use of the C<normalization> parameter requires
1462 the B<Unicode::Normalize> module.
1464 If you need not it (say, in the case when you need not
1465 handle any combining characters),
1466 assign C<normalization =E<gt> undef> explicitly.
1468 -- see 6.5 Avoiding Normalization, UTS #10.
1470 =head2 Conformance Test
1472 The Conformance Test for the UCA is provided
1473 in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1474 and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1476 For F<CollationTest_SHIFTED.txt>,
1477 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1478 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1479 C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1481 B<Unicode::Normalize is required to try The Conformance Test.>
1485 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1487 http://homepage1.nifty.com/nomenclator/perl/
1489 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1491 This library is free software; you can redistribute it
1492 and/or modify it under the same terms as Perl itself.
1498 =item http://www.unicode.org/reports/tr10/
1500 Unicode Collation Algorithm - UTS #10
1502 =item http://www.unicode.org/reports/tr10/allkeys.txt
1504 The Default Unicode Collation Element Table
1506 =item http://www.unicode.org/reports/tr10/CollationTest.html
1507 http://www.unicode.org/reports/tr10/CollationTest.zip
1509 The latest versions of the conformance test for the UCA
1511 =item http://www.unicode.org/reports/tr15/
1513 Unicode Normalization Forms - UAX #15
1515 =item L<Unicode::Normalize>