1 package Unicode::Collate;
5 die "Unicode::Collate not ported to EBCDIC\n";
17 our $VERSION = '0.20';
18 our $PACKAGE = __PACKAGE__;
20 our @ISA = qw(Exporter);
22 our %EXPORT_TAGS = ();
26 (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27 our $KeyFile = "allkeys.txt";
31 eval { require Unicode::UCD };
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
36 else { # XXX, Perl 5.6.1
38 foreach my $d (@INC) {
39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
41 $UNICODE_VERSION = '3.0.1';
48 our $getCombinClass; # coderef for combining class from Unicode::Normalize
50 use constant Min2 => 0x20; # minimum weight at level 2
51 use constant Min3 => 0x02; # minimum weight at level 3
54 use constant VCE_FORMAT => 'Cn4'; # for variable + CE with 4 levels
57 use constant NON_VAR => 0; # Non-Variable character
58 use constant VAR => 1; # Variable character
60 our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
62 sub UCA_Version { "9" }
64 sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
68 blanked non-ignorable shifted shift-trimmed
72 alternate backwards level normalization rearrange
73 katakana_before_hiragana upper_before_lower
74 overrideHangul overrideCJK preprocess UCA_Version
78 entry entries table ignored combining maxlength
79 ignoreChar ignoreName undefChar undefName
80 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
81 derivCode normCode rearrangeHash isShift L3ignorable
84 my (%ChangeOK, %ChangeNG);
85 @ChangeOK{ @ChangeOK } = ();
86 @ChangeNG{ @ChangeNG } = ();
92 foreach my $k (keys %hash) {
93 if (exists $ChangeOK{$k}) {
94 $old{$k} = $self->{$k};
95 $self->{$k} = $hash{$k};
97 elsif (exists $ChangeNG{$k}) {
98 croak "change of $k via change() is not allowed!";
102 $self->checkCollator;
103 return wantarray ? %old : $self;
108 croak "Illegal level lower than 1 (passed $self->{level})."
109 if $self->{level} < 1;
110 croak "A level higher than 4 (passed $self->{level}) is not supported."
111 if 4 < $self->{level};
114 $self->{UCA_Version} == -1 ? \&broken_derivCE :
115 $self->{UCA_Version} == 8 ? \&derivCE_8 :
116 $self->{UCA_Version} == 9 ? \&derivCE_9 :
117 croak "Illegal UCA version (passed $self->{UCA_Version}).";
119 $self->{alternate} = lc($self->{alternate});
120 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
121 unless exists $AlternateOK{ $self->{alternate} };
123 $self->{isShift} = $self->{alternate} eq 'shifted' ||
124 $self->{alternate} eq 'shift-trimmed';
126 $self->{backwards} = []
127 if ! defined $self->{backwards};
128 $self->{backwards} = [ $self->{backwards} ]
129 if ! ref $self->{backwards};
131 $self->{rearrange} = []
132 if ! defined $self->{rearrange};
133 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
134 if ! ref $self->{rearrange};
136 # keys of $self->{rearrangeHash} are $self->{rearrange}.
137 $self->{rearrangeHash} = undef;
139 if (@{ $self->{rearrange} }) {
140 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
143 $self->{normCode} = undef;
145 if (defined $self->{normalization}) {
146 eval { require Unicode::Normalize };
147 croak "Unicode/Normalize.pm is required to normalize strings: $@"
150 Unicode::Normalize->import();
151 $getCombinClass = \&Unicode::Normalize::getCombinClass
152 if ! $getCombinClass;
155 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
156 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
157 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
158 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
159 croak "$PACKAGE unknown normalization form name: "
160 . $self->{normalization};
168 my $self = bless { @_ }, $class;
170 # If undef is passed explicitly, no file is read.
171 $self->{table} = $KeyFile if ! exists $self->{table};
172 $self->read_table if defined $self->{table};
174 if ($self->{entry}) {
175 $self->parseEntry($_) foreach split /\n/, $self->{entry};
178 $self->{level} ||= 4;
179 $self->{UCA_Version} ||= UCA_Version();
181 $self->{overrideHangul} = ''
182 if ! exists $self->{overrideHangul};
183 $self->{overrideCJK} = ''
184 if ! exists $self->{overrideCJK};
185 $self->{normalization} = 'D'
186 if ! exists $self->{normalization};
187 $self->{alternate} = $self->{alternateTable} || 'shifted'
188 if ! exists $self->{alternate};
189 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
190 if ! exists $self->{rearrange};
191 $self->{backwards} = $self->{backwardsTable}
192 if ! exists $self->{backwards};
194 $self->checkCollator;
201 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
203 my $filepath = File::Spec->catfile($Path, $file);
204 open my $fk, "<$filepath"
205 or croak "File does not exist at $filepath";
210 if (/^\s*\@version\s*(\S*)/) {
211 $self->{versionTable} ||= $1;
213 elsif (/^\s*\@alternate\s+(\S*)/) {
214 $self->{alternateTable} ||= $1;
216 elsif (/^\s*\@backwards\s+(\S*)/) {
217 push @{ $self->{backwardsTable} }, $1;
219 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
220 push @{ $self->{forwardsTable} }, $1;
222 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
223 push @{ $self->{rearrangeTable} }, _getHexArray($1);
227 $self->parseEntry($_);
234 ## get $line, parse it, and write an entry in $self
240 my($name, $ele, @key);
242 return if $line !~ /^\s*[0-9A-Fa-f]/;
244 # removes comment and gets name
246 if $line =~ s/[#%]\s*(.*)//;
247 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
250 my($e, $k) = split /;/, $line;
251 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
254 my @e = _getHexArray($e);
257 $ele = pack('U*', @e);
258 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
261 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
262 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
264 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
267 my $combining = 1; # primary = 0, secondary != 0;
270 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
271 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
272 my @arr = _getHexArray($arr);
273 push @key, pack(VCE_FORMAT, $var, @arr);
274 $combining = 0 unless $arr[0] == 0 && $arr[1] != 0;
275 $level3ingore = 1 if $arr[0] == 0 && $arr[1] == 0 && $arr[2] == 0;
277 $self->{entries}{$ele} = \@key;
279 $self->{combining}{$ele} = 1
282 $self->{L3ignorable}{$e[0]} = 1
283 if @e == 1 && $level3ingore;
285 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
289 ## arrayref CE = altCE(bool variable?, list[num] weights)
294 my($var, @c) = unpack(VCE_FORMAT, shift);
296 $self->{alternate} eq 'blanked' ?
297 $var ? [0,0,0,$c[3]] : \@c :
298 $self->{alternate} eq 'non-ignorable' ?
300 $self->{alternate} eq 'shifted' ?
301 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
302 $self->{alternate} eq 'shift-trimmed' ?
303 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
304 croak "$PACKAGE unknown alternate name: $self->{alternate}";
310 my $ver = $self->{UCA_Version};
312 my $key = $self->getSortKey(@_);
313 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
315 $view =~ s/ ?0000 ?/|/g;
317 $view =~ s/\b0000\b/|/g;
324 ## list[strings] elements = splitCE(string arg)
329 my $code = $self->{preprocess};
330 my $norm = $self->{normCode};
331 my $ent = $self->{entries};
332 my $max = $self->{maxlength};
333 my $reH = $self->{rearrangeHash};
334 my $L3i = $self->{L3ignorable};
335 my $ver9 = $self->{UCA_Version} > 8;
337 my $str = ref $code ? &$code(shift) : shift;
338 $str = &$norm($str) if ref $norm;
340 my @src = unpack('U*', $str);
345 for (my $i = 0; $i < @src; $i++) {
346 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
347 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
354 @src = grep ! $L3i->{$_}, @src;
357 for (my $i = 0; $i < @src; $i++) {
362 next unless defined $u;
363 next if $u < 0 || 0x10FFFF < $u # out of range
364 || (0xD800 <= $u && $u <= 0xDFFF) # unpaired surrogates
365 || (0xFDD0 <= $u && $u <= 0xFDEF) # non-character
368 my $four = $u & 0xFFFF;
369 next if $four == 0xFFFE || $four == 0xFFFF;
371 if ($max->{$u}) { # contract
372 for (my $j = $max->{$u}; $j >= 1; $j--) {
373 next unless $i+$j-1 < @src;
374 $ch = pack 'U*', @src[$i .. $i+$j-1];
375 $i += $j-1, last if $ent->{$ch};
381 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
382 if ($getCombinClass && defined $ch) {
383 for (my $j = $i+1; $j < @src; $j++) {
384 next unless defined $src[$j];
385 last unless $getCombinClass->( $src[$j] );
386 my $comb = pack 'U', $src[$j];
387 next if ! $ent->{ $ch.$comb };
394 wantarray ? @buf : \@buf;
399 ## list[arrayrefs] weight = getWt(string element)
405 my $ent = $self->{entries};
406 my $ign = $self->{ignored};
407 my $cjk = $self->{overrideCJK};
408 my $hang = $self->{overrideHangul};
409 my $der = $self->{derivCode};
411 return if !defined $ch || $ign->{$ch}; # ignored
412 return map($self->altCE($_), @{ $ent->{$ch} })
415 my $u = unpack('U', $ch);
417 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
418 return map $self->altCE($_),
420 ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$hang($u))
424 my $vCE = $ent->{pack('U', $v)};
425 $vCE ? @$vCE : $der->($v);
429 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
430 0x4E00 <= $u && $u <= 0x9FA5 ||
431 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
432 return map $self->altCE($_),
434 ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$cjk($u))
435 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
436 ? pack(VCE_FORMAT, NON_VAR, $u, 0x20, 0x02, $u)
440 return map $self->altCE($_), $der->($u);
445 ## int = index(string, substring)
450 my $lev = $self->{level};
451 my $comb = $self->{combining};
452 my $str = $self->splitCE(shift);
453 my $sub = $self->splitCE(shift);
455 return wantarray ? (0,0) : 0 if ! @$sub;
456 return wantarray ? () : -1 if ! @$str;
458 my @subWt = grep _ignorableAtLevel($_,$lev),
459 map $self->getWt($_), @$sub;
463 for (my $i = 0; $i < @$str; $i++) {
466 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
467 $go_ahead += length $str->[$i];
469 # /*XXX*/ still broken.
470 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
471 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
473 # go ahead as far as we find a combining character;
474 while ($i + 1 < @$str &&
475 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
477 next if ! defined $str->[$i];
478 $go_ahead += length $str->[$i];
480 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
484 push @strPt, ($count) x @tmp;
487 while (@strWt >= @subWt) {
488 if (_eqArray(\@strWt, \@subWt, $lev)) {
490 return wantarray ? ($pos, $count-$pos) : $pos;
496 return wantarray ? () : -1;
500 ## bool _eqArray(arrayref, arrayref, level)
504 my $a = shift; # length $a >= length $b;
507 for my $v (0..$lev-1) {
508 for my $c (0..@$b-1){
509 return if $a->[$c][$v] != $b->[$c][$v];
517 ## bool _ignorableAtLevel(CE, level)
519 sub _ignorableAtLevel($$)
522 return unless defined $ce;
524 return ! grep { ! $ce->[$_] } 0..$lv-1;
529 ## string sortkey = getSortKey(string arg)
534 my $lev = $self->{level};
535 my $rCE = $self->splitCE(shift); # get an arrayref
536 my $ver9 = $self->{UCA_Version} > 8;
537 my $sht = $self->{isShift};
540 my (@buf, $last_is_variable);
542 foreach my $ce (@$rCE) {
543 my @t = $self->getWt($ce);
545 if (@t == 1 && $t[0][0] == 0) {
546 if ($t[0][1] == 0 && $t[0][2] == 0) {
547 $last_is_variable = 1;
549 next if $last_is_variable;
552 $last_is_variable = 0;
559 my @ret = ([],[],[],[]);
560 foreach my $v (0..$lev-1) {
561 foreach my $b (@buf) {
562 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
565 foreach (@{ $self->{backwards} }) {
567 @{ $ret[$v] } = reverse @{ $ret[$v] };
570 # modification of tertiary weights
571 if ($self->{upper_before_lower}) {
572 foreach (@{ $ret[2] }) {
573 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
574 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
575 elsif ($_ == 0x1C) { $_ += 1 } # square upper
576 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
579 if ($self->{katakana_before_hiragana}) {
580 foreach (@{ $ret[2] }) {
581 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
582 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
585 join "\0\0", map pack('n*', @$_), @ret;
590 ## int compare = cmp(string a, string b)
592 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
593 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
594 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
595 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
596 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
597 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
598 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
601 ## list[strings] sorted = sort(list[strings] arg)
607 sort{ $a->[0] cmp $b->[0] }
608 map [ $obj->getSortKey($_), $_ ], @_;
615 (0x4E00 <= $u && $u <= 0x9FA5) # CJK
617 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
620 my $aaaa = $base + ($u >> 15);
621 my $bbbb = ($u & 0x7FFF) | 0x8000;
623 pack(VCE_FORMAT, NON_VAR, $aaaa, Min2, Min3, $u),
624 pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $u);
629 my $aaaa = 0xFF80 + ($code >> 15);
630 my $bbbb = ($code & 0x7FFF) | 0x8000;
632 pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
633 pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
636 sub broken_derivCE { # NG
638 my $aaaa = 0xFFC2 + ($code >> 15);
639 my $bbbb = $code & 0x7FFF | 0x1000;
641 pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
642 pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
646 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
648 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
651 # $code must be in Hangul syllable.
652 # Check it before you enter here.
656 my $SIndex = $code - 0xAC00;
657 my $LIndex = int( $SIndex / 588);
658 my $VIndex = int(($SIndex % 588) / 28);
659 my $TIndex = $SIndex % 28;
663 $TIndex ? (0x11A7 + $TIndex) : (),
672 Unicode::Collate - Unicode Collation Algorithm
676 use Unicode::Collate;
679 $Collator = Unicode::Collate->new(%tailoring);
682 @sorted = $Collator->sort(@not_sorted);
685 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
689 =head2 Constructor and Tailoring
691 The C<new> method returns a collator object.
693 $Collator = Unicode::Collate->new(
694 UCA_Version => $UCA_Version,
695 alternate => $alternate,
696 backwards => $levelNumber, # or \@levelNumbers
698 normalization => $normalization_form,
699 ignoreName => qr/$ignoreName/,
700 ignoreChar => qr/$ignoreChar/,
701 katakana_before_hiragana => $bool,
702 level => $collationLevel,
703 overrideCJK => \&overrideCJK,
704 overrideHangul => \&overrideHangul,
705 preprocess => \&preprocess,
706 rearrange => \@charList,
708 undefName => qr/$undefName/,
709 undefChar => qr/$undefChar/,
710 upper_before_lower => $bool,
712 # if %tailoring is false (i.e. empty),
713 # $Collator should do the default collation.
719 If the version number of the older UCA is given,
720 the older behavior of that version is emulated on collating.
721 If omitted, the return value of C<UCA_Version()> is used.
723 The supported version: 8 or 9.
725 B<This parameter may be removed in the future version,
726 as switching the algorithm would affect the performance.>
730 -- see 3.2.2 Alternate Weighting, UTR #10.
732 This key allows to alternate weighting for variable collation elements,
733 which are marked with an ASTERISK in the table
734 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
736 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
738 These names are case-insensitive.
739 By default (if specification is omitted), 'shifted' is adopted.
741 'Blanked' Variable elements are ignorable at levels 1 through 3;
742 considered at the 4th level.
744 'Non-ignorable' Variable elements are not reset to ignorable.
746 'Shifted' Variable elements are ignorable at levels 1 through 3
747 their level 4 weight is replaced by the old level 1 weight.
748 Level 4 weight for Non-Variable elements is 0xFFFF.
750 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
755 -- see 3.1.2 French Accents, UTR #10.
757 backwards => $levelNumber or \@levelNumbers
759 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
760 If omitted, forwards at all the levels.
764 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
766 Overrides a default order or defines additional collation elements
768 entry => <<'ENTRIES', # use the UCA file format
769 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
770 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
771 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
778 -- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
780 Makes the entry in the table ignorable.
781 If a collation element is ignorable,
782 it is ignored as if the element had been deleted from there.
784 E.g. when 'a' and 'e' are ignorable,
785 'element' is equal to 'lament' (or 'lmnt').
789 -- see 4.3 Form a sort key for each string, UTR #10.
791 Set the maximum level.
792 Any higher levels than the specified one are ignored.
794 Level 1: alphabetic ordering
795 Level 2: diacritic ordering
796 Level 3: case ordering
797 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
801 If omitted, the maximum is the 4th.
805 -- see 4.1 Normalize each input string, UTR #10.
807 If specified, strings are normalized before preparation of sort keys
808 (the normalization is executed after preprocess).
810 As a form name, one of the following names must be used.
812 'C' or 'NFC' for Normalization Form C
813 'D' or 'NFD' for Normalization Form D
814 'KC' or 'NFKC' for Normalization Form KC
815 'KD' or 'NFKD' for Normalization Form KD
817 If omitted, the string is put into Normalization Form D.
819 If C<undef> is passed explicitly as the value for this key,
820 any normalization is not carried out (this may make tailoring easier
821 if any normalization is not desired).
827 -- see 7.1 Derived Collation Elements, UTR #10.
829 By default, mapping of CJK Unified Ideographs
830 uses the Unicode codepoint order.
831 But the mapping of CJK Unified Ideographs may be overrided.
833 ex. CJK Unified Ideographs in the JIS code point order.
836 my $u = shift; # get a Unicode codepoint
837 my $b = pack('n', $u); # to UTF-16BE
838 my $s = your_unicode_to_sjis_converter($b); # convert
839 my $n = unpack('n', $s); # convert sjis to short
840 [ $n, 0x20, 0x2, $u ]; # return the collation element
843 ex. ignores all CJK Unified Ideographs.
845 overrideCJK => sub {()}, # CODEREF returning empty list
847 # where ->eq("Pe\x{4E00}rl", "Perl") is true
848 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
850 If C<undef> is passed explicitly as the value for this key,
851 weights for CJK Unified Ideographs are treated as undefined.
852 But assignment of weight for CJK Unified Ideographs
853 in table or L<entry> is still valid.
857 -- see 7.1 Derived Collation Elements, UTR #10.
859 By default, Hangul Syllables are decomposed into Hangul Jamo.
860 But the mapping of Hangul Syllables may be overrided.
862 This tag works like L<overrideCJK>, so see there for examples.
864 If you want to override the mapping of Hangul Syllables,
865 the Normalization Forms D and KD are not appropriate
866 (they will be decomposed before overriding).
868 If C<undef> is passed explicitly as the value for this key,
869 weight for Hangul Syllables is treated as undefined
870 without decomposition into Hangul Jamo.
871 But definition of weight for Hangul Syllables
872 in table or L<entry> is still valid.
876 -- see 5.1 Preprocessing, UTR #10.
878 If specified, the coderef is used to preprocess
879 before the formation of sort keys.
881 ex. dropping English articles, such as "a" or "the".
882 Then, "the pen" is before "a pencil".
886 $str =~ s/\b(?:an?|the)\s+//gi;
892 -- see 3.1.3 Rearrangement, UTR #10.
894 Characters that are not coded in logical order and to be rearranged.
897 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
899 If you want to disallow any rearrangement,
900 pass C<undef> or C<[]> (a reference to an empty list)
901 as the value for this key.
903 B<According to the version 9 of UCA, this parameter shall not be used;
904 but it is not warned at present.>
908 -- see 3.2 Default Unicode Collation Element Table, UTR #10.
910 You can use another element table if desired.
911 The table file must be in your C<lib/Unicode/Collate> directory.
913 By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
915 If C<undef> is passed explicitly as the value for this key,
916 no file is read (but you can define collation elements via L<entry>).
918 A typical way to define a collation element table
919 without any file of table:
921 $onlyABC = Unicode::Collate->new(
923 entry => << 'ENTRIES',
924 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
925 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
926 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
927 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
928 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
929 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
937 -- see 6.3.4 Reducing the Repertoire, UTR #10.
939 Undefines the collation element as if it were unassigned in the table.
940 This reduces the size of the table.
941 If an unassigned character appears in the string to be collated,
942 the sort key is made from its codepoint
943 as a single-character collation element,
944 as it is greater than any other assigned collation elements
945 (in the codepoint order among the unassigned characters).
946 But, it'd be better to ignore characters
947 unfamiliar to you and maybe never used.
949 =item katakana_before_hiragana
951 =item upper_before_lower
953 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
955 By default, lowercase is before uppercase
956 and hiragana is before katakana.
958 If the tag is made true, this is reversed.
960 B<NOTE>: These tags simplemindedly assume
961 any lowercase/uppercase or hiragana/katakana distinctions
962 should occur in level 3, and their weights at level 3
963 should be same as those mentioned in 7.3.1, UTR #10.
964 If you define your collation elements which violates this,
965 these tags doesn't work validly.
969 =head2 Methods for Collation
973 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
975 Sorts a list of strings.
977 =item C<$result = $Collator-E<gt>cmp($a, $b)>
979 Returns 1 (when C<$a> is greater than C<$b>)
980 or 0 (when C<$a> is equal to C<$b>)
981 or -1 (when C<$a> is lesser than C<$b>).
983 =item C<$result = $Collator-E<gt>eq($a, $b)>
985 =item C<$result = $Collator-E<gt>ne($a, $b)>
987 =item C<$result = $Collator-E<gt>lt($a, $b)>
989 =item C<$result = $Collator-E<gt>le($a, $b)>
991 =item C<$result = $Collator-E<gt>gt($a, $b)>
993 =item C<$result = $Collator-E<gt>ge($a, $b)>
995 They works like the same name operators as theirs.
997 eq : whether $a is equal to $b.
998 ne : whether $a is not equal to $b.
999 lt : whether $a is lesser than $b.
1000 le : whether $a is lesser than $b or equal to $b.
1001 gt : whether $a is greater than $b.
1002 ge : whether $a is greater than $b or equal to $b.
1004 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1006 -- see 4.3 Form a sort key for each string, UTR #10.
1010 You compare the sort keys using a binary comparison
1011 and get the result of the comparison of the strings using UCA.
1013 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1017 $Collator->cmp($a, $b)
1019 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1021 use Unicode::Collate;
1022 my $c = Unicode::Collate->new();
1023 print $c->viewSortKey("Perl"),"\n";
1026 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1027 # Level 1 Level 2 Level 3 Level 4
1029 (If C<UCA_Version> is 8, the output is slightly different.)
1031 =item C<$position = $Collator-E<gt>index($string, $substring)>
1033 =item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
1035 -- see 6.8 Searching, UTR #10.
1037 If C<$substring> matches a part of C<$string>, returns
1038 the position of the first occurrence of the matching part in scalar context;
1039 in list context, returns a two-element list of
1040 the position and the length of the matching part.
1042 B<Notice> that the length of the matching part may differ from
1043 the length of C<$substring>.
1045 B<Note> that the position and the length are counted on the string
1046 after the process of preprocess, normalization, and rearrangement.
1047 Therefore, in case the specified string is not binary equal to
1048 the preprocessed/normalized/rearranged string, the position and the length
1049 may differ form those on the specified string. But it is guaranteed
1050 that, if matched, it returns a non-negative value as C<$position>.
1052 If C<$substring> does not match any part of C<$string>,
1053 returns C<-1> in scalar context and
1054 an empty list in list context.
1058 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1059 my $str = "Ich mu\x{00DF} studieren.";
1060 my $sub = "m\x{00FC}ss";
1062 if (my($pos,$len) = $Collator->index($str, $sub)) {
1063 $match = substr($str, $pos, $len);
1066 and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
1067 is primary equal to C<"m>E<252>C<ss">.
1071 =head2 Other Methods
1075 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1077 Change the value of specified keys and returns the changed part.
1079 $Collator = Unicode::Collate->new(level => 4);
1081 $Collator->eq("perl", "PERL"); # false
1083 %old = $Collator->change(level => 2); # returns (level => 4).
1085 $Collator->eq("perl", "PERL"); # true
1087 $Collator->change(%old); # returns (level => 2).
1089 $Collator->eq("perl", "PERL"); # false
1091 Not all C<(key,value)>s are allowed to be changed.
1092 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1094 In the scalar context, returns the modified collator
1095 (but it is B<not> a clone from the original).
1097 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1099 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1101 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1105 Returns the version number of Unicode Technical Standard 10
1106 this module consults.
1108 =item Base_Unicode_Version
1110 Returns the version number of the Unicode Standard
1111 this module is based on.
1121 Unicode::Collate has not been ported to EBCDIC. The code mostly would
1122 work just fine but a decision needs to be made: how the module should
1123 work in EBCDIC? Should the low 256 characters be understood as
1124 Unicode or as EBCDIC code points? Should one be chosen or should
1125 there be a way to do either? Or should such translation be left
1126 outside the module for the user to do, for example by using
1128 (or utf8::unicode_to_native()/utf8::native_to_unicode()?)
1132 Use of the C<normalization> parameter requires
1133 the B<Unicode::Normalize> module.
1135 If you need not it (say, in the case when you need not
1136 handle any combining characters),
1137 assign C<normalization =E<gt> undef> explicitly.
1139 -- see 6.5 Avoiding Normalization, UTR #10.
1141 =head2 Conformance Test
1143 The Conformance Test for the UCA is provided
1144 in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1145 and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1147 For F<CollationTest_SHIFTED.txt>,
1148 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1149 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1150 C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1152 B<Unicode::Normalize is required to try this test.>
1156 C<index()> is an experimental method and
1157 its return value may be unreliable.
1158 The correct implementation for C<index()> must be based
1159 on Locale-Sensitive Support: Level 3 in UTR #18,
1160 F<Unicode Regular Expression Guidelines>.
1162 See also 4.2 Locale-Dependent Graphemes in UTR #18.
1166 SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
1168 http://homepage1.nifty.com/nomenclator/perl/
1170 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1172 This library is free software; you can redistribute it
1173 and/or modify it under the same terms as Perl itself.
1179 =item http://www.unicode.org/reports/tr10/
1181 Unicode Collation Algorithm - UTR #10
1183 =item http://www.unicode.org/reports/tr10/allkeys.txt
1185 The Default Unicode Collation Element Table
1187 =item http://www.unicode.org/reports/tr10/CollationTest.html
1188 http://www.unicode.org/reports/tr10/CollationTest.zip
1190 The latest versions of the conformance test for the UCA
1192 =item http://www.unicode.org/reports/tr15/
1194 Unicode Normalization Forms - UAX #15
1196 =item http://www.unicode.org/reports/tr18
1198 Unicode Regular Expression Guidelines - UTR #18
1200 =item L<Unicode::Normalize>