From: 貞廣知行 Date: Mon, 12 Nov 2001 00:22:57 +0000 (+0900) Subject: Unicode::Collate v0.09 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=5398038ef5e01ea763adf26da6c759974eda549b;p=p5sagit%2Fp5-mst-13.2.git Unicode::Collate v0.09 Message-Id: <20011112002232.BD46.BQW10602@nifty.com> p4raw-id: //depot/perl@12942 --- diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index 2ffda37..3393d43 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -4,9 +4,10 @@ use 5.006; use strict; use warnings; use Carp; + require Exporter; -our $VERSION = '0.08'; +our $VERSION = '0.09'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -18,16 +19,7 @@ our @EXPORT = (); (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//; our $KeyFile = "allkeys.txt"; -# Lingua::KO::Hangul::Util not part of the standard distribution -# but it will be used if available. - -eval { require Lingua::KO::Hangul::Util }; -my $hasHangulUtil = ! $@; -if ($hasHangulUtil) { - Lingua::KO::Hangul::Util->import(); -} - -our %Combin; # combining class from Unicode::Normalize +our $getCombinClass; # coderef for combining class from Unicode::Normalize use constant Min2 => 0x20; # minimum weight at level 2 use constant Min3 => 0x02; # minimum weight at level 3 @@ -47,14 +39,20 @@ sub new ! defined $self->{alternate} ? '' : $self->{alternate}; # collation level - $self->{level} ||= ($self->{alternate} =~ /shift/ ? 4 : 3); + $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3); # normalization form $self->{normalization} = 'D' if ! exists $self->{normalization}; - eval "use Unicode::Normalize;" if defined $self->{normalization}; + if(defined $self->{normalization}){ + eval "use Unicode::Normalize;"; + croak "you'd install Unicode::Normalize for normalization forms: $@" + if $@; + $getCombinClass = \&Unicode::Normalize::getCombinClass + if ! $getCombinClass; + } - $self->{normalize} = + $self->{UNF} = ! defined $self->{normalization} ? undef : $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC : $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD : @@ -62,8 +60,6 @@ sub new $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD : croak "$PACKAGE unknown normalization form name: $self->{normalization}"; - *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin; - # backwards $self->{backwards} ||= []; $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards}; @@ -188,7 +184,7 @@ sub splitCE { my $self = shift; my $code = $self->{preprocess}; - my $norm = $self->{normalize}; + my $norm = $self->{UNF}; my $ent = $self->{entries}; my $max = $self->{maxlength}; my $rear = $self->{rearrangeHash}; @@ -229,15 +225,15 @@ sub splitCE } else { $ch = pack('U', $u) } - if(%Combin && defined $ch) # with Combining Char + # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize. + if($getCombinClass && defined $ch) { - for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++) + for(my $j = $i+1; $j < @src && $getCombinClass->( $src[$j] ); $j++) { my $comb = pack 'U', $src[$j]; next if ! $ent->{ $ch.$comb }; $ch .= $comb; splice(@src, $j, 1); - last; } } push @buf, $ch; @@ -264,10 +260,7 @@ sub getWt _isHangul($u) ? $hang ? &$hang($u) - : ($hasHangulUtil ? - map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u)) : - # runtime compile error... - (eval 'use Lingua::KO::Hangul::Util', print $@)) + : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u)) : _isCJK($u) ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u)) : map($self->altCE(0,@$_), _derivCE($u)); @@ -383,13 +376,13 @@ sub getSortKey ## ## int compare = cmp(string a, string b) ## -sub cmp -{ - my $obj = shift; - my $a = shift; - my $b = shift; - $obj->getSortKey($a) cmp $obj->getSortKey($b); -} +sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) } +sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) } +sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) } +sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) } +sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) } +sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) } +sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) } ## ## list[strings] sorted = sort(list[strings] arg) @@ -419,11 +412,7 @@ sub _derivCE ## ## "hhhh hhhh hhhh" to (dddd, dddd, dddd) ## -sub _getHexArray -{ - my $str = shift; - map hex(), $str =~ /([0-9a-fA-F]+)/g; -} +sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g } ## ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint) @@ -439,19 +428,25 @@ sub _isCJK ## ## list[arrayref] CE = _CJK(int codepoint_of_CJK) ## -sub _CJK -{ - my $u = shift; - $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u]; -} +sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] } ## ## bool is_a_Hangul_Syllable = _isHangul(int codepoint) ## -sub _isHangul -{ - my $code = shift; - return 0xAC00 <= $code && $code <= 0xD7A3; +sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 } + +sub _decompHangul { + my $code = shift; + # $code must be in Hangul syllable. check it before you enter here. + my $SIndex = $code - 0xAC00; + my $LIndex = int( $SIndex / 588); + my $VIndex = int(($SIndex % 588) / 28); + my $TIndex = $SIndex % 28; + return ( + 0x1100 + $LIndex, + 0x1161 + $VIndex, + $TIndex ? (0x11A7 + $TIndex) : (), + ); } 1; @@ -466,13 +461,13 @@ Unicode::Collate - use UCA (Unicode Collation Algorithm) use Unicode::Collate; #construct - $UCA = Unicode::Collate->new(%tailoring); + $Collator = Unicode::Collate->new(%tailoring); #sort - @sorted = $UCA->sort(@not_sorted); + @sorted = $Collator->sort(@not_sorted); #compare - $result = $UCA->cmp($a, $b); # returns 1, 0, or -1. + $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. =head1 DESCRIPTION @@ -480,7 +475,7 @@ Unicode::Collate - use UCA (Unicode Collation Algorithm) The C method returns a collator object. - $UCA = Unicode::Collate->new( + $Collator = Unicode::Collate->new( alternate => $alternate, backwards => $levelNumber, # or \@levelNumbers entry => $element, @@ -499,7 +494,7 @@ The C method returns a collator object. upper_before_lower => $bool, ); # if %tailoring is false (empty), - # $UCA should do the default collation. + # $Collator should do the default collation. =over 4 @@ -675,17 +670,38 @@ If the parameter is true, this is reversed. =over 4 -=item C<@sorted = $UCA-Esort(@not_sorted)> +=item C<@sorted = $Collator-Esort(@not_sorted)> Sorts a list of strings. -=item C<$result = $UCA-Ecmp($a, $b)> +=item C<$result = $Collator-Ecmp($a, $b)> Returns 1 (when C<$a> is greater than C<$b>) or 0 (when C<$a> is equal to C<$b>) or -1 (when C<$a> is lesser than C<$b>). -=item C<$sortKey = $UCA-EgetSortKey($string)> +=item C<$result = $Collator-Eeq($a, $b)> + +=item C<$result = $Collator-Ene($a, $b)> + +=item C<$result = $Collator-Elt($a, $b)> + +=item C<$result = $Collator-Ele($a, $b)> + +=item C<$result = $Collator-Egt($a, $b)> + +=item C<$result = $Collator-Ege($a, $b)> + +They works like the same name operators as theirs. + + eq : whether $a is equal to $b. + ne : whether $a is not equal to $b. + lt : whether $a is lesser than $b. + le : whether $a is lesser than $b or equal to $b. + gt : whether $a is greater than $b. + ge : whether $a is greater than $b or equal to $b. + +=item C<$sortKey = $Collator-EgetSortKey($string)> -- see 4.3 Form a sort key for each string, UTR #10. @@ -694,15 +710,15 @@ Returns a sort key. You compare the sort keys using a binary comparison and get the result of the comparison of the strings using UCA. - $UCA->getSortKey($a) cmp $UCA->getSortKey($b) + $Collator->getSortKey($a) cmp $Collator->getSortKey($b) is equivalent to - $UCA->cmp($a, $b) + $Collator->cmp($a, $b) -=item C<$position = $UCA-Eindex($string, $substring)> +=item C<$position = $Collator-Eindex($string, $substring)> -=item C<($position, $length) = $UCA-Eindex($string, $substring)> +=item C<($position, $length) = $Collator-Eindex($string, $substring)> -- see 6.8 Searching, UTR #10. @@ -727,12 +743,12 @@ an empty list in list context. e.g. you say - my $UCA = Unicode::Collate->new( normalization => undef, level => 1 ); + my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); my $str = "Ich mu\x{00DF} studieren."; my $sub = "m\x{00FC}ss"; my $match; - if(my @tmp = $UCA->index($str, $sub)){ - $match = substr($str, $tmp[0], $tmp[1]); + if(my($pos,$len) = $Collator->index($str, $sub)){ + $match = substr($str, $pos, $len); } and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<"> @@ -749,10 +765,12 @@ None by default. Use of the C parameter requires the B module. -If you need not it (e.g. in the case when you need not +If you need not it (say, in the case when you need not handle any combining characters), assign C undef> explicitly. +-- see 6.5 Avoiding Normalization, UTR #10. + =head1 AUTHOR SADAHIRO Tomoyuki, ESADAHIRO@cpan.orgE @@ -768,18 +786,14 @@ SADAHIRO Tomoyuki, ESADAHIRO@cpan.orgE =over 4 -=item L +=item Unicode Collation Algorithm - Unicode TR #10 -utility functions for Hangul Syllables +http://www.unicode.org/unicode/reports/tr10/ =item L normalized forms of Unicode text -=item Unicode Collation Algorithm - Unicode TR #10 - -http://www.unicode.org/unicode/reports/tr10/ - =back =cut diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes index 9b4ed1b..3d6acdb 100644 --- a/lib/Unicode/Collate/Changes +++ b/lib/Unicode/Collate/Changes @@ -1,5 +1,14 @@ Revision history for Perl extension Unicode::Collate. +0.09 Sun Nov 11 17:02:40:18 2001 + - add the following methods: eq, ne, lt, le, gt, le. + - relies on &Unicode::Normalize::getCombinClass() + in place of %Unicode::Normalize::Combin + (the hash is not defined in the XS version of Unicode::Normalize). + then you should install Unicode::Normalize 0.10 or later. + - now independent of Lingua::KO::Hangul::Util + (this module does decomposition of Hangul syllables for itself) + 0.08 Mon Aug 20 22:40:18 2001 - add the index method. diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README index 0cf2bc3..c84a73c 100644 --- a/lib/Unicode/Collate/README +++ b/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.08 +Unicode/Collate version 0.09 =============================== use UCA (Unicode Collation Algorithm) @@ -17,13 +17,15 @@ SYNOPSIS use Unicode::Collate; #construct - $UCA = Unicode::Collate->new(%tailoring); + $Collator = Unicode::Collate->new(%tailoring); #sort - @sorted = $UCA->sort(@not_sorted); + @sorted = $Collator->sort(@not_sorted); #compare - $result = $UCA->cmp($a, $b); # returns 1, 0, or -1. + $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. + $result = $Collator->eq($a, $b); # returns true/false + (similarly ->ne, ->lt, ->le, ->gt, ->ge) INSTALLATION @@ -38,13 +40,7 @@ To install this module type the following: DEPENDENCIES -This module requires these other modules and libraries: - -Carp -Exporter -Lingua::KO::Hangul::Util - -It's better if Unicode::Normalize has been installed +It's better if you have Unicode::Normalize (v 0.10 or later) although Unicode::Collate can be used without Unicode::Normalize. COPYRIGHT AND LICENCE diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t index 352c44f..48bf412 100644 --- a/lib/Unicode/Collate/t/test.t +++ b/lib/Unicode/Collate/t/test.t @@ -4,21 +4,21 @@ ######################### use Test; -BEGIN { plan tests => 22 }; +BEGIN { plan tests => 50 }; use Unicode::Collate; ok(1); # If we made it this far, we're ok. ######################### -my $UCA = Unicode::Collate->new( +my $Collator = Unicode::Collate->new( table => 'keys.txt', normalization => undef, ); -ok(ref $UCA, "Unicode::Collate"); +ok(ref $Collator, "Unicode::Collate"); ok( - join(':', $UCA->sort( + join(':', $Collator->sort( qw/ lib strict Carp ExtUtils CGI Time warnings Math overload Pod CPAN / ) ), join(':', @@ -29,10 +29,13 @@ ok( my $A_acute = pack('U', 0x00C1); my $acute = pack('U', 0x0301); -ok($UCA->cmp("A$acute", $A_acute), -1); +ok($Collator->cmp("A$acute", $A_acute), -1); -ok($UCA->cmp("", ""), 0); -ok($UCA->cmp("", "perl"), -1); +ok($Collator->cmp("", ""), 0); +ok(! $Collator->ne("", "") ); +ok( $Collator->eq("", "") ); + +ok($Collator->cmp("", "perl"), -1); eval "use Unicode::Normalize"; @@ -53,6 +56,7 @@ my $tr = Unicode::Collate->new( entry => <<'ENTRIES', 0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish +00DF ; [.09F3.0154.0004.00DF] [.09F3.0020.0004.00DF] # eszet in Germany ENTRIES ); @@ -66,7 +70,7 @@ ok( ); ok( - join(':', $UCA->sort( + join(':', $Collator->sort( qw/ acha aca ada acia acka / ) ), join(':', @@ -74,36 +78,60 @@ ok( ), ); -my $old_level = $UCA->{level}; +my $old_level = $Collator->{level}; my $hiragana = "\x{3042}\x{3044}"; my $katakana = "\x{30A2}\x{30A4}"; -$UCA->{level} = 2; +$Collator->{level} = 2; + +ok( $Collator->cmp("ABC","abc"), 0); +ok( $Collator->eq("ABC","abc") ); +ok( $Collator->le("ABC","abc") ); +ok( $Collator->cmp($hiragana, $katakana), 0); +ok( $Collator->eq($hiragana, $katakana) ); +ok( $Collator->ge($hiragana, $katakana) ); -ok( $UCA->cmp("ABC","abc"), 0); -ok( $UCA->cmp($hiragana, $katakana), 0); +# hangul +ok( $Collator->eq("a\x{AC00}b", "a\x{1100}\x{1161}b") ); +ok( $Collator->eq("a\x{AE00}b", "a\x{1100}\x{1173}\x{11AF}b") ); +ok( $Collator->gt("a\x{AE00}b", "a\x{1100}\x{1173}b\x{11AF}") ); +ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") ); +ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") ); +ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana -$UCA->{level} = $old_level; +$Collator->{level} = $old_level; -$UCA->{katakana_before_hiragana} = 1; +$Collator->{katakana_before_hiragana} = 1; -ok( $UCA->cmp("abc", "ABC"), -1); -ok( $UCA->cmp($hiragana, $katakana), 1); +ok( $Collator->cmp("abc", "ABC"), -1); +ok( $Collator->ne("abc", "ABC") ); +ok( $Collator->lt("abc", "ABC") ); +ok( $Collator->le("abc", "ABC") ); +ok( $Collator->cmp($hiragana, $katakana), 1); +ok( $Collator->ne($hiragana, $katakana) ); +ok( $Collator->gt($hiragana, $katakana) ); +ok( $Collator->ge($hiragana, $katakana) ); -$UCA->{upper_before_lower} = 1; +$Collator->{upper_before_lower} = 1; -ok( $UCA->cmp("abc", "ABC"), 1); -ok( $UCA->cmp($hiragana, $katakana), 1); +ok( $Collator->cmp("abc", "ABC"), 1); +ok( $Collator->ge("abc", "ABC"), 1); +ok( $Collator->gt("abc", "ABC"), 1); +ok( $Collator->cmp($hiragana, $katakana), 1); +ok( $Collator->ge($hiragana, $katakana), 1); +ok( $Collator->gt($hiragana, $katakana), 1); -$UCA->{katakana_before_hiragana} = 0; +$Collator->{katakana_before_hiragana} = 0; -ok( $UCA->cmp("abc", "ABC"), 1); -ok( $UCA->cmp($hiragana, $katakana), -1); +ok( $Collator->cmp("abc", "ABC"), 1); +ok( $Collator->cmp($hiragana, $katakana), -1); -$UCA->{upper_before_lower} = 0; +$Collator->{upper_before_lower} = 0; -ok( $UCA->cmp("abc", "ABC"), -1); -ok( $UCA->cmp($hiragana, $katakana), -1); +ok( $Collator->cmp("abc", "ABC"), -1); +ok( $Collator->le("abc", "ABC") ); +ok( $Collator->cmp($hiragana, $katakana), -1); +ok( $Collator->lt($hiragana, $katakana) ); my $ign = Unicode::Collate->new( table => 'keys.txt', @@ -113,27 +141,60 @@ my $ign = Unicode::Collate->new( ok( $ign->cmp("element","lament"), 0); -$UCA->{level} = 2; +$Collator->{level} = 2; -my $orig = "This is a Perl book."; my $str; + +my $orig = "This is a Perl book."; my $sub = "PERL"; my $rep = "camel"; my $ret = "This is a camel book."; $str = $orig; -if(my @tmp = $UCA->index($str, $sub)){ - substr($str, $tmp[0], $tmp[1], $rep); +if(my($pos,$len) = $Collator->index($str, $sub)){ + substr($str, $pos, $len, $rep); } ok($str, $ret); -$UCA->{level} = $old_level; +$Collator->{level} = $old_level; $str = $orig; -if(my @tmp = $UCA->index($str, $sub)){ - substr($str, $tmp[0], $tmp[1], $rep); +if(my($pos,$len) = $Collator->index($str, $sub)){ + substr($str, $pos, $len, $rep); } ok($str, $orig); +$tr->{level} = 1; + +$str = "Ich mu\x{00DF} studieren."; +$sub = "m\x{00FC}ss"; +my $match = undef; +if(my($pos, $len) = $tr->index($str, $sub)){ + $match = substr($str, $pos, $len); +} +ok($match, "mu\x{00DF}"); + +$tr->{level} = $old_level; + +$str = "Ich mu\x{00DF} studieren."; +$sub = "m\x{00FC}ss"; +$match = undef; +if(my($pos, $len) = $tr->index($str, $sub)){ + $match = substr($str, $pos, $len); +} +ok($match, undef); + +$match = undef; +if(my($pos,$len) = $Collator->index("", "")){ + $match = substr("", $pos, $len); +} +ok($match, ""); + +$match = undef; +if(my($pos,$len) = $Collator->index("", "abc")){ + $match = substr("", $pos, $len); +} +ok($match, undef); +