From: SADAHIRO Tomoyuki Date: Sun, 16 Dec 2001 21:22:12 +0000 (+0900) Subject: [patch @13687] Unicode::Collate 0.10 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=905aa9f0848ed318032cdb14109ace09a223b727;p=p5sagit%2Fp5-mst-13.2.git [patch @13687] Unicode::Collate 0.10 Message-Id: <20011216211615.8ED5.BQW10602@nifty.com> p4raw-id: //depot/perl@13720 --- diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index 3393d43..bd10fdd 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -7,7 +7,7 @@ use Carp; require Exporter; -our $VERSION = '0.09'; +our $VERSION = '0.10'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -68,8 +68,26 @@ sub new $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII) $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange}; - # open the table file - my $file = defined $self->{table} ? $self->{table} : $KeyFile; + # open a table file. + # if undef is passed explicitly, no file is read. + $self->{table} = $KeyFile unless exists $self->{table}; + $self->read_table if defined $self->{table}; + + if($self->{entry}){ + $self->parseEntry($_) foreach split /\n/, $self->{entry}; + } + + # keys of $self->{rearrangeHash} are $self->{rearrange}. + $self->{rearrangeHash} = {}; + @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); + + return $self; +} + + +sub read_table { + my $self = shift; + my $file = $self->{table} ne '' ? $self->{table} : $KeyFile; open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file"; while(<$fk>){ @@ -92,17 +110,9 @@ sub new $self->parseEntry($_); } close $fk; - if($self->{entry}){ - $self->parseEntry($_) foreach split /\n/, $self->{entry}; - } - - # keys of $self->{rearrangeHash} are $self->{rearrange}. 
- $self->{rearrangeHash} = {}; - @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); - - return $self; } + ## ## get $line, parse it, and write an entry in $self ## @@ -209,6 +219,7 @@ sub splitCE my $u = $src[$i]; # non-characters + next unless defined $u; next if $u < 0 || 0x10FFFF < $u # out of range || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates my $four = $u & 0xFFFF; @@ -228,12 +239,14 @@ sub splitCE # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize. if($getCombinClass && defined $ch) { - for(my $j = $i+1; $j < @src && $getCombinClass->( $src[$j] ); $j++) + for(my $j = $i+1; $j < @src; $j++) { + next unless defined $src[$j]; + last unless $getCombinClass->( $src[$j] ); my $comb = pack 'U', $src[$j]; next if ! $ent->{ $ch.$comb }; $ch .= $comb; - splice(@src, $j, 1); + $src[$j] = undef; } } push @buf, $ch; @@ -519,7 +532,7 @@ If omitted, forwards at all the levels. -- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10. -Overrides a default order or adds a new element +Overrides a default order or adds a new collation element entry => <<'ENTRIES', # use the UCA file format 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature as @@ -558,7 +571,7 @@ Any higher levels than the specified one are ignored. -- see 4.1 Normalize each input string, UTR #10. -If specified, strings are normalized before preparation sort keys +If specified, strings are normalized before preparation of sort keys (the normalization is executed after preprocess). As a form name, one of the following names must be used. @@ -637,6 +650,9 @@ The table file must be in your C directory. By default, the file C is used. +If undefined explicitly (as C<table =E<gt> undef>), +no file is read (you'd define collation elements using L<entry>). 
+ =item undefName =item undefChar diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes index 3d6acdb..d0ad3c1 100644 --- a/lib/Unicode/Collate/Changes +++ b/lib/Unicode/Collate/Changes @@ -1,5 +1,9 @@ Revision history for Perl extension Unicode::Collate. +0.10 Tue Dec 11 23:26:42 2001 + - now you are allowed to use no table file. + - fix: fetching CE with two or more combining characters. + 0.09 Sun Nov 11 17:02:40:18 2001 - add the following methods: eq, ne, lt, le, gt, le. - relies on &Unicode::Normalize::getCombinClass() diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README index c84a73c..3ff073f 100644 --- a/lib/Unicode/Collate/README +++ b/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.09 +Unicode/Collate version 0.10 =============================== use UCA (Unicode Collation Algorithm) diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t index 48bf412..d9ee1fe 100644 --- a/lib/Unicode/Collate/t/test.t +++ b/lib/Unicode/Collate/t/test.t @@ -4,7 +4,7 @@ ######################### use Test; -BEGIN { plan tests => 50 }; +BEGIN { plan tests => 54 }; use Unicode::Collate; ok(1); # If we made it this far, we're ok. 
@@ -42,11 +42,32 @@ eval "use Unicode::Normalize"; if(!$@){ my $NFD = Unicode::Collate->new( table => 'keys.txt', + entry => <<'ENTRIES', +0430 ; [.0B01.0020.0002.0430] # CYRILLIC SMALL LETTER A +0410 ; [.0B01.0020.0008.0410] # CYRILLIC CAPITAL LETTER A +04D3 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS +0430 0308 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS +04D3 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS +0430 0308 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS +04D2 ; [.0B09.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS +0410 0308 ; [.0B09.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS +0430 3099 ; [.0B10.0020.0002.04D3] # A WITH KATAKANA VOICED +0430 3099 0308 ; [.0B11.0020.0002.04D3] # A WITH KATAKANA VOICED, DIAERESIS +ENTRIES ); - ok($NFD->cmp("A$acute", $A_acute), 0); + ok($NFD->eq("A$acute", $A_acute)); + ok($NFD->eq("\x{4D3}\x{325}", "\x{430}\x{308}\x{325}")); + ok($NFD->lt("\x{430}\x{308}A", "\x{430}\x{308}B")); + ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A")); + ok($NFD->eq("\x{0430}\x{3099}\x{309A}\x{0308}", + "\x{0430}\x{309A}\x{3099}\x{0308}") ); } else{ ok(1); + ok(1); + ok(1); + ok(1); + ok(1); } my $tr = Unicode::Collate->new(