From: Nicholas Clark Date: Sat, 29 Nov 2003 17:29:15 +0000 (+0000) Subject: Update Unicode::Collate to 0.31 (Only the .pm version for now) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=10d7ec48cd7252976e5d98d8245df9ed1b239c74;p=p5sagit%2Fp5-mst-13.2.git Update Unicode::Collate to 0.31 (Only the .pm version for now) p4raw-id: //depot/perl@21810 --- diff --git a/MANIFEST b/MANIFEST index a4682af..854a1a7 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1747,6 +1747,8 @@ lib/Unicode/Collate/README Unicode::Collate lib/Unicode/Collate/t/contract.t Unicode::Collate lib/Unicode/Collate/t/hangtype.t Unicode::Collate lib/Unicode/Collate/t/hangul.t Unicode::Collate +lib/Unicode/Collate/t/illegal.t Unicode::Collate +lib/Unicode/Collate/t/illegalp.t Unicode::Collate lib/Unicode/Collate/t/index.t Unicode::Collate lib/Unicode/Collate/t/normal.t Unicode::Collate lib/Unicode/Collate/t/test.t Unicode::Collate diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index a4d6d80..e700597 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -12,9 +12,11 @@ use warnings; use Carp; use File::Spec; +no warnings 'utf8'; + require Exporter; -our $VERSION = '0.30'; +our $VERSION = '0.31'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -206,7 +208,7 @@ sub checkCollator { or croak "Illegal UCA version (passed $self->{UCA_Version})."; $self->{variable} ||= $self->{alternate} || $self->{variableTable} || - $self->{alternateTable} || $self->{alternate} || 'shifted'; + $self->{alternateTable} || 'shifted'; $self->{variable} = $self->{alternate} = lc($self->{variable}); exists $VariableOK{ $self->{variable} } or croak "$PACKAGE unknown variable tag name: $self->{variable}"; @@ -499,7 +501,7 @@ sub splitEnt } for (my $i = 0; $i < @src; $i++) { - next if _isNonCharacter($src[$i]); + next if _isIllegal($src[$i]); my $i_orig = $i; my $jcps = $src[$i]; @@ -801,7 +803,7 @@ sub _decompHangul { ); } -sub _isNonCharacter { +sub _isIllegal { my $code = 
shift; return ! defined $code # removed || ($code < 0 || 0x10FFFF < $code) # out of range @@ -1344,11 +1346,10 @@ but it is not warned at present.> You can use another collation element table if desired. The table file must be put into a directory -where F is installed. -E.g. in F directory -when you have F. +where F is installed; e.g. into +F if you have F. -By default, the filename F<"allkeys.txt"> is used. +By default, the filename F is used. If C is passed explicitly as the value for this key, no file is read (but you can define collation elements via C). @@ -1680,9 +1681,8 @@ assign C undef> explicitly. =head2 Conformance Test -The Conformance Test for the UCA is provided -in L -and L +The Conformance Test for the UCA is available +under L. For F, a collator via Cnew( )> should be used; @@ -1693,7 +1693,7 @@ B =head1 AUTHOR -SADAHIRO Tomoyuki, +SADAHIRO Tomoyuki http://homepage1.nifty.com/nomenclator/perl/ @@ -1712,17 +1712,17 @@ L =item The Default Unicode Collation Element Table (DUCET) -L +L =item The conformance test for the UCA -L +L -L +L =item Hangul Syllable Type -http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt +L =item Unicode Normalization Forms - UAX #15 diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes index 7f92d7a..df60b97 100644 --- a/lib/Unicode/Collate/Changes +++ b/lib/Unicode/Collate/Changes @@ -1,14 +1,25 @@ Revision history for Perl module Unicode::Collate. +0.31 Sun Nov 16 15:40:15 2003 + - Illegal code points (surrogate and noncharacter; they are definitely + ignorable) will be distinguished from NULL ("\0"); + but porting is not successful in the case of ((Pure Perl) and + (Perl 5.7.3 or before)). If perl 5.6.X is used, XSUB may help it + in place of broken CORE::unpack('U*') in older perl. + - added illegal.t and illegalp.t. + - added XSUB edition (EXPERIMENTAL) where some functions are implemented + in XSUB (Pure Perl edition is also supported.) 
+ 0.30 Mon Oct 13 21:26:37 2003 - fix: Completely ignorable in table should be able to be overrided by non-ignorable in entry. - fix: Maximum length for contraction must not be shortened - by a shorter contraction following. + by a shorter contraction following in table and/or entry. - added normal.t. - some doc fixes 0.29 Mon Oct 13 12:18:23 2003 + - now UCA Version 11. - supported hangul_terminator. - fix: Base_Unicode_Version falsely returns Perl's Unicode version. C4 in UTS #10 requires UTS's Unicode version. diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README index 6a4b712..2fc4e5f 100644 --- a/lib/Unicode/Collate/README +++ b/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.30 +Unicode/Collate version 0.31 =============================== NAME @@ -23,6 +23,22 @@ SYNOPSIS INSTALLATION Perl 5.6.1 or later +(recommended: Perl 5.8.0 or later) + +To use this module, it is recommended to install a table file +in the UCA format, by copying it into the directory +where F is installed; +e.g. into F directory +if you have F. + +The most preferable one is "The Default Unicode Collation Element Table", +available from the Unicode consortium's website: + + http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version) + +Though this distribution contains a subset of allkeys.txt, named "keys.txt", +this one is intended only for doing a test of this module +and practically useless for any other purpose. To install this module type the following: @@ -31,17 +47,24 @@ To install this module type the following: make test make install -To use this module, it is better to install a table file in the UCA format, -by copying it into the lib/Unicode/Collate directory. +If you have a C compiler and want to use XSUB edition, +type the following (!! 
"enableXS" must run before "Makefile.PL" !!): -The most preferable one is "The Default Unicode Collation Element Table", -available from the Unicode consortium's website: + perl enableXS + perl Makefile.PL + make + make test + make install - http://www.unicode.org/reports/tr10/allkeys.txt +If you decide to install pure Perl (i.e. non-XS) edition after trying +to build XSUB, type the following: -Though this distribution contains a subset of allkeys.txt, named "keys.txt", -this one is intended only for doing a test of this module -and practically useless for any other purpose. + make clean + perl disableXS + perl Makefile.PL + make + make test + make install DEPENDENCIES diff --git a/lib/Unicode/Collate/t/illegal.t b/lib/Unicode/Collate/t/illegal.t new file mode 100644 index 0000000..b9961b6 --- /dev/null +++ b/lib/Unicode/Collate/t/illegal.t @@ -0,0 +1,85 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +use strict; +use warnings; + +BEGIN { + use Unicode::Collate; + + unless (exists &Unicode::Collate::bootstrap or 5.008 <= $]) { + print "1..0 # skipped: XSUB, or Perl 5.8.0 or later". + " needed for this test\n"; + print $@; + exit; + } +} + +BEGIN { plan tests => 22 }; + +ok(1); + +######################### + +no warnings 'utf8'; + +# NULL is tailorable but illegal code points are not. +# illegal code points should always be ignored +# (cf. UCA, 7.1.1 Illegal code points). 
+ +my $illeg = Unicode::Collate->new( + entry => <<'ENTRIES', +0000 ; [.0020.0000.0000.0000] # [0000] NULL +0001 ; [.0021.0000.0000.0001] # [0001] START OF HEADING +FFFE ; [.0022.0000.0000.FFFE] # +FFFF ; [.0023.0000.0000.FFFF] # +D800 ; [.0024.0000.0000.D800] # +DFFF ; [.0025.0000.0000.DFFF] # +FDD0 ; [.0026.0000.0000.FDD0] # +FDEF ; [.0027.0000.0000.FDEF] # +0002 ; [.0030.0000.0000.0002] # [0002] START OF TEXT +10FFFF; [.0040.0000.0000.10FFFF] # +110000; [.0041.0000.0000.110000] # +ENTRIES + level => 1, + table => undef, + normalization => undef, +); + +ok($illeg->lt("", "\x00")); +ok($illeg->lt("", "\x01")); +ok($illeg->eq("", "\x{FFFE}")); +ok($illeg->eq("", "\x{FFFF}")); +ok($illeg->eq("", "\x{D800}")); +ok($illeg->eq("", "\x{DFFF}")); +ok($illeg->eq("", "\x{FDD0}")); +ok($illeg->eq("", "\x{FDEF}")); +ok($illeg->lt("", "\x02")); +ok($illeg->eq("", "\x{10FFFF}")); +ok($illeg->eq("", "\x{110000}")); + +ok($illeg->lt("\x00", "\x01")); +ok($illeg->lt("\x01", "\x02")); +ok($illeg->ne("\0", "\x{D800}")); +ok($illeg->ne("\0", "\x{DFFF}")); +ok($illeg->ne("\0", "\x{FDD0}")); +ok($illeg->ne("\0", "\x{FDEF}")); +ok($illeg->ne("\0", "\x{FFFE}")); +ok($illeg->ne("\0", "\x{FFFF}")); +ok($illeg->ne("\0", "\x{10FFFF}")); +ok($illeg->ne("\0", "\x{110000}")); + diff --git a/lib/Unicode/Collate/t/illegalp.t b/lib/Unicode/Collate/t/illegalp.t new file mode 100644 index 0000000..690c88d --- /dev/null +++ b/lib/Unicode/Collate/t/illegalp.t @@ -0,0 +1,80 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 17 }; + +use strict; +use warnings; + +ok(1); + +# +# No test for Unicode::Collate is included in this .t file. 
+# +# The UCA conformance test requires that completely ignorable characters +# (including noncharacters) can be ordered in code point order; +# if they cannot, Unicode::Collate cannot be compliant with UCA. +# +# ~~~ CollationTest_SHIFTED.txt in CollationTest-4.0.0 +# +# 206F 0021; # ! NOMINAL DIGIT SHAPES [| | | 0251] +# D800 0021; # ! [| | | 0251] +# DFFF 0021; # ! [| | | 0251] +# FDD0 0021; # ! [| | | 0251] +# FFFB 0021; # ! INTERLINEAR ANNOTATION TERMINATOR [| | | 0251] +# FFFE 0021; # ! [| | | 0251] +# FFFF 0021; # ! [| | | 0251] +# 1D165 0021; # ! MS. Cm. STEM [| | | 0251] +# +# ~~~ CollationTest_NON_IGNORABLE.txt in CollationTest-4.0.0 +# +# 206F 0021; # ! NOMINAL DIGIT SHAPES [0251 | 0020 | 0002 |] +# D800 0021; # ! [0251 | 0020 | 0002 |] +# DFFF 0021; # ! [0251 | 0020 | 0002 |] +# FDD0 0021; # ! [0251 | 0020 | 0002 |] +# FFFB 0021; # ! INTERLINEAR ANNOTATION TERMINATOR [0251 | 0020 | 0002 |] +# FFFE 0021; # ! [0251 | 0020 | 0002 |] +# FFFF 0021; # ! [0251 | 0020 | 0002 |] +# 1D165 0021; # ! MS. Cm. STEM [0251 | 0020 | 0002 |] +# + +no warnings 'utf8'; + +ok("\x{206F}!" lt "\x{D800}!"); +ok(pack('U*', 0x206F, 0x21) lt pack('U*', 0xD800, 0x21)); + +ok("\x{D800}!" lt "\x{DFFF}!"); +ok(pack('U*', 0xD800, 0x21) lt pack('U*', 0xDFFF, 0x21)); + +ok("\x{DFFF}!" lt "\x{FDD0}!"); +ok(pack('U*', 0xDFFF, 0x21) lt pack('U*', 0xFDD0, 0x21) ); + +ok("\x{FDD0}!" lt "\x{FFFB}!"); +ok(pack('U*', 0xFDD0, 0x21) lt pack('U*', 0xFFFB, 0x21)); + +ok("\x{FFFB}!" lt "\x{FFFE}!"); +ok(pack('U*', 0xFFFB, 0x21) lt pack('U*', 0xFFFE, 0x21)); + +ok("\x{FFFE}!" lt "\x{FFFF}!"); +ok(pack('U*', 0xFFFE, 0x21) lt pack('U*', 0xFFFF, 0x21)); + +ok("\x{FFFF}!" lt "\x{1D165}!"); +ok(pack('U*', 0xFFFF, 0x21) lt pack('U*', 0x1D165, 0x21)); + +ok("\000!" lt "\x{FFFF}!"); +ok(pack('U*', 0, 0x21) lt pack('U*', 0xFFFF, 0x21)); +