Integrate:

[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate / t / test.t
diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t

index d6d7288..777e9fb 100644 (file)
--- a/lib/Unicode/Collate/t/test.t
+++ b/lib/Unicode/Collate/t/test.t
@@ -1,7 +1,3 @@
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-#########################
 
 BEGIN {
     if (ord("A") == 193) {
@@ -10,14 +6,24 @@ BEGIN {
     }
 }
 
+BEGIN {
+    if ($ENV{PERL_CORE}) {
+        chdir('t') if -d 't';
+        @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+    }
+}
+
 use Test;
-BEGIN { plan tests => 160 };
+BEGIN { plan tests => 194 };
 use Unicode::Collate;
-ok(1); # If we made it this far, we're ok.
+
+our $IsEBCDIC = ord("A") != 0x41;
 
 #########################
 
-my $UCA_Version = "8.0";
+ok(1); # If we made it this far, we're ok.
+
+my $UCA_Version = "9";
 
 ok(Unicode::Collate::UCA_Version, $UCA_Version);
 ok(Unicode::Collate->UCA_Version, $UCA_Version);
@@ -41,36 +47,51 @@ ok(
   ),
 );
 
-my $A_acute = pack('U', 0x00C1);
-my $acute   = pack('U', 0x0301);
-
-ok($Collator->cmp("A$acute", $A_acute), -1);
 ok($Collator->cmp("", ""), 0);
-ok(! $Collator->ne("", "") );
-ok(  $Collator->eq("", "") );
+ok($Collator->eq("", ""));
 ok($Collator->cmp("", "perl"), -1);
 
 ##############
 
+# Use pack('U'), not chr(), for Perl 5.6.1.
+my $A_acute = pack('U', $IsEBCDIC ? 0x65 : 0xC1);
+my $a_acute = pack('U', $IsEBCDIC ? 0x45 : 0xE1);
+my $acute   = pack('U', 0x0301);
+
+ok($Collator->cmp("A$acute", $A_acute), 0); # @version 3.1.1 (prev: -1)
+ok($Collator->cmp($a_acute, $A_acute), -1);
+ok($Collator->eq("A\cA$acute", $A_acute)); # UCA v9. \cA is invariant.
+
+my %old_level = $Collator->change(level => 1);
+ok($Collator->eq("A$acute", $A_acute));
+ok($Collator->eq("A", $A_acute));
+
+ok($Collator->change(level => 2)->eq($a_acute, $A_acute));
+ok($Collator->lt("A", $A_acute));
+
+ok($Collator->change(%old_level)->lt("A", $A_acute));
+ok($Collator->lt("A", $A_acute));
+ok($Collator->lt("A", $a_acute));
+ok($Collator->lt($a_acute, $A_acute));
+
+##############
+
 eval { require Unicode::Normalize };
 
-if (!$@) {
+if (!$@ && !$IsEBCDIC) {
   my $NFD = Unicode::Collate->new(
-    table => 'keys.txt',
+    table => undef,
     entry => <<'ENTRIES',
-0430 ; [.0B01.0020.0002.0430] # CYRILLIC SMALL LETTER A
-0410 ; [.0B01.0020.0008.0410] # CYRILLIC CAPITAL LETTER A
-04D3 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS
-0430 0308 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS
-04D3 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS
-0430 0308 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS
-04D2 ; [.0B09.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
-0410 0308 ; [.0B09.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
-0430 3099 ; [.0B10.0020.0002.04D3] # A WITH KATAKANA VOICED
-0430 3099 0308 ; [.0B11.0020.0002.04D3] # A WITH KATAKANA VOICED, DIAERESIS
+0430  ; [.0CB5.0020.0002.0430] # CYRILLIC SMALL LETTER A
+0410  ; [.0CB5.0020.0008.0410] # CYRILLIC CAPITAL LETTER A
+04D3  ; [.0CBD.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS
+0430 0308 ; [.0CBD.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS
+04D2  ; [.0CBD.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
+0410 0308 ; [.0CBD.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
+0430 3099 ; [.0CBE.0020.0002.04D3] # A WITH KATAKANA VOICED
+0430 3099 0308 ; [.0CBF.0020.0002.04D3] # A WITH KATAKANA VOICED, DIAERESIS
 ENTRIES
   );
-  ok($NFD->eq("A$acute", $A_acute));
   ok($NFD->eq("\x{4D3}\x{325}", "\x{430}\x{308}\x{325}"));
   ok($NFD->lt("\x{430}\x{308}A", "\x{430}\x{308}B"));
   ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A"));
@@ -82,7 +103,6 @@ else {
   ok(1);
   ok(1);
   ok(1);
-  ok(1);
 }
 
 ##############
@@ -93,11 +113,13 @@ my $trad = Unicode::Collate->new(
   ignoreName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/,
   level => 4,
   entry => << 'ENTRIES',
- 0063 0068 ; [.0893.0020.0002.0063] % "ch" in traditional Spanish
- 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
- 00DF ; [.09F3.0154.0004.00DF] [.09F3.0020.0004.00DF] # eszet in Germany
+ 0063 0068 ; [.0A3F.0020.0002.0063] % "ch" in traditional Spanish
+ 0043 0068 ; [.0A3F.0020.0008.0043] # "Ch" in traditional Spanish
 ENTRIES
 );
+# 0063  ; [.0A3D.0020.0002.0063] # LATIN SMALL LETTER C
+# 0064  ; [.0A49.0020.0002.0064] # LATIN SMALL LETTER D
+# Deutsch sz is included in 'keys.txt';
 
 ok(
   join(':', $trad->sort( qw/ acha aca ada acia acka / ) ),
@@ -108,6 +130,8 @@ ok(
   join(':', $Collator->sort( qw/ acha aca ada acia acka / ) ),
   join(':',                  qw/ aca acha acia acka ada / ),
 );
+ok($trad->eq("ocho", "oc\cAho")); # UCA v9
+ok($trad->eq("ocho", "oc\0\cA\0\cBho")); # UCA v9
 
 my $hiragana = "\x{3042}\x{3044}";
 my $katakana = "\x{30A2}\x{30A4}";
@@ -120,9 +144,9 @@ ok($trad->eq($katakana, $hiragana));
 
 ##############
 
-my $old_level = $Collator->{level};
+$Collator->change(level => 2);
 
-$Collator->{level} = 2;
+ok($Collator->{level}, 2);
 
 ok( $Collator->cmp("ABC","abc"), 0);
 ok( $Collator->eq("ABC","abc") );
@@ -139,9 +163,9 @@ ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") );
 ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") );
 ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana
 
-$Collator->{level} = $old_level;
+$Collator->change(%old_level, katakana_before_hiragana => 1);
 
-$Collator->{katakana_before_hiragana} = 1;
+ok($Collator->{level}, 4);
 
 ok( $Collator->cmp("abc", "ABC"), -1);
 ok( $Collator->ne("abc", "ABC") );
@@ -152,7 +176,7 @@ ok( $Collator->ne($hiragana, $katakana) );
 ok( $Collator->gt($hiragana, $katakana) );
 ok( $Collator->ge($hiragana, $katakana) );
 
-$Collator->{upper_before_lower} = 1;
+$Collator->change(upper_before_lower => 1);
 
 ok( $Collator->cmp("abc", "ABC"), 1);
 ok( $Collator->ge("abc", "ABC"), 1);
@@ -161,12 +185,12 @@ ok( $Collator->cmp($hiragana, $katakana), 1);
 ok( $Collator->ge($hiragana, $katakana), 1);
 ok( $Collator->gt($hiragana, $katakana), 1);
 
-$Collator->{katakana_before_hiragana} = 0;
+$Collator->change(katakana_before_hiragana => 0);
 
 ok( $Collator->cmp("abc", "ABC"), 1);
 ok( $Collator->cmp($hiragana, $katakana), -1);
 
-$Collator->{upper_before_lower} = 0;
+$Collator->change(upper_before_lower => 0);
 
 ok( $Collator->cmp("abc", "ABC"), -1);
 ok( $Collator->le("abc", "ABC") );
@@ -188,6 +212,7 @@ ok($ignoreAE->eq("Perl","ePrl"));
 
 my $onlyABC = Unicode::Collate->new(
     table => undef,
+    normalization => undef,
     entry => << 'ENTRIES',
 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
@@ -218,92 +243,6 @@ ok($Collator->lt("lake","like"));
 
 ##############
 
-$Collator->{level} = 2;
-
-my $str;
-
-my $orig = "This is a Perl book.";
-my $sub = "PERL";
-my $rep = "camel";
-my $ret = "This is a camel book.";
-
-$str = $orig;
-if (my($pos,$len) = $Collator->index($str, $sub)) {
-  substr($str, $pos, $len, $rep);
-}
-
-ok($str, $ret);
-
-$Collator->{level} = $old_level;
-
-$str = $orig;
-if (my($pos,$len) = $Collator->index($str, $sub)) {
-  substr($str, $pos, $len, $rep);
-}
-
-ok($str, $orig);
-
-##############
-
-my $match;
-
-$Collator->{level} = 1;
-
-$str = "Pe\x{300}rl";
-$sub = "pe";
-$match = undef;
-if (my($pos, $len) = $Collator->index($str, $sub)) {
-    $match = substr($str, $pos, $len);
-}
-ok($match, "Pe\x{300}");
-
-$str = "P\x{300}e\x{300}\x{301}\x{303}rl";
-$sub = "pE";
-$match = undef;
-if (my($pos, $len) = $Collator->index($str, $sub)) {
-    $match = substr($str, $pos, $len);
-}
-ok($match, "P\x{300}e\x{300}\x{301}\x{303}");
-
-$Collator->{level} = $old_level;
-
-##############
-
-$trad->{level} = 1;
-
-$str = "Ich mu\x{00DF} studieren.";
-$sub = "m\x{00FC}ss";
-$match = undef;
-if (my($pos, $len) = $trad->index($str, $sub)) {
-    $match = substr($str, $pos, $len);
-}
-ok($match, "mu\x{00DF}");
-
-$trad->{level} = $old_level;
-
-$str = "Ich mu\x{00DF} studieren.";
-$sub = "m\x{00FC}ss";
-$match = undef;
-
-if (my($pos, $len) = $trad->index($str, $sub)) {
-    $match = substr($str, $pos, $len);
-}
-ok($match, undef);
-
-$match = undef;
-if (my($pos,$len) = $Collator->index("", "")) {
-    $match = substr("", $pos, $len);
-}
-ok($match, "");
-
-$match = undef;
-if (my($pos,$len) = $Collator->index("", "abc")) {
-    $match = substr("", $pos, $len);
-}
-ok($match, undef);
-
-##############
-
 # Table is undefined, then no entry is defined.
 
 my $undef_table = Unicode::Collate->new(
@@ -369,21 +308,41 @@ ok($few_entries->eq("\x{AC00}", "\x{1100}\x{1161}"));
 
 ##############
 
-my $all_undef = Unicode::Collate->new(
+my $all_undef_8 = Unicode::Collate->new(
   table => undef,
   normalization => undef,
   overrideCJK => undef,
   overrideHangul => undef,
+  UCA_Version => 8,
 );
 
 # All in the Unicode code point order.
 # No hangul decomposition.
 
-ok($all_undef->lt("\x{3042}", "\x{4E00}"));
-ok($all_undef->lt("\x{4DFF}", "\x{4E00}"));
-ok($all_undef->lt("\x{4E00}", "\x{AC00}"));
-ok($all_undef->gt("\x{AC00}", "\x{1100}\x{1161}"));
-ok($all_undef->gt("\x{AC00}", "\x{ABFF}"));
+ok($all_undef_8->lt("\x{3402}", "\x{4E00}"));
+ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}"));
+ok($all_undef_8->lt("\x{4E00}", "\x{AC00}"));
+ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}"));
+ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}"));
+
+##############
+
+my $all_undef_9 = Unicode::Collate->new(
+  table => undef,
+  normalization => undef,
+  overrideCJK => undef,
+  overrideHangul => undef,
+  UCA_Version => 9,
+);
+
+# CJK Ideo. < CJK ext A/B < Others.
+# No hangul decomposition.
+
+ok($all_undef_9->lt("\x{4E00}", "\x{3402}"));
+ok($all_undef_9->lt("\x{3402}", "\x{20000}"));
+ok($all_undef_9->lt("\x{20000}", "\x{AC00}"));
+ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}"));
+ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}")); # U+ABFF: not assigned
 
 ##############
 
@@ -425,59 +384,41 @@ ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned.
 
 ##############
 
-my $blanked = Unicode::Collate->new(
-  table => 'keys.txt',
-  normalization => undef,
-  alternate => 'Blanked',
-);
-
-ok($blanked->lt("death", "de luge"));
-ok($blanked->lt("de luge", "de-luge"));
-ok($blanked->lt("de-luge", "deluge"));
-ok($blanked->lt("deluge", "de\x{2010}luge"));
-ok($blanked->lt("deluge", "de Luge"));
+my %origAlter = $Collator->change(alternate => 'Blanked');
 
-##############
+ok($Collator->lt("death", "de luge"));
+ok($Collator->lt("de luge", "de-luge"));
+ok($Collator->lt("de-luge", "deluge"));
+ok($Collator->lt("deluge", "de\x{2010}luge"));
+ok($Collator->lt("deluge", "de Luge"));
 
-my $nonIgn = Unicode::Collate->new(
-  table => 'keys.txt',
-  normalization => undef,
-  alternate => 'Non-ignorable',
-);
+$Collator->change(alternate => 'Non-ignorable');
 
-ok($nonIgn->lt("de luge", "de Luge"));
-ok($nonIgn->lt("de Luge", "de-luge"));
-ok($nonIgn->lt("de-Luge", "de\x{2010}luge"));
-ok($nonIgn->lt("de-luge", "death"));
-ok($nonIgn->lt("death", "deluge"));
+ok($Collator->lt("de luge", "de Luge"));
+ok($Collator->lt("de Luge", "de-luge"));
+ok($Collator->lt("de-Luge", "de\x{2010}luge"));
+ok($Collator->lt("de-luge", "death"));
+ok($Collator->lt("death", "deluge"));
 
-##############
+$Collator->change(alternate => 'Shifted');
 
-my $shifted = Unicode::Collate->new(
-  table => 'keys.txt',
-  normalization => undef,
-  alternate => 'Shifted',
-);
+ok($Collator->lt("death", "de luge"));
+ok($Collator->lt("de luge", "de-luge"));
+ok($Collator->lt("de-luge", "deluge"));
+ok($Collator->lt("deluge", "de Luge"));
+ok($Collator->lt("de Luge", "deLuge"));
 
-ok($shifted->lt("death", "de luge"));
-ok($shifted->lt("de luge", "de-luge"));
-ok($shifted->lt("de-luge", "deluge"));
-ok($shifted->lt("deluge", "de Luge"));
-ok($shifted->lt("de Luge", "deLuge"));
+$Collator->change(alternate => 'Shift-Trimmed');
 
-##############
+ok($Collator->lt("death", "deluge"));
+ok($Collator->lt("deluge", "de luge"));
+ok($Collator->lt("de luge", "de-luge"));
+ok($Collator->lt("de-luge", "deLuge"));
+ok($Collator->lt("deLuge", "de Luge"));
 
-my $shTrim = Unicode::Collate->new(
-  table => 'keys.txt',
-  normalization => undef,
-  alternate => 'Shift-Trimmed',
-);
+$Collator->change(%origAlter);
 
-ok($shTrim->lt("death", "deluge"));
-ok($shTrim->lt("deluge", "de luge"));
-ok($shTrim->lt("de luge", "de-luge"));
-ok($shTrim->lt("de-luge", "deLuge"));
-ok($shTrim->lt("deLuge", "de Luge"));
+ok($Collator->{alternate}, 'shifted');
 
 ##############
 
@@ -503,19 +444,30 @@ ok($overCJK->lt("a\x{4E03}", "A\x{4E00}"));
 
 ##############
 
-# rearranged : 0x0E40..0x0E44, 0x0EC0..0x0EC4
+# rearrange : 0x0E40..0x0E44, 0x0EC0..0x0EC4 (default)
 
-ok($Collator->lt("A", "B"));
+my %old_rearrange = $Collator->change(rearrange => undef);
+
+ok($Collator->gt("\x{0E41}A", "\x{0E40}B"));
+ok($Collator->gt("A\x{0E41}A", "A\x{0E40}B"));
+
+$Collator->change(rearrange => [ 0x61 ]);
+ # U+0061, 'a': This is a Unicode value, never a native value.
+
+ok($Collator->gt("ab", "AB")); # as 'ba' > 'AB'
+
+$Collator->change(%old_rearrange);
+
+ok($Collator->lt("ab", "AB"));
 ok($Collator->lt("\x{0E40}", "\x{0E41}"));
 ok($Collator->lt("\x{0E40}A", "\x{0E41}B"));
 ok($Collator->lt("\x{0E41}A", "\x{0E40}B"));
 ok($Collator->lt("A\x{0E41}A", "A\x{0E40}B"));
 
-ok($all_undef->lt("A", "B"));
-ok($all_undef->lt("\x{0E40}", "\x{0E41}"));
-ok($all_undef->lt("\x{0E40}A", "\x{0E41}B"));
-ok($all_undef->lt("\x{0E41}A", "\x{0E40}B"));
-ok($all_undef->lt("A\x{0E41}A", "A\x{0E40}B"));
+ok($all_undef_8->lt("\x{0E40}", "\x{0E41}"));
+ok($all_undef_8->lt("\x{0E40}A", "\x{0E41}B"));
+ok($all_undef_8->lt("\x{0E41}A", "\x{0E40}B"));
+ok($all_undef_8->lt("A\x{0E41}A", "A\x{0E40}B"));
 
 ##############
 
@@ -533,8 +485,6 @@ ok($no_rearrange->gt("A\x{0E41}A", "A\x{0E40}B"));
 
 ##############
 
-# equivalent to $no_rearrange
-
 my $undef_rearrange = Unicode::Collate->new(
   table => undef,
   normalization => undef,
@@ -592,7 +542,6 @@ ok($backLevel2->gt("ca\x{300}ca\x{302}", "Ca\x{302}ca\x{300}"));
 ok($Collator  ->lt("Ca\x{300}ca\x{302}", "ca\x{302}ca\x{300}"));
 ok($Collator  ->lt("ca\x{300}ca\x{302}", "Ca\x{302}ca\x{300}"));
 
-
 # HIRAGANA and KATAKANA are made undefined via undefName.
 # So they are after CJK Unified Ideographs.
 
@@ -602,3 +551,65 @@ ok($Collator  ->gt("\x{4E00}", $hiragana));
 ok($Collator  ->gt("\x{4E03}", $katakana));
 
 ##############
+
+# ignorable after variable
+
+# Shifted;
+ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!"));
+ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute"));
+ok($Collator->eq("?\x{300}", "?"));
+ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs.
+
+$Collator->change(level => 3);
+ok($Collator->eq("\cA", "?"));
+
+$Collator->change(alternate => 'blanked', level => 4);
+ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!"));
+ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute"));
+ok($Collator->eq("?\x{300}", "?"));
+ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs.
+
+$Collator->change(level => 3);
+ok($Collator->eq("\cA", "?"));
+
+$Collator->change(alternate => 'Non-ignorable', level => 4);
+
+ok($Collator->lt("?\x{300}", "?!"));
+ok($Collator->gt("?\x{300}A$acute", "?$A_acute"));
+ok($Collator->gt("?\x{300}", "?"));
+ok($Collator->gt("?\x{344}", "?"));
+
+$Collator->change(level => 3);
+ok($Collator->lt("\cA", "?"));
+
+$Collator->change(alternate => 'Shifted', level => 4);
+
+##############
+
+# According to Conformance Test,
+# a L3-ignorable is treated as a completely ignorable.
+
+my $L3ignorable = Unicode::Collate->new(
+  alternate => 'Non-ignorable',
+  table => undef,
+  normalization => undef,
+  entry => <<'ENTRIES',
+0000  ; [.0000.0000.0000.0000] # [0000] NULL (in 6429)
+0001  ; [.0000.0000.0000.0000] # [0001] START OF HEADING (in 6429)
+0591  ; [.0000.0000.0000.0591] # HEBREW ACCENT ETNAHTA
+1D165 ; [.0000.0000.0000.1D165] # MUSICAL SYMBOL COMBINING STEM
+0021  ; [*024B.0020.0002.0021] # EXCLAMATION MARK
+09BE  ; [.114E.0020.0002.09BE] # BENGALI VOWEL SIGN AA
+09C7  ; [.1157.0020.0002.09C7] # BENGALI VOWEL SIGN E
+09CB  ; [.1159.0020.0002.09CB] # BENGALI VOWEL SIGN O
+09C7 09BE ; [.1159.0020.0002.09CB] # BENGALI VOWEL SIGN O
+ENTRIES
+);
+
+ok($L3ignorable->lt("\cA", "!"));
+ok($L3ignorable->lt("\x{591}", "!"));
+ok($L3ignorable->eq("\cA", "\x{591}"));
+ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\cA\x{09BE}A"));
+ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\x{0591}\x{09BE}A"));
+ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\x{1D165}\x{09BE}A"));
+ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09CB}A"));