Add the fruits of Larry Shatzer's version verifying script.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode / Tcl.t
CommitLineData
71a18b0f 1BEGIN {
2 chdir 't' if -d 't';
96d6357c 3# @INC = '../lib';
71a18b0f 4 require Config; import Config;
5 if ($Config{'extensions'} !~ /\bEncode\b/) {
6 print "1..0 # Skip: Encode was not built\n";
7 exit 0;
8 }
9}
10use Test;
11use Encode qw(encode decode);
12use Encode::Tcl;
13
14my @encodings = qw(euc-cn euc-jp euc-kr big5 shiftjis); # CJK
15my $n = 2;
16
17my %greek = (
18 'euc-cn' => [0xA6A1..0xA6B8,0xA6C1..0xA6D8],
19 'euc-jp' => [0xA6A1..0xA6B8,0xA6C1..0xA6D8],
20 'euc-kr' => [0xA5C1..0xA5D8,0xA5E1..0xA5F8],
21 'big5' => [0xA344..0xA35B,0xA35C..0xA373],
22 'shiftjis' => [0x839F..0x83B6,0x83BF..0x83D6],
23 'utf8' => [0x0391..0x03A1,0x03A3..0x03A9,0x03B1..0x03C1,0x03C3..0x03C9],
24);
25my @greek = qw(
26 ALPHA BETA GAMMA DELTA EPSILON ZETA ETA
27 THETA IOTA KAPPA LAMBDA MU NU XI OMICRON
28 PI RHO SIGMA TAU UPSILON PHI CHI PSI OMEGA
29 alpha beta gamma delta epsilon zeta eta
30 theta iota kappa lambda mu nu xi omicron
31 pi rho sigma tau upsilon phi chi psi omega
32);
33
34my %ideodigit = ( # cjk ideograph 'one' to 'ten'
35 'euc-cn' => [qw(d2bb b6fe c8fd cbc4 cee5 c1f9 c6df b0cb bec5 caae)],
36 'euc-jp' => [qw(b0ec c6f3 bbb0 bbcd b8de cfbb bcb7 c8ac b6e5 bdbd)],
37 'euc-kr' => [qw(ece9 eca3 dfb2 decc e7e9 d7bf f6d2 f8a2 cefa e4a8)],
38 'big5' => [qw(a440 a447 a454 a57c a4ad a4bb a443 a44b a445 a451)],
39 'shiftjis' => [qw(88ea 93f1 8e4f 8e6c 8cdc 985a 8eb5 94aa 8be3 8f5c)],
40 'utf8' => [qw(4e00 4e8c 4e09 56db 4e94 516d 4e03 516b 4e5d 5341)],
41);
42my @ideodigit = qw(one two three four five six seven eight nine ten);
43
33f2f539 44my $jis = '7bit-jis';
62756987 45my $kr = '2022-kr';
33f2f539 46my %esc_str;
47
48$esc_str{$jis} = {qw(
49 1b24422422242424262428242a1b2842
50 3042304430463048304a
51 1b284931323334355d1b2842
52 ff71ff72ff73ff74ff75ff9d
53 1b2442467c4b5c1b2842
54 65e5672c
55 3132331b244234413b7a1b28425065726c
56 0031003200336f225b57005000650072006c
57 546573740a1b24422546253925481b28420a
58 0054006500730074000a30c630b930c8000a
59)};
60
61$esc_str{$kr} = {qw(
62 1b2429430e2a22213e0f410d0a
63 304200b10041000d000a
64 1b2429430e3021332a34593673383639593b673e46405a0f0d0a
65 ac00b098b2e4b77cb9c8bc14c0acc544c790000d000a
66 1b2429434142430d0a
67 004100420043000d000a
68)};
69
70my $num_esc = $n * keys(%esc_str);
71foreach (values %esc_str){ $num_esc += $n * keys %$_ }
72
c72d1b38 73my $FS_preserves_case = 1; # Unix e.g.
74if ($^O eq 'VMS') { # || $^O eq ...
75 $FS_preserves_case = 0;
76}
466d6cd3 77my $hz = 'HZ'; # HanZi
c72d1b38 78if (!$FS_preserves_case) {
79 $hz = 'hz'; # HanZi
80}
466d6cd3 81
82my @hz_txt = (
83 "~~in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.~~",
84 "~~in GB.~{<:Ky2;S{#,~}~\cJ~{NpJ)l6HK!#~}Bye.~~",
85 "~~in GB.~\cJ~{<:Ky2;S{#,NpJ)l6HK!#~}~\cJBye.~~",
86);
87
88my $hz_exp = '007e0069006e002000470042002e5df162404e0d6b32'
89 . 'ff0c52ff65bd65bc4eba3002004200790065002e007e';
90
96d6357c 91use constant BUFSIZ => 64; # for test
92use constant hiragana => "\x{3042}\x{3044}\x{3046}\x{3048}\x{304A}";
93use constant han_kana => "\x{FF71}\x{FF72}\x{FF73}\x{FF74}\x{FF75}";
94use constant macron => "\x{0100}\x{0112}\x{012a}\x{014c}\x{016a}";
95use constant TAIL => 'bbb';
96use constant YES => 1;
97
98my @ary_buff = ( # [ encoding, decoded, encoded ]
99# type-M
100 ["euc-cn", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ],
101 ["euc-jp", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ],
102 ["euc-jp", han_kana, "\x8E\xB1\x8E\xB2\x8E\xB3\x8E\xB4\x8E\xB5" ],
103 ["euc-kr", hiragana, "\xAA\xA2\xAA\xA4\xAA\xA6\xAA\xA8\xAA\xAA" ],
104 ["shiftjis", hiragana, "\x82\xA0\x82\xA2\x82\xA4\x82\xA6\x82\xA8" ],
105 ["shiftjis", han_kana, "\xB1\xB2\xB3\xB4\xB5" ],
106# type-E
107 ["2022-cn", hiragana, "\e\$)A\cN". '$"$$$&$($*' . "\cO" ],
108 ["2022-jp", hiragana, "\e\$B".'$"$$$&$($*'."\e(B" ],
109 ["2022-kr", hiragana, "\e\$)C\cN". '*"*$*&*(**' . "\cO" ],
110 [ $jis, han_kana, "\e\(I".'12345'."\e(B" ],
111 ["2022-jp1", macron, "\e\$(D\x2A\x27\x2A\x37\x2A\x45\x2A\x57\x2A\x69\e(B"],
112 ["2022-jp2", "\x{C0}" . macron . "\x{C1}",
113 "\e\$(D\e.A\eN\x40\x2A\x27\x2A\x37\x2A\x45\x2A\x57\x2A\x69\e(B\eN\x41"],
114# type-X
115 ["euc-jp-0212", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ],
116 ["euc-jp-0212", han_kana, "\x8E\xB1\x8E\xB2\x8E\xB3\x8E\xB4\x8E\xB5" ],
117 ["euc-jp-0212", macron,
118 "\x8F\xAA\xA7\x8F\xAA\xB7\x8F\xAA\xC5\x8F\xAA\xD7\x8F\xAA\xE9" ],
119# type-H
120 [ $hz, hiragana, "~{". '$"$$$&$($*' . "~}" ],
121 [ $hz, hiragana, "~{". '$"$$' ."~\cJ". '$&$($*' . "~}" ],
122);
123
33f2f539 124plan test => $n*@encodings + $n*@encodings*@greek
96d6357c 125 + $n*@encodings*@ideodigit + $num_esc + $n + @hz_txt + @ary_buff;
71a18b0f 126
127foreach my $enc (@encodings)
128 {
129 my $tab = Encode->getEncoding($enc);
130 ok(1,defined($tab),"Could not load $enc");
131 my $str = join('',map(chr($_),0x20..0x7E));
132 my $uni = $tab->decode($str);
133 my $cpy = $tab->encode($uni);
134 ok($cpy,$str,"$enc mangled translating to Unicode and back");
135 }
136
137foreach my $enc (@encodings)
138 {
139 my $tab = Encode->getEncoding($enc);
140 foreach my $gk (0..$#greek)
141 {
142 my $uni = unpack 'U', $tab->decode(pack 'n', $greek{$enc}[$gk]);
143 ok($uni,$greek{'utf8'}[$gk],
144 "$enc mangled translating to Unicode GREEK $greek[$gk]");
145 my $cpy = unpack 'n',$tab->encode(pack 'U',$uni);
146 ok($cpy,$greek{$enc}[$gk],
147 "$enc mangled translating from Unicode GREEK $greek[$gk]");
148 }
149 }
150
151foreach my $enc (@encodings)
152 {
153 my $tab = Encode->getEncoding($enc);
154 foreach my $id (0..$#ideodigit)
155 {
156 my $uni = unpack 'U',$tab->decode(pack 'H*', $ideodigit{$enc}[$id]);
157 ok($uni,hex($ideodigit{'utf8'}[$id]),
158 "$enc mangled translating to Unicode CJK IDEOGRAPH $ideodigit[$id]");
159 my $cpy = lc unpack 'H*', $tab->encode(pack 'U',$uni);
160 ok($cpy,$ideodigit{$enc}[$id],
161 "$enc mangled translating from Unicode CJK IDEOGRAPH $ideodigit[$id]");
162 }
163 }
164
33f2f539 165{
166 sub to_unicode
167 {
168 my $enc = shift;
169 return unpack('H*', pack 'n*', unpack 'U*',
170 decode $enc, pack 'H*', join '', @_);
171 }
172
173 sub from_unicode
174 {
175 my $enc = shift;
176 return unpack('H*', encode $enc,
177 pack 'U*', unpack 'n*', pack 'H*', join '', @_);
178 }
179
180 foreach my $enc (sort keys %esc_str)
181 {
182 my $tab = Encode->getEncoding($enc);
183 ok(1,defined($tab),"Could not load $enc");
184 my %strings = %{ $esc_str{$enc} };
185 foreach my $estr (sort keys %strings)
186 {
187 my $ustr = to_unicode($enc, $estr);
188 ok($ustr, $strings{$estr},
189 "$enc mangled translating to Unicode");
190 ok(from_unicode($enc, $ustr), $estr,
191 "$enc mangled translating from Unicode");
192 }
193 ok(to_unicode($enc, keys %strings), join('', values %strings),
194 "$enc mangled translating to Unicode");
195 }
196}
466d6cd3 197
198
199{
200 my $hz_to_unicode = sub
201 {
202 return unpack('H*', pack 'n*', unpack 'U*', decode $hz, shift);
203 };
204
205 my $hz_from_unicode = sub
206 {
207 return encode($hz, pack 'U*', unpack 'n*', pack 'H*', shift);
208 };
209
210 foreach my $enc ($hz)
211 {
212 my $tab = Encode->getEncoding($enc);
213 ok(1,defined($tab),"Could not load $enc");
214
215 ok(&$hz_from_unicode($hz_exp), $hz_txt[0],
216 "$enc mangled translating from Unicode");
217
218 foreach my $str (@hz_txt)
219 {
220 ok(&$hz_to_unicode($str), $hz_exp,
221 "$enc mangled translating to Unicode");
222 }
223 }
224}
96d6357c 225
226for my $ary (@ary_buff) {
227 my $NG = 0;
228 my $enc = $ary->[0];
229 for my $n ( int(BUFSIZ/2) .. 2*BUFSIZ+4 ){
230 my $dst = "a"x$n. $ary->[1] . TAIL;
231 my $src = "a"x$n. $ary->[2] . TAIL;
232 my $utf = buff_decode($enc, $src);
233 $NG++ unless $dst eq $utf;
234 }
235 ok($NG, 0, "$enc mangled translating to Unicode");
236}
237
238sub buff_decode {
239 my($enc, $str) = @_;
240 my $utf8 = '';
241 my $inconv = '';
242 while(length $str){
243 my $buff = $inconv.substr($str,0,BUFSIZ - length $inconv,'');
244 my $decoded = decode($enc, $buff, YES);
245 if(length $decoded){
246 $utf8 .= $decoded;
247 $inconv = $buff;
248 } else {
249 last; # malformed?
250 }
251 }
252 return $utf8;
253}
254