Commit | Line | Data |
51ef4e11 |
1 | package Encode::Tcl; |
2 | use strict; |
3 | use Encode qw(find_encoding); |
4 | use base 'Encode::Encoding'; |
5 | use Carp; |
6 | |
fc6a272d |
7 | =head1 NAME |
8 | |
9 | Encode::Tcl - Tcl encodings |
10 | |
11 | =cut |
51ef4e11 |
12 | |
13 | sub INC_search |
14 | { |
15 | foreach my $dir (@INC) |
16 | { |
17 | if (opendir(my $dh,"$dir/Encode")) |
18 | { |
19 | while (defined(my $name = readdir($dh))) |
20 | { |
21 | if ($name =~ /^(.*)\.enc$/) |
22 | { |
23 | my $canon = $1; |
24 | my $obj = find_encoding($canon); |
25 | if (!defined($obj)) |
26 | { |
27 | my $obj = bless { Name => $canon, File => "$dir/Encode/$name"},__PACKAGE__; |
28 | $obj->Define( $canon ); |
29 | # warn "$canon => $obj\n"; |
30 | } |
31 | } |
32 | } |
33 | closedir($dh); |
34 | } |
35 | } |
36 | } |
37 | |
38 | sub import |
39 | { |
40 | INC_search(); |
41 | } |
42 | |
43 | sub encode |
44 | { |
45 | my $obj = shift; |
46 | my $new = $obj->loadEncoding; |
47 | return undef unless (defined $new); |
48 | return $new->encode(@_); |
49 | } |
50 | |
51 | sub new_sequence |
52 | { |
53 | my $obj = shift; |
54 | my $new = $obj->loadEncoding; |
55 | return undef unless (defined $new); |
56 | return $new->new_sequence(@_); |
57 | } |
58 | |
59 | sub decode |
60 | { |
61 | my $obj = shift; |
62 | my $new = $obj->loadEncoding; |
63 | return undef unless (defined $new); |
64 | return $new->decode(@_); |
65 | } |
66 | |
67 | sub loadEncoding |
68 | { |
69 | my $obj = shift; |
70 | my $file = $obj->{'File'}; |
71 | my $name = $obj->name; |
72 | if (open(my $fh,$file)) |
73 | { |
74 | my $type; |
75 | while (1) |
76 | { |
77 | my $line = <$fh>; |
78 | $type = substr($line,0,1); |
79 | last unless $type eq '#'; |
80 | } |
466d6cd3 |
81 | my $class = ref($obj).('::'.(($type eq 'H') ? 'HanZi' : ($type eq 'E') ? 'Escape' : 'Table')); |
71a18b0f |
82 | # carp "Loading $file"; |
51ef4e11 |
83 | bless $obj,$class; |
84 | return $obj if $obj->read($fh,$obj->name,$type); |
85 | } |
86 | else |
87 | { |
88 | croak("Cannot open $file for ".$obj->name); |
89 | } |
90 | $obj->Undefine($name); |
91 | return undef; |
92 | } |
93 | |
94 | sub INC_find |
95 | { |
96 | my ($class,$name) = @_; |
97 | my $enc; |
98 | foreach my $dir (@INC) |
99 | { |
100 | last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc")); |
101 | } |
102 | return $enc; |
103 | } |
104 | |
105 | package Encode::Tcl::Table; |
106 | use base 'Encode::Encoding'; |
107 | |
108 | use Data::Dumper; |
109 | |
110 | sub read |
111 | { |
112 | my ($obj,$fh,$name,$type) = @_; |
f57a1a59 |
113 | my($rep, @leading); |
51ef4e11 |
114 | my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); |
115 | my @touni; |
116 | my %fmuni; |
117 | my $count = 0; |
118 | $def = hex($def); |
119 | while ($pages--) |
120 | { |
121 | my $line = <$fh>; |
122 | chomp($line); |
123 | my $page = hex($line); |
124 | my @page; |
f57a1a59 |
125 | $leading[$page] = 1 if $page; |
51ef4e11 |
126 | my $ch = $page * 256; |
127 | for (my $i = 0; $i < 16; $i++) |
128 | { |
129 | my $line = <$fh>; |
130 | for (my $j = 0; $j < 16; $j++) |
131 | { |
132 | my $val = hex(substr($line,0,4,'')); |
133 | if ($val || !$ch) |
134 | { |
f57a1a59 |
135 | my $uch = pack('U', $val); # chr($val); |
51ef4e11 |
136 | push(@page,$uch); |
137 | $fmuni{$uch} = $ch; |
138 | $count++; |
139 | } |
140 | else |
141 | { |
142 | push(@page,undef); |
143 | } |
144 | $ch++; |
145 | } |
146 | } |
147 | $touni[$page] = \@page; |
148 | } |
f57a1a59 |
149 | $rep = $type ne 'M' ? $obj->can("rep_$type") : |
150 | sub { ($_[0] > 255) || $leading[$_[0]] ? 'n' : 'C'}; |
51ef4e11 |
151 | $obj->{'Rep'} = $rep; |
152 | $obj->{'ToUni'} = \@touni; |
153 | $obj->{'FmUni'} = \%fmuni; |
154 | $obj->{'Def'} = $def; |
155 | $obj->{'Num'} = $count; |
156 | return $obj; |
157 | } |
158 | |
159 | sub rep_S { 'C' } |
160 | |
161 | sub rep_D { 'n' } |
162 | |
f57a1a59 |
163 | #sub rep_M { ($_[0] > 255) ? 'n' : 'C' } |
51ef4e11 |
164 | |
165 | sub representation |
166 | { |
167 | my ($obj,$ch) = @_; |
168 | $ch = 0 unless @_ > 1; |
f57a1a59 |
169 | $obj->{'Rep'}->($ch); |
51ef4e11 |
170 | } |
171 | |
172 | sub decode |
173 | { |
174 | my ($obj,$str,$chk) = @_; |
175 | my $rep = $obj->{'Rep'}; |
176 | my $touni = $obj->{'ToUni'}; |
e91cad5b |
177 | my $uni; |
51ef4e11 |
178 | while (length($str)) |
179 | { |
180 | my $ch = ord(substr($str,0,1,'')); |
181 | my $x; |
182 | if (&$rep($ch) eq 'C') |
183 | { |
184 | $x = $touni->[0][$ch]; |
185 | } |
186 | else |
187 | { |
188 | $x = $touni->[$ch][ord(substr($str,0,1,''))]; |
189 | } |
190 | unless (defined $x) |
191 | { |
192 | last if $chk; |
193 | # What do we do here ? |
194 | $x = ''; |
195 | } |
196 | $uni .= $x; |
197 | } |
198 | $_[1] = $str if $chk; |
199 | return $uni; |
200 | } |
201 | |
202 | |
203 | sub encode |
204 | { |
205 | my ($obj,$uni,$chk) = @_; |
206 | my $fmuni = $obj->{'FmUni'}; |
51ef4e11 |
207 | my $def = $obj->{'Def'}; |
208 | my $rep = $obj->{'Rep'}; |
e91cad5b |
209 | my $str; |
51ef4e11 |
210 | while (length($uni)) |
211 | { |
212 | my $ch = substr($uni,0,1,''); |
213 | my $x = $fmuni->{chr(ord($ch))}; |
214 | unless (defined $x) |
215 | { |
216 | last if ($chk); |
217 | $x = $def; |
218 | } |
219 | $str .= pack(&$rep($x),$x); |
220 | } |
221 | $_[1] = $uni if $chk; |
222 | return $str; |
223 | } |
224 | |
225 | package Encode::Tcl::Escape; |
226 | use base 'Encode::Encoding'; |
227 | |
228 | use Carp; |
229 | |
230 | sub read |
231 | { |
e91cad5b |
232 | my ($obj,$fh,$name) = @_; |
83ea2aad |
233 | my(%tbl, @seq, $enc, @esc, %grp); |
51ef4e11 |
234 | while (<$fh>) |
235 | { |
236 | my ($key,$val) = /^(\S+)\s+(.*)$/; |
237 | $val =~ s/^\{(.*?)\}/$1/g; |
238 | $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge; |
83ea2aad |
239 | |
e91cad5b |
240 | if($enc = Encode->getEncoding($key)){ |
241 | $tbl{$val} = ref($enc) eq 'Encode::Tcl' ? $enc->loadEncoding : $enc; |
d9da9e35 |
242 | push @seq, $val; |
83ea2aad |
243 | $grp{$val} = |
244 | $val =~ m|[(]| ? 0 : # G0 : SI eq "\cO" |
245 | $val =~ m|[)-]| ? 1 : # G1 : SO eq "\cN" |
246 | $val =~ m|[*.]| ? 2 : # G2 : SS2 eq "\eN" |
247 | $val =~ m|[+/]| ? 3 : # G3 : SS3 eq "\eO" |
248 | 0; # G0 |
e91cad5b |
249 | }else{ |
250 | $obj->{$key} = $val; |
251 | } |
d9da9e35 |
252 | if($val =~ /^\e(.*)/){ push(@esc, quotemeta $1) } |
51ef4e11 |
253 | } |
83ea2aad |
254 | $obj->{'Grp'} = \%grp; # graphic chars |
d9da9e35 |
255 | $obj->{'Seq'} = \@seq; # escape sequences |
256 | $obj->{'Tbl'} = \%tbl; # encoding tables |
257 | $obj->{'Esc'} = join('|', @esc); # regex of sequences following ESC |
e91cad5b |
258 | return $obj; |
51ef4e11 |
259 | } |
260 | |
261 | sub decode |
262 | { |
e91cad5b |
263 | my ($obj,$str,$chk) = @_; |
264 | my $tbl = $obj->{'Tbl'}; |
d9da9e35 |
265 | my $seq = $obj->{'Seq'}; |
83ea2aad |
266 | my $grp = $obj->{'Grp'}; |
d9da9e35 |
267 | my $esc = $obj->{'Esc'}; |
e91cad5b |
268 | my $ini = $obj->{'init'}; |
269 | my $fin = $obj->{'final'}; |
d9da9e35 |
270 | my $std = $seq->[0]; |
e91cad5b |
271 | my $cur = $std; |
83ea2aad |
272 | my @sta = ($std, undef, undef, undef); # G0 .. G3 state |
273 | my($g1,$g2,$g3) = (0,0,0); |
e91cad5b |
274 | my $uni; |
275 | while (length($str)){ |
276 | my $uch = substr($str,0,1,''); |
277 | if($uch eq "\e"){ |
d9da9e35 |
278 | if($str =~ s/^($esc)//) |
279 | { |
280 | my $esc = "\e$1"; |
83ea2aad |
281 | $sta[ $grp->{$esc} ] = $esc if $tbl->{$esc}; |
282 | } |
283 | # appearance of "\eN\eO" or "\eO\eN" isn't supposed. |
284 | # but coincidental ON of G2 and G3 is explicitly avoided. |
285 | elsif($str =~ s/^N//) |
286 | { |
287 | $g2 = 1; $g3 = 0; |
288 | } |
289 | elsif($str =~ s/^O//) |
290 | { |
291 | $g3 = 1; $g2 = 0; |
d9da9e35 |
292 | } |
293 | else |
294 | { |
295 | $str =~ s/^([\x20-\x2F]*[\x30-\x7E])//; |
296 | carp "unknown escape sequence: ESC $1"; |
297 | } |
e91cad5b |
298 | next; |
299 | } |
83ea2aad |
300 | if($uch eq "\x0e"){ |
301 | $g1 = 1; next; |
302 | } |
303 | if($uch eq "\x0f"){ |
304 | $g1 = 0; next; |
e91cad5b |
305 | } |
83ea2aad |
306 | |
307 | $cur = $g3 ? $sta[3] : $g2 ? $sta[2] : $g1 ? $sta[1] : $sta[0]; |
308 | |
e91cad5b |
309 | if(ref($tbl->{$cur}) eq 'Encode::XS'){ |
310 | $uni .= $tbl->{$cur}->decode($uch); |
83ea2aad |
311 | $g2 = $g3 = 0; |
e91cad5b |
312 | next; |
313 | } |
d9da9e35 |
314 | my $ch = ord($uch); |
e91cad5b |
315 | my $rep = $tbl->{$cur}->{'Rep'}; |
316 | my $touni = $tbl->{$cur}->{'ToUni'}; |
d9da9e35 |
317 | my $x; |
e91cad5b |
318 | if (&$rep($ch) eq 'C') |
319 | { |
320 | $x = $touni->[0][$ch]; |
321 | } |
322 | else |
323 | { |
324 | $x = $touni->[$ch][ord(substr($str,0,1,''))]; |
325 | } |
326 | unless (defined $x) |
327 | { |
328 | last if $chk; |
329 | # What do we do here ? |
330 | $x = ''; |
331 | } |
332 | $uni .= $x; |
83ea2aad |
333 | $g2 = $g3 = 0; |
e91cad5b |
334 | } |
335 | $_[1] = $str if $chk; |
336 | return $uni; |
51ef4e11 |
337 | } |
338 | |
339 | sub encode |
340 | { |
e91cad5b |
341 | my ($obj,$uni,$chk) = @_; |
342 | my $tbl = $obj->{'Tbl'}; |
d9da9e35 |
343 | my $seq = $obj->{'Seq'}; |
83ea2aad |
344 | my $grp = $obj->{'Grp'}; |
e91cad5b |
345 | my $ini = $obj->{'init'}; |
346 | my $fin = $obj->{'final'}; |
d9da9e35 |
347 | my $std = $seq->[0]; |
e91cad5b |
348 | my $str = $ini; |
83ea2aad |
349 | my @sta = ($std,undef,undef,undef); |
350 | my @pre = ($std,undef,undef,undef); |
351 | my $cur = $std; |
352 | my $pG = 0; |
353 | my $cG = 0; |
354 | |
355 | if($ini) |
356 | { |
357 | $sta[ $grp->{$ini} ] = $pre[ $grp->{$ini} ] = $ini; |
358 | } |
51ef4e11 |
359 | |
e91cad5b |
360 | while (length($uni)){ |
83ea2aad |
361 | my $ch = substr($uni,0,1,''); |
466d6cd3 |
362 | my $x; |
83ea2aad |
363 | foreach my $e_seq (@$seq){ |
466d6cd3 |
364 | $x = ref($tbl->{$e_seq}) eq 'Encode::XS' |
365 | ? $tbl->{$e_seq}->encode($ch,1) |
366 | : $tbl->{$e_seq}->{FmUni}->{$ch}; |
83ea2aad |
367 | $cur = $e_seq, last if defined $x; |
e91cad5b |
368 | } |
466d6cd3 |
369 | if(ref($tbl->{$cur}) ne 'Encode::XS') |
e91cad5b |
370 | { |
466d6cd3 |
371 | my $def = $tbl->{$cur}->{'Def'}; |
372 | my $rep = $tbl->{$cur}->{'Rep'}; |
373 | unless (defined $x){ |
374 | last if ($chk); |
375 | $x = $def; |
376 | } |
377 | $x = pack(&$rep($x),$x); |
378 | } |
83ea2aad |
379 | $cG = $grp->{$cur}; |
380 | $str .= $pre[ $cG ] = $cur if $cur ne $pre[ $cG ]; |
381 | |
382 | $str .= $cG == 0 && $pG == 1 ? "\cO" : |
383 | $cG == 1 && $pG == 0 ? "\cN" : |
384 | $cG == 2 ? "\eN" : |
385 | $cG == 3 ? "\eO" : ""; |
386 | $str .= $x; |
387 | $pG = $cG if $cG < 2; |
466d6cd3 |
388 | } |
83ea2aad |
389 | $str .= $std unless $cur eq $std; |
390 | $str .= "\cO" if $pG == 1; # back to G0 |
391 | $str .= $fin; # necessary? |
466d6cd3 |
392 | $_[1] = $uni if $chk; |
393 | return $str; |
394 | } |
395 | |
396 | package Encode::Tcl::HanZi; |
397 | use base 'Encode::Encoding'; |
398 | |
399 | use Carp; |
400 | |
401 | sub read |
402 | { |
403 | my ($obj,$fh,$name) = @_; |
404 | my(%tbl, @seq, $enc); |
405 | while (<$fh>) |
406 | { |
407 | my ($key,$val) = /^(\S+)\s+(.*)$/; |
408 | $val =~ s/^\{(.*?)\}/$1/g; |
409 | $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge; |
410 | if($enc = Encode->getEncoding($key)){ |
411 | $tbl{$val} = ref($enc) eq 'Encode::Tcl' ? $enc->loadEncoding : $enc; |
412 | push @seq, $val; |
413 | }else{ |
414 | $obj->{$key} = $val; |
415 | } |
416 | } |
417 | $obj->{'Seq'} = \@seq; # escape sequences |
418 | $obj->{'Tbl'} = \%tbl; # encoding tables |
419 | return $obj; |
420 | } |
421 | |
422 | sub decode |
423 | { |
424 | my ($obj,$str,$chk) = @_; |
425 | my $tbl = $obj->{'Tbl'}; |
426 | my $seq = $obj->{'Seq'}; |
427 | my $std = $seq->[0]; |
428 | my $cur = $std; |
429 | my $uni; |
430 | while (length($str)){ |
431 | my $uch = substr($str,0,1,''); |
432 | if($uch eq "~"){ |
433 | if($str =~ s/^\cJ//) |
434 | { |
435 | next; |
436 | } |
437 | elsif($str =~ s/^\~//) |
438 | { |
439 | 1; |
440 | } |
441 | elsif($str =~ s/^([{}])//) |
442 | { |
443 | $cur = "~$1"; |
444 | next; |
445 | } |
446 | else |
447 | { |
448 | $str =~ s/^([^~])//; |
449 | carp "unknown HanZi escape sequence: ~$1"; |
450 | next; |
451 | } |
e91cad5b |
452 | } |
466d6cd3 |
453 | if(ref($tbl->{$cur}) eq 'Encode::XS'){ |
454 | $uni .= $tbl->{$cur}->decode($uch); |
455 | next; |
456 | } |
457 | my $ch = ord($uch); |
458 | my $rep = $tbl->{$cur}->{'Rep'}; |
459 | my $touni = $tbl->{$cur}->{'ToUni'}; |
460 | my $x; |
461 | if (&$rep($ch) eq 'C') |
462 | { |
463 | $x = $touni->[0][$ch]; |
464 | } |
465 | else |
466 | { |
467 | $x = $touni->[$ch][ord(substr($str,0,1,''))]; |
468 | } |
469 | unless (defined $x) |
470 | { |
471 | last if $chk; |
472 | # What do we do here ? |
473 | $x = ''; |
474 | } |
475 | $uni .= $x; |
e91cad5b |
476 | } |
466d6cd3 |
477 | $_[1] = $str if $chk; |
478 | return $uni; |
479 | } |
480 | |
481 | sub encode |
482 | { |
483 | my ($obj,$uni,$chk) = @_; |
484 | my $tbl = $obj->{'Tbl'}; |
485 | my $seq = $obj->{'Seq'}; |
486 | my $std = $seq->[0]; |
487 | my $str; |
488 | my $pre = $std; |
489 | my $cur = $pre; |
490 | |
491 | while (length($uni)){ |
492 | my $ch = chr(ord(substr($uni,0,1,''))); |
493 | my $x; |
494 | foreach my $e_seq (@$seq){ |
495 | $x = ref($tbl->{$e_seq}) eq 'Encode::XS' |
496 | ? $tbl->{$e_seq}->encode($ch,1) |
497 | : $tbl->{$e_seq}->{FmUni}->{$ch}; |
498 | $cur = $e_seq and last if defined $x; |
e91cad5b |
499 | } |
466d6cd3 |
500 | if(ref($tbl->{$cur}) ne 'Encode::XS') |
501 | { |
502 | my $def = $tbl->{$cur}->{'Def'}; |
503 | my $rep = $tbl->{$cur}->{'Rep'}; |
504 | unless (defined $x){ |
505 | last if ($chk); |
506 | $x = $def; |
507 | } |
508 | $x = pack(&$rep($x),$x); |
509 | } |
510 | $str .= $cur eq $pre ? $x : ($pre = $cur).$x; |
511 | $str .= '~' if $x eq '~'; # to '~~' |
e91cad5b |
512 | } |
513 | $str .= $std unless $cur eq $std; |
e91cad5b |
514 | $_[1] = $uni if $chk; |
515 | return $str; |
516 | } |
466d6cd3 |
517 | |
51ef4e11 |
518 | 1; |
519 | __END__ |