Encode::Tcl docs (was Re: UTF-16 and other missing(?) encodings)
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Tcl / Escape.pm
CommitLineData
df1df145 1package Encode::Tcl::Escape;
2use strict;
3our $VERSION = do {my @r=(q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r};
4use base 'Encode::Encoding';
5
6use Carp;
7
8use constant SI => "\cO";
9use constant SO => "\cN";
28e59c41 10use constant SS2 => "\e\x4E"; # ESC N
11use constant SS3 => "\e\x4F"; # ESC O
df1df145 12
13sub read
14{
15 my ($obj,$fh,$name) = @_;
28e59c41 16 my(%tbl, @seq, $enc, @esc, %grp, %mbc);
df1df145 17 while (<$fh>)
18 {
19 next unless /^(\S+)\s+(.*)$/;
20 my ($key,$val) = ($1,$2);
21 $val =~ s/^\{(.*?)\}/$1/g;
22 $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
23
28e59c41 24 if ($enc = Encode->getEncoding($key))
df1df145 25 {
28e59c41 26 $tbl{$val} =
27 ref($enc) eq 'Encode::Tcl' ? $enc->loadEncoding : $enc;
28
29 $mbc{$val} =
30 $val !~ /\e\x24/ ? 1 : # single-byte
31 $val =~ /[\x30-\x3F]$/ ? 2 : # (only 2 is supported)
32 $val =~ /[\x40-\x5F]$/ ? 2 : # double byte
33 $val =~ /[\x60-\x6F]$/ ? 3 : # triple byte
34 $val =~ /[\x70-\x7F]$/ ? 4 :
35 # 4 or more (only 4 is supported)
36 croak("odd sequence is defined");
37
df1df145 38 push @seq, $val;
28e59c41 39
df1df145 40 $grp{$val} =
28e59c41 41 $val =~ /\e\x24?[\x28]/ ? 0 : # G0 : SI
42 $val =~ /\e\x24?[\x29\x2D]/ ? 1 : # G1 : SO
43 $val =~ /\e\x24?[\x2A\x2E]/ ? 2 : # G2 : SS2
44 $val =~ /\e\x24?[\x2B\x2F]/ ? 3 : # G3 : SS3
45 0; # G0 (ESC 02/04 F, etc.)
df1df145 46 }
47 else
48 {
49 $obj->{$key} = $val;
50 }
28e59c41 51 if ($val =~ /^\e(.*)/)
df1df145 52 {
53 push(@esc, quotemeta $1);
54 }
55 }
56 $obj->{'Grp'} = \%grp; # graphic chars
28e59c41 57 $obj->{'Mbc'} = \%mbc; # bytes per char
df1df145 58 $obj->{'Seq'} = \@seq; # escape sequences
59 $obj->{'Tbl'} = \%tbl; # encoding tables
60 $obj->{'Esc'} = join('|', @esc); # regex of sequences following ESC
61 return $obj;
62}
63
64sub decode
65{
66 my ($obj,$str,$chk) = @_;
df1df145 67 my $tbl = $obj->{'Tbl'};
68 my $seq = $obj->{'Seq'};
28e59c41 69 my $mbc = $obj->{'Mbc'};
df1df145 70 my $grp = $obj->{'Grp'};
71 my $esc = $obj->{'Esc'};
df1df145 72 my $std = $seq->[0];
73 my $cur = $std;
74 my @sta = ($std, undef, undef, undef); # G0 .. G3 state
75 my $s = 0; # state of SO-SI. 0 (G0) or 1 (G1);
76 my $ss = 0; # state of SS2,SS3. 0 (G0), 2 (G2) or 3 (G3);
77 my $uni;
78 while (length($str))
79 {
28e59c41 80 if ($str =~ s/^\e//)
df1df145 81 {
28e59c41 82 if ($str =~ s/^($esc)//)
df1df145 83 {
84 my $e = "\e$1";
85 $sta[ $grp->{$e} ] = $e if $tbl->{$e};
86 }
87 # appearance of "\eN\eO" or "\eO\eN" isn't supposed.
88 # but in that case, the former will be ignored.
28e59c41 89 elsif ($str =~ s/^\x4E//)
df1df145 90 {
91 $ss = 2;
92 }
28e59c41 93 elsif ($str =~ s/^\x4F//)
df1df145 94 {
95 $ss = 3;
96 }
97 else
98 {
99 # strictly, ([\x20-\x2F]*[\x30-\x7E]). '?' for chopped.
100 $str =~ s/^([\x20-\x2F]*[\x30-\x7E]?)//;
28e59c41 101 if ($chk && ! length $str)
df1df145 102 {
103 $str = "\e$1"; # split sequence
104 last;
105 }
106 croak "unknown escape sequence: ESC $1";
107 }
108 next;
109 }
28e59c41 110 if ($str =~ s/^\cN//) # SO
df1df145 111 {
112 $s = 1; next;
113 }
28e59c41 114 if ($str =~ s/^\cO//) # SI
df1df145 115 {
116 $s = 0; next;
117 }
118
119 $cur = $ss ? $sta[$ss] : $sta[$s];
120
28e59c41 121 length($str) < $mbc->{$cur} and last; # split leading byte
122
123 my $cc = substr($str, 0, $mbc->{$cur}, '');
124
125 my $x = $tbl->{$cur}->decode($cc);
126 defined $x or Encode::Tcl::no_map_in_decode($obj->{'Name'}, $cc);
df1df145 127 $uni .= $x;
128 $ss = 0;
129 }
28e59c41 130 if ($chk)
df1df145 131 {
132 my $back = join('', grep defined($_) && $_ ne $std, @sta);
133 $back .= SO if $s;
28e59c41 134 $back .= $ss == 2 ? SS2 : $ss == 3 ? SS3 : '';
df1df145 135 $_[1] = $back.$str;
136 }
137 return $uni;
138}
139
140sub encode
141{
142 my ($obj,$uni,$chk) = @_;
df1df145 143 my $tbl = $obj->{'Tbl'};
144 my $seq = $obj->{'Seq'};
145 my $grp = $obj->{'Grp'};
146 my $ini = $obj->{'init'};
df1df145 147 my $std = $seq->[0];
148 my $str = $ini;
149 my @sta = ($std,undef,undef,undef); # G0 .. G3 state
150 my $cur = $std;
151 my $pG = 0; # previous G: 0 or 1.
152 my $cG = 0; # current G: 0,1,2,3.
153
28e59c41 154 if ($ini && defined $grp->{$ini})
df1df145 155 {
156 $sta[ $grp->{$ini} ] = $ini;
157 }
158
159 while (length($uni))
160 {
161 my $ch = substr($uni,0,1,'');
162 my $x;
163 foreach my $e_seq (@$seq)
164 {
28e59c41 165 $x = $tbl->{$e_seq}->encode($ch, 1);
df1df145 166 $cur = $e_seq, last if defined $x;
167 }
168 unless (defined $x)
169 {
28e59c41 170 $chk or Encode::Tcl::no_map_in_encode(ord($ch), $obj->{'Name'});
df1df145 171 return undef;
172 }
df1df145 173 $cG = $grp->{$cur};
174 $str .= $sta[$cG] = $cur unless $cG < 2 && $cur eq $sta[$cG];
175
176 $str .= $cG == 0 && $pG == 1 ? SI :
177 $cG == 1 && $pG == 0 ? SO :
178 $cG == 2 ? SS2 :
179 $cG == 3 ? SS3 : "";
180 $str .= $x;
181 $pG = $cG if $cG < 2;
182 }
183 $str .= SI if $pG == 1; # back to G0
28e59c41 184 $str .= $std unless $std eq $sta[0]; # G0 to ASCII
185 $str .= $obj->{'final'}; # necessary? I don't know what is this for.
df1df145 186 $_[1] = $uni if $chk;
187 return $str;
188}
189
1901;
191__END__
6b6c03af 192
193=head1 NAME
194
195Encode::Tcl::Escape - Tcl Escape encodings
196
197=head1 SYNOPSIS
198
199none
200
201=head1 DESCRIPTION
202
203This module is used internally by Encode::Tcl
204and handles type E of Tcl encodings (7-bit code only).
205
206Control sequences supported by this module are
207ESCAPE SEQUENCEs to designate graphic character sets
208and the following:
209
210 name (abbr.) bit combination
211
212 ESCAPE (ESC) 01/11
213 SHIFT-IN (SI) 00/15
214 SHIFT-OUT (SO) 00/14
215 SINGLE SHIFT TWO (SS2) ESC 04/14
216 SINGLE SHIFT THREE (SS3) ESC 04/15
217
218Designation of control character sets are not supported.
219
220=head1 SEE ALSO
221
222L<Encode>
223
224L<Encode::Tcl>
225
226L<http://www.itscj.ipsj.or.jp/ISO-IR/> [ISOREG]
227
228=cut