Commit | Line | Data |
c0d88b76 |
package Encode::CN::HZ;

use strict;

# The module version is derived from the RCS "Revision" keyword: the first
# number is printed as-is, every following number as a two-digit field.
use vars qw($VERSION);
$VERSION = do {
    my @numbers = ( q$Revision: 1.5 $ =~ /\d+/g );
    my $format  = "%d." . ( "%02d" x $#numbers );
    sprintf $format, @numbers;
};

use Encode qw(:fallbacks);

# Inherit from Encode::Encoding and register this class under the name 'hz'.
use base qw(Encode::Encoding);
__PACKAGE__->Define('hz');
c0d88b76 |
12 | |
8676e7d3 |
13 | # HZ is a combination of ASCII and escaped GB, so we implement it |
14 | # with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843. |
10c5ecbb |
15 | |
8676e7d3 |
16 | # not ported for EBCDIC. Which should be used, "~" or "\x7E"? |
c0d88b76 |
17 | |
0ab8f81e |
# Capability flag: this encoding always answers true to needs_lines().
sub needs_lines {
    return 1;
}
19 | |
8676e7d3 |
# Capability flag: this encoding always answers true to perlio_ok().
sub perlio_ok {
    return 1;
}
0ab8f81e |
21 | |
8676e7d3 |
# Decode HZ-encoded octets into a Perl string.
#
# Arguments:
#   $obj - the encoding object (not used by this routine).
#   $str - HZ octets: ASCII text interleaved with GB2312 two-byte
#          characters bracketed by '~{' and '~}'.
#   $chk - optional CHECK value; it is forwarded to the gb2312-raw decoder.
#
# Returns the decoded text.  Scanning stops at the first byte sequence that
# is not valid in the current mode.  When $chk is true, the caller's source
# argument is overwritten with the input that was left unconsumed.
sub decode ($$;$)
{
    use bytes;
    my ($obj,$str,$chk) = @_;

    my $GB = Encode::find_encoding('gb2312-raw');
    my $ret = '';
    my $in_ascii = 1; # default mode is ASCII.

    while (length $str) {
        if ($in_ascii) { # ASCII mode
            if ($str =~ s/^([\x00-\x7D\x7F]+)//) { # no '~' => ASCII
                $ret .= $1;
                # EBCDIC should need ascii2native, but not ported.
            }
            elsif ($str =~ s/^\x7E\x7E//) { # escaped tilde
                $ret .= '~';
            }
            elsif ($str =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII
                1; # no-op: '~' followed by LF is a line continuation
            }
            elsif ($str =~ s/^\x7E\x7B//) { # '~{'
                $in_ascii = 0; # to GB
            }
            else { # encounters an invalid escape, \x80 or greater
                last;
            }
        }
        else { # GB mode; the byte ranges are as in RFC 1843.
            if ($str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)//) {
                # Hand the decoder a private copy of the capture.  With a
                # consuming CHECK value the decoder writes the unprocessed
                # remainder back into its source argument, which is not
                # possible for $1.
                my $gb_run = $1;
                $ret .= $GB->decode($gb_run, $chk);
            }
            elsif ($str =~ s/^\x7E\x7D//) { # '~}'
                $in_ascii = 1;
            }
            else { # invalid
                last;
            }
        }
    }
    $_[1] = $str if $chk;
    return $ret;
}
65 | |
# Decode HZ octets from a source buffer, starting at a byte offset, up to
# and including a terminator string, appending the result to a destination.
#
# Arguments (destination, source and position are updated through @_):
#   $obj   - the encoding object (not used by this routine).
#   $_[1]  - destination string; the decoded text is appended to it.
#   $_[2]  - source buffer holding HZ octets.
#   $_[3]  - byte offset into the source to start from; on return it is
#            advanced by the number of octets consumed.
#   $trm   - terminator; scanning stops once a decoded unit equals it.
#   $chk   - optional CHECK value, forwarded to the gb2312-raw decoder.
#
# Returns 1 when the terminator was found, '' otherwise.
sub cat_decode {
    use bytes;

    my ($obj, undef, $src, $pos, $trm, $chk) = @_;
    my ($rdst, $rsrc, $rpos) = \@_[1..3];

    my $GB = Encode::find_encoding('gb2312-raw');
    my $ret = '';
    my $in_ascii = 1; # default mode is ASCII.

    # Remember the caller's regex position so it can be restored on exit.
    my $ini_pos = pos($$rsrc);

    substr($src, 0, $pos) = '';

    my $ini_len = bytes::length($src);

    # $trm is the first of the pair '~~', then 2nd tilde is to be removed.
    # XXX: Is better C<$src =~ s/^\x7E// or die if ...>?
    $src =~ s/^\x7E// if $trm eq "\x7E";

    my $found = ''; # becomes 1 once the terminator has been decoded

    while (length $src) {
        my $now;
        if ($in_ascii) { # ASCII mode
            # One byte at a time, so each unit can be compared with $trm.
            if ($src =~ s/^([\x00-\x7D\x7F])//) { # no '~' => ASCII
                $now = $1;
            }
            elsif ($src =~ s/^\x7E\x7E//) { # escaped tilde
                $now = '~';
            }
            elsif ($src =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII
                next;
            }
            elsif ($src =~ s/^\x7E\x7B//) { # '~{'
                $in_ascii = 0; # to GB
                next;
            }
            else { # encounters an invalid escape, \x80 or greater
                last;
            }
        }
        else { # GB mode; the byte ranges are as in RFC 1843.
            # The second byte stops at 0x7E, matching decode() and the RFC;
            # 0x7F is not a valid trail byte.
            if ($src =~ s/^((?:[\x21-\x77][\x21-\x7E])+)//) {
                # Hand the decoder a private copy of the capture.  With a
                # consuming CHECK value the decoder writes the unprocessed
                # remainder back into its source argument, which is not
                # possible for $1.
                my $gb_run = $1;
                $now = $GB->decode($gb_run, $chk);
            }
            elsif ($src =~ s/^\x7E\x7D//) { # '~}'
                $in_ascii = 1;
                next;
            }
            else { # invalid
                last;
            }
        }

        next if ! defined $now;

        $ret .= $now;

        if ($now eq $trm) {
            $found = 1;
            last;
        }
    }

    # Publish the results whether or not the terminator was seen: append
    # the decoded text, advance the position by the octets consumed, and
    # restore the caller's pos().
    $$rdst .= $ret;
    $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
    pos($$rsrc) = $ini_pos;
    return $found;
}
136 | |
8676e7d3 |
137 | |
# Encode a Perl string into HZ octets.
#
# Arguments:
#   $obj - the encoding object (not used by this routine).
#   $str - the text to encode.
#   $chk - optional CHECK value; it is forwarded to the gb2312-raw encoder.
#
# Returns the HZ octets.  ASCII runs are emitted as-is with '~' doubled;
# characters that gb2312-raw turns into two bytes are bracketed by '~{' and
# '~}'.  Encoding stops when the GB encoder returns undef or the remaining
# input cannot be matched.  When $chk is true, the caller's source argument
# is overwritten with the input that was left unconsumed.
sub encode($$;$)
{
    my ($obj,$str,$chk) = @_;

    my $GB = Encode::find_encoding('gb2312-raw');
    my $ret = '';
    my $in_ascii = 1; # default mode is ASCII.

    no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk.

    while (length $str) {
        if ($str =~ s/^([[:ascii:]]+)//) {
            my $tmp = $1;
            $tmp =~ s/~/~~/g; # escapes tildes
            if (! $in_ascii) {
                $ret .= "\x7E\x7D"; # '~}'
                $in_ascii = 1;
            }
            $ret .= pack 'a*', $tmp; # remove UTF8 flag.
        }
        elsif ($str =~ s/(.)//) {
            # Hand the encoder a private copy of the capture.  With a
            # consuming CHECK value the encoder writes the unprocessed
            # remainder back into its source argument, which is not
            # possible for $1.
            my $char = $1;
            my $tmp = $GB->encode($char, $chk);
            last if !defined $tmp;
            if (length $tmp == 2) { # maybe a valid GB char (XXX)
                if ($in_ascii) {
                    $ret .= "\x7E\x7B"; # '~{'
                    $in_ascii = 0;
                }
                $ret .= $tmp;
            }
            elsif (length $tmp) { # maybe FALLBACK in ASCII (XXX)
                if (!$in_ascii) {
                    $ret .= "\x7E\x7D"; # '~}'
                    $in_ascii = 1;
                }
                $ret .= $tmp;
            }
        }
        else { # if $str is malformed UTF8 *and* if length $str != 0.
            last;
        }
    }
    $_[1] = $str if $chk;

    # The state at the end of the chunk is discarded, even if in GB mode.
    # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
    # Perhaps it is harmless, but further investigations may be required...

    # Always leave the output in ASCII mode.
    $ret .= "\x7E\x7D" if ! $in_ascii; # '~}'

    return $ret;
}
192 | |
193 | 1; |
194 | __END__ |
67d7b5ef |
195 | |
67d7b5ef |
196 | =head1 NAME |
197 | |
198 | Encode::CN::HZ -- internally used by Encode::CN |
199 | |
200 | =cut |