Commit | Line | Data |
c0d88b76 |
1 | package Encode::CN::HZ; |
2 | |
00a464f7 |
3 | use strict; |
00a464f7 |
4 | |
eb042f38 |
5 | use vars qw($VERSION); |
41c240f5 |
6 | $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
eb042f38 |
7 | |
8676e7d3 |
8 | use Encode qw(:fallbacks); |
10c5ecbb |
9 | |
10 | use base qw(Encode::Encoding); |
11 | __PACKAGE__->Define('hz'); |
c0d88b76 |
12 | |
8676e7d3 |
13 | # HZ is a combination of ASCII and escaped GB, so we implement it |
14 | # with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843. |
10c5ecbb |
15 | |
8676e7d3 |
16 | # not ported for EBCDIC. Which should be used, "~" or "\x7E"? |
c0d88b76 |
17 | |
0ab8f81e |
# Always true: this encoding asks to be handed complete lines.  decode()
# relies on that when it empties the caller's buffer ("no partial character").
sub needs_lines {
    return 1;
}
19 | |
8676e7d3 |
# Decode a string of HZ octets (ASCII mixed with "~{ ... ~}" escaped GB2312,
# cf. RFCs 1842 & 1843) into Perl characters.
#
#   $obj - the encoding object (not used by this routine).
#   $str - the HZ octets to decode.
#   $chk - Encode CHECK value; forwarded to the gb2312-raw decoder.  When it
#          is true the caller's source variable ($_[1]) is emptied on return.
#
# Returns the decoded text.  Decoding stops quietly at the first invalid
# escape or byte; whatever was decoded up to that point is returned.
sub decode ($$;$)
{
    my ($obj, $str, $chk) = @_;

    my $GB       = Encode::find_encoding('gb2312-raw');
    my $ret      = '';
    my $in_ascii = 1;    # default mode is ASCII.

    while (length $str) {
        if ($in_ascii) {    # ASCII mode
            if ($str =~ s/^([\x00-\x7D\x7F]+)//) {    # no '~' => ASCII
                $ret .= $1;
                # EBCDIC should need ascii2native, but not ported.
            }
            elsif ($str =~ s/^\x7E\x7E//) {           # escaped tilde
                $ret .= '~';
            }
            elsif ($str =~ s/^\x7E\cJ//) {            # '\cJ' == LF in ASCII
                1;                                    # line continuation: no output
            }
            elsif ($str =~ s/^\x7E\x7B//) {           # '~{'
                $in_ascii = 0;                        # to GB
            }
            else {    # encounters an invalid escape, \x80 or greater
                last;
            }
        }
        else {        # GB mode; the byte ranges are as in RFC 1843.
            no warnings 'uninitialized';    # $chk may legitimately be undef
            if ($str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)//) {
                # Hand the inner decoder a private copy rather than the $1
                # alias (same precaution encode() takes): with a true CHECK
                # it may consume its source argument in place.
                my $gb_run = $1;
                $ret .= $GB->decode($gb_run, $chk);
            }
            elsif ($str =~ s/^\x7E\x7D//) {    # '~}'
                $in_ascii = 1;
            }
            else {                             # invalid
                last;
            }
        }
    }
    $_[1] = '' if $chk;    # needs_lines guarantees no partial character
    return $ret;
}
63 | |
# Decode HZ octets from a source buffer, starting at a given offset, up to and
# including a terminator string, appending the result to a destination buffer.
#
# Arguments (the destination, source and position are used through aliases):
#   $obj  - the encoding object (not used by this routine).
#   $_[1] - destination buffer; decoded text is appended to it.
#   $src  - source octets; a private copy is consumed, the caller's string
#           itself is not edited (only its pos() is saved and put back).
#   $pos  - offset in the source to start from; on return $_[3] holds the
#           offset just past the octets consumed here.
#   $trm  - terminator to look for.
#   $chk  - Encode CHECK value, forwarded to the gb2312-raw decoder.
#
# Returns 1 when the terminator was found, '' otherwise (including when an
# invalid sequence stopped the scan).
sub cat_decode {
    my ($obj, undef, $src, $pos, $trm, $chk) = @_;
    my ($rdst, $rsrc, $rpos) = \@_[1..3];

    require bytes;    # bytes::length is used below; nothing else loads it here

    my $GB       = Encode::find_encoding('gb2312-raw');
    my $ret      = '';
    my $in_ascii = 1;    # default mode is ASCII.

    my $ini_pos = pos($$rsrc);

    substr($src, 0, $pos) = '';

    my $ini_len = bytes::length($src);

    # $trm is the first of the pair '~~', then 2nd tilde is to be removed.
    # XXX: Is better C<$src =~ s/^\x7E// or die if ...>?
    $src =~ s/^\x7E// if $trm eq "\x7E";

    while (length $src) {
        my $now;
        if ($in_ascii) {    # ASCII mode
            # One character at a time so the terminator test below sees
            # each of them.
            if ($src =~ s/^([\x00-\x7D\x7F])//) {    # no '~' => ASCII
                $now = $1;
            }
            elsif ($src =~ s/^\x7E\x7E//) {          # escaped tilde
                $now = '~';
            }
            elsif ($src =~ s/^\x7E\cJ//) {           # '\cJ' == LF in ASCII
                next;
            }
            elsif ($src =~ s/^\x7E\x7B//) {          # '~{'
                $in_ascii = 0;                       # to GB
                next;
            }
            else {    # encounters an invalid escape, \x80 or greater
                last;
            }
        }
        else {        # GB mode; the byte ranges are as in RFC 1843.
            # Trail byte is \x21-\x7E, exactly as in decode(); \x7F (DEL) is
            # not a valid GB byte and is treated as invalid input.
            if ($src =~ s/^((?:[\x21-\x77][\x21-\x7E])+)//) {
                # Private copy: do not pass the $1 alias to a decoder that
                # may consume its source argument when CHECK is set.
                my $gb_run = $1;
                $now = $GB->decode($gb_run, $chk);
            }
            elsif ($src =~ s/^\x7E\x7D//) {    # '~}'
                $in_ascii = 1;
                next;
            }
            else {                             # invalid
                last;
            }
        }

        next if ! defined $now;

        $ret .= $now;

        if ($now eq $trm) {
            $$rdst .= $ret;
            # Consumed octets = initial length minus what is still unread.
            $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
            pos($$rsrc) = $ini_pos;
            return 1;
        }
    }

    $$rdst .= $ret;
    $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
    pos($$rsrc) = $ini_pos;
    return '';    # terminator not found
}
132 | |
8676e7d3 |
133 | |
# Encode Perl characters as HZ octets (ASCII mixed with "~{ ... ~}" escaped
# GB2312, cf. RFCs 1842 & 1843).
#
#   $obj - the encoding object (not used by this routine).
#   $str - the text to encode.
#   $chk - Encode CHECK value; forwarded to the gb2312-raw encoder.  When it
#          is true the caller's source variable ($_[1]) is set to whatever
#          was left unprocessed.
#
# Returns the HZ octets.  The output always ends in ASCII mode: a pending
# GB section is closed with '~}' before returning.
sub encode($$;$)
{
    my ($obj, $str, $chk) = @_;

    my $GB       = Encode::find_encoding('gb2312-raw');
    my $ret      = '';
    my $in_ascii = 1;    # default mode is ASCII.

    no warnings 'utf8';  # $str may be malformed UTF8 at the end of a chunk.

    while (length $str) {
        if ($str =~ s/^([[:ascii:]]+)//) {
            my $tmp = $1;
            $tmp =~ s/~/~~/g;    # escapes tildes
            if (! $in_ascii) {
                $ret .= "\x7E\x7D";    # '~}'
                $in_ascii = 1;
            }
            $ret .= pack 'a*', $tmp;    # remove UTF8 flag.
        }
        elsif ($str =~ s/(.)//) {
            my $s = $1;    # private copy; the encoder may consume its source
            # An omitted CHECK is passed on as 0 instead of undef, so the
            # inner encoder never sees an uninitialized value (decode()
            # guards the same case with "no warnings 'uninitialized'").
            my $tmp = $GB->encode($s, $chk || 0);
            last if !defined $tmp;
            if (length $tmp == 2) {    # maybe a valid GB char (XXX)
                if ($in_ascii) {
                    $ret .= "\x7E\x7B";    # '~{'
                    $in_ascii = 0;
                }
                $ret .= $tmp;
            }
            elsif (length $tmp) {      # maybe FALLBACK in ASCII (XXX)
                if (!$in_ascii) {
                    $ret .= "\x7E\x7D";    # '~}'
                    $in_ascii = 1;
                }
                $ret .= $tmp;
            }
        }
        else {    # if $str is malformed UTF8 *and* if length $str != 0.
            last;
        }
    }
    $_[1] = $str if $chk;

    # The state at the end of the chunk is discarded, even if in GB mode.
    # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
    # Perhaps it is harmless, but further investigations may be required...

    if (! $in_ascii) {
        $ret .= "\x7E\x7D";    # '~}'
        $in_ascii = 1;
    }
    return $ret;
}
189 | |
190 | 1; |
191 | __END__ |
67d7b5ef |
192 | |
67d7b5ef |
193 | =head1 NAME |
194 | |
195 | Encode::CN::HZ -- internally used by Encode::CN |
196 | |
197 | =cut |