Commit | Line | Data |
c0d88b76 |
1 | package Encode::CN::HZ; |
2 | |
00a464f7 |
3 | use strict; |
00a464f7 |
4 | |
eb042f38 |
5 | use vars qw($VERSION); |
7237418a |
6 | $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
eb042f38 |
7 | |
8676e7d3 |
8 | use Encode qw(:fallbacks); |
10c5ecbb |
9 | |
10 | use base qw(Encode::Encoding); |
11 | __PACKAGE__->Define('hz'); |
c0d88b76 |
12 | |
8676e7d3 |
13 | # HZ is a combination of ASCII and escaped GB, so we implement it |
14 | # with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843. |
10c5ecbb |
15 | |
8676e7d3 |
16 | # not ported for EBCDIC. Which should be used, "~" or "\x7E"? |
c0d88b76 |
17 | |
0ab8f81e |
18 | sub needs_lines { 1 } |
19 | |
8676e7d3 |
20 | sub decode ($$;$) |
c0d88b76 |
21 | { |
22 | my ($obj,$str,$chk) = @_; |
8676e7d3 |
23 | |
24 | my $GB = Encode::find_encoding('gb2312-raw'); |
25 | my $ret = ''; |
26 | my $in_ascii = 1; # default mode is ASCII. |
27 | |
28 | while (length $str) { |
29 | if ($in_ascii) { # ASCII mode |
30 | if ($str =~ s/^([\x00-\x7D\x7F]+)//) { # no '~' => ASCII |
31 | $ret .= $1; |
32 | # EBCDIC should need ascii2native, but not ported. |
33 | } |
34 | elsif ($str =~ s/^\x7E\x7E//) { # escaped tilde |
35 | $ret .= '~'; |
36 | } |
37 | elsif ($str =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII |
38 | 1; # no-op |
39 | } |
40 | elsif ($str =~ s/^\x7E\x7B//) { # '~{' |
41 | $in_ascii = 0; # to GB |
42 | } |
43 | else { # encounters an invalid escape, \x80 or greater |
44 | last; |
45 | } |
46 | } |
47 | else { # GB mode; the byte ranges are as in RFC 1843. |
48 | if ($str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)//) { |
49 | $ret .= $GB->decode($1, $chk); |
50 | } |
51 | elsif ($str =~ s/^\x7E\x7D//) { # '~}' |
52 | $in_ascii = 1; |
53 | } |
54 | else { # invalid |
55 | last; |
56 | } |
57 | } |
58 | } |
90f5826e |
59 | $_[1] = '' if $chk; # needs_lines guarantees no partial character |
8676e7d3 |
60 | return $ret; |
61 | } |
62 | |
63 | sub cat_decode { |
8676e7d3 |
64 | my ($obj, undef, $src, $pos, $trm, $chk) = @_; |
65 | my ($rdst, $rsrc, $rpos) = \@_[1..3]; |
66 | |
67 | my $GB = Encode::find_encoding('gb2312-raw'); |
68 | my $ret = ''; |
69 | my $in_ascii = 1; # default mode is ASCII. |
70 | |
71 | my $ini_pos = pos($$rsrc); |
72 | |
73 | substr($src, 0, $pos) = ''; |
74 | |
75 | my $ini_len = bytes::length($src); |
76 | |
77 | # $trm is the first of the pair '~~', then 2nd tilde is to be removed. |
78 | # XXX: Is better C<$src =~ s/^\x7E// or die if ...>? |
79 | $src =~ s/^\x7E// if $trm eq "\x7E"; |
80 | |
81 | while (length $src) { |
82 | my $now; |
83 | if ($in_ascii) { # ASCII mode |
84 | if ($src =~ s/^([\x00-\x7D\x7F])//) { # no '~' => ASCII |
85 | $now = $1; |
86 | } |
87 | elsif ($src =~ s/^\x7E\x7E//) { # escaped tilde |
88 | $now = '~'; |
89 | } |
90 | elsif ($src =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII |
91 | next; |
92 | } |
93 | elsif ($src =~ s/^\x7E\x7B//) { # '~{' |
94 | $in_ascii = 0; # to GB |
95 | next; |
96 | } |
97 | else { # encounters an invalid escape, \x80 or greater |
98 | last; |
99 | } |
100 | } |
101 | else { # GB mode; the byte ranges are as in RFC 1843. |
102 | if ($src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)//) { |
103 | $now = $GB->decode($1, $chk); |
104 | } |
105 | elsif ($src =~ s/^\x7E\x7D//) { # '~}' |
106 | $in_ascii = 1; |
107 | next; |
108 | } |
109 | else { # invalid |
110 | last; |
111 | } |
112 | } |
113 | |
114 | next if ! defined $now; |
115 | |
116 | $ret .= $now; |
117 | |
118 | if ($now eq $trm) { |
119 | $$rdst .= $ret; |
120 | $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); |
121 | pos($$rsrc) = $ini_pos; |
122 | return 1; |
123 | } |
124 | } |
125 | |
126 | $$rdst .= $ret; |
127 | $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); |
128 | pos($$rsrc) = $ini_pos; |
129 | return ''; # terminator not found |
c0d88b76 |
130 | } |
131 | |
8676e7d3 |
132 | |
133 | sub encode($$;$) |
c0d88b76 |
134 | { |
135 | my ($obj,$str,$chk) = @_; |
8676e7d3 |
136 | |
137 | my $GB = Encode::find_encoding('gb2312-raw'); |
138 | my $ret = ''; |
139 | my $in_ascii = 1; # default mode is ASCII. |
140 | |
141 | no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk. |
142 | |
143 | while (length $str) { |
144 | if ($str =~ s/^([[:ascii:]]+)//) { |
145 | my $tmp = $1; |
146 | $tmp =~ s/~/~~/g; # escapes tildes |
147 | if (! $in_ascii) { |
148 | $ret .= "\x7E\x7D"; # '~}' |
149 | $in_ascii = 1; |
150 | } |
151 | $ret .= pack 'a*', $tmp; # remove UTF8 flag. |
152 | } |
153 | elsif ($str =~ s/(.)//) { |
154 | my $tmp = $GB->encode($1, $chk); |
155 | last if !defined $tmp; |
156 | if (length $tmp == 2) { # maybe a valid GB char (XXX) |
157 | if ($in_ascii) { |
158 | $ret .= "\x7E\x7B"; # '~{' |
159 | $in_ascii = 0; |
160 | } |
161 | $ret .= $tmp; |
162 | } |
163 | elsif (length $tmp) { # maybe FALLBACK in ASCII (XXX) |
164 | if (!$in_ascii) { |
165 | $ret .= "\x7E\x7D"; # '~}' |
166 | $in_ascii = 1; |
167 | } |
168 | $ret .= $tmp; |
169 | } |
00a464f7 |
170 | } |
8676e7d3 |
171 | else { # if $str is malformed UTF8 *and* if length $str != 0. |
172 | last; |
00a464f7 |
173 | } |
174 | } |
8676e7d3 |
175 | $_[1] = $str if $chk; |
00a464f7 |
176 | |
8676e7d3 |
177 | # The state at the end of the chunk is discarded, even if in GB mode. |
178 | # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{". |
179 | # Parhaps it is harmless, but further investigations may be required... |
00a464f7 |
180 | |
8676e7d3 |
181 | if (! $in_ascii) { |
182 | $ret .= "\x7E\x7D"; # '~}' |
183 | $in_ascii = 1; |
184 | } |
185 | return $ret; |
c0d88b76 |
186 | } |
187 | |
188 | 1; |
189 | __END__ |
67d7b5ef |
190 | |
67d7b5ef |
191 | =head1 NAME |
192 | |
193 | Encode::CN::HZ -- internally used by Encode::CN |
194 | |
195 | =cut |