Commit | Line | Data |
c0d88b76 |
1 | package Encode::CN::HZ; |
2 | |
00a464f7 |
3 | use strict; |
00a464f7 |
4 | |
eb042f38 |
5 | use vars qw($VERSION); |
d1256cb1 |
6 | $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
eb042f38 |
7 | |
8676e7d3 |
8 | use Encode qw(:fallbacks); |
10c5ecbb |
9 | |
10 | use base qw(Encode::Encoding); |
11 | __PACKAGE__->Define('hz'); |
c0d88b76 |
12 | |
8676e7d3 |
13 | # HZ is a combination of ASCII and escaped GB, so we implement it |
14 | # with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843. |
10c5ecbb |
15 | |
8676e7d3 |
16 | # not ported for EBCDIC. Which should be used, "~" or "\x7E"? |
c0d88b76 |
17 | |
d1256cb1 |
# Always returns true.  decode() relies on this (see its closing comment) to
# assume that it is never handed a partial character.
sub needs_lines {
    return 1;
}
0ab8f81e |
19 | |
d1256cb1 |
# Decode HZ-encoded octets into a Perl string.
#
# Parameters:
#   $obj - the encoding object (not used by this routine).
#   $str - the HZ-encoded input.
#   $chk - optional CHECK value; forwarded to the gb2312-raw decoder.
#
# Returns the decoded text.  Decoding stops at the first invalid escape
# sequence or out-of-range byte, and whatever was decoded up to that point
# is returned.  When $chk is true, the caller's source argument is emptied.
sub decode ($$;$) {
    my ( $obj, $str, $chk ) = @_;

    my $GB       = Encode::find_encoding('gb2312-raw');
    my $ret      = '';
    my $in_ascii = 1;    # default mode is ASCII.

    while ( length $str ) {
        if ($in_ascii) {    # ASCII mode
            if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) {    # no '~' => ASCII
                $ret .= $1;

                # EBCDIC should need ascii2native, but not ported.
            }
            elsif ( $str =~ s/^\x7E\x7E// ) {           # escaped tilde
                $ret .= '~';
            }
            elsif ( $str =~ s/^\x7E\cJ// ) {    # '\cJ' == LF in ASCII
                1;                              # line continuation: no-op
            }
            elsif ( $str =~ s/^\x7E\x7B// ) {    # '~{'
                $in_ascii = 0;                   # to GB
            }
            else {    # encounters an invalid escape, \x80 or greater
                last;
            }
        }
        else {        # GB mode; the byte ranges are as in RFC 1843.
            no warnings 'uninitialized';    # $chk may be undef
            if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) {

                # Copy the capture before handing it over: an Encode
                # decoder may modify its source argument in place when
                # CHECK is set, and $1 is a magical variable that must
                # not be used as such a target.
                my $gb_octets = $1;
                $ret .= $GB->decode( $gb_octets, $chk );
            }
            elsif ( $str =~ s/^\x7E\x7D// ) {    # '~}'
                $in_ascii = 1;
            }
            else {                               # invalid
                last;
            }
        }
    }
    $_[1] = '' if $chk;    # needs_lines guarantees no partial character
    return $ret;
}
63 | |
# Decode HZ octets from $src, starting at byte offset $pos, appending the
# decoded text to the destination until the terminator $trm is produced.
#
# Arguments (positional, read from @_):
#   0: $obj - the encoding object (not used by this routine).
#   1: destination string; decoded text is appended to it in place.
#   2: $src - the HZ-encoded source.
#   3: $pos - offset into $src at which to start; updated in place to the
#             offset just past the consumed input.
#   4: $trm - terminator string to look for in the decoded output.
#   5: $chk - optional CHECK value forwarded to the gb2312-raw decoder.
#
# Returns 1 when the terminator was found, or '' when the input ran out or
# an invalid sequence was met first.  In both cases the text decoded so far
# is appended to the destination and the position is updated.
sub cat_decode {
    my ( $obj, undef, $src, $pos, $trm, $chk ) = @_;
    my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ];

    # bytes::length is only callable once the bytes module is loaded.
    require bytes;

    my $GB       = Encode::find_encoding('gb2312-raw');
    my $ret      = '';
    my $in_ascii = 1;    # default mode is ASCII.

    # Remember the caller's regex position on the source so that it can be
    # put back before returning.
    my $ini_pos = pos($$rsrc);

    substr( $src, 0, $pos ) = '';

    my $ini_len = bytes::length($src);

    # $trm is the first of the pair '~~', then 2nd tilde is to be removed.
    # XXX: Is better C<$src =~ s/^\x7E// or die if ...>?
    $src =~ s/^\x7E// if $trm eq "\x7E";

    while ( length $src ) {
        my $now;
        if ($in_ascii) {    # ASCII mode
            if ( $src =~ s/^([\x00-\x7D\x7F])// ) {    # no '~' => ASCII
                $now = $1;
            }
            elsif ( $src =~ s/^\x7E\x7E// ) {          # escaped tilde
                $now = '~';
            }
            elsif ( $src =~ s/^\x7E\cJ// ) {    # '\cJ' == LF in ASCII
                next;
            }
            elsif ( $src =~ s/^\x7E\x7B// ) {    # '~{'
                $in_ascii = 0;                   # to GB
                next;
            }
            else {    # encounters an invalid escape, \x80 or greater
                last;
            }
        }
        else {        # GB mode; the byte ranges are as in RFC 1843.
                      # The second byte stops at \x7E, matching decode().
            if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) {

                # Copy the capture: the decoder may modify its source
                # argument in place when CHECK is set, and $1 must not be
                # used as such a target.  An undefined $chk is passed on
                # as 0 so that no 'uninitialized' warning is raised.
                my $gb_octets = $1;
                $now = $GB->decode( $gb_octets, $chk || 0 );
            }
            elsif ( $src =~ s/^\x7E\x7D// ) {    # '~}'
                $in_ascii = 1;
                next;
            }
            else {                               # invalid
                last;
            }
        }

        next if !defined $now;

        $ret .= $now;

        if ( $now eq $trm ) {
            $$rdst .= $ret;
            $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
            pos($$rsrc) = $ini_pos;
            return 1;
        }
    }

    $$rdst .= $ret;
    $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
    pos($$rsrc) = $ini_pos;
    return '';    # terminator not found
}
132 | |
d1256cb1 |
# Encode a Perl string into HZ octets.
#
# Parameters:
#   $obj - the encoding object (not used by this routine).
#   $str - the text to encode.
#   $chk - optional CHECK value; forwarded to the gb2312-raw encoder.
#
# Returns the HZ-encoded octets.  Runs of ASCII are emitted as-is with each
# '~' doubled; other characters are encoded one at a time with gb2312-raw
# and wrapped in '~{' ... '~}'.  Encoding stops early if the GB encoder
# returns undef or if no further character can be extracted.  When $chk is
# true, the caller's source argument is set to the unprocessed remainder.
sub encode($$;$) {
    my ( $obj, $str, $chk ) = @_;

    my $GB       = Encode::find_encoding('gb2312-raw');
    my $ret      = '';
    my $in_ascii = 1;    # default mode is ASCII.

    no warnings 'utf8';  # $str may be malformed UTF8 at the end of a chunk.

    while ( length $str ) {
        if ( $str =~ s/^([[:ascii:]]+)// ) {
            my $tmp = $1;
            $tmp =~ s/~/~~/g;    # escapes tildes
            if ( !$in_ascii ) {
                $ret .= "\x7E\x7D";    # '~}'
                $in_ascii = 1;
            }
            $ret .= pack 'a*', $tmp;    # remove UTF8 flag.
        }
        elsif ( $str =~ s/(.)// ) {
            my $s = $1;

            # Pass an undefined $chk on as 0 so that the encoder does not
            # raise an 'uninitialized' warning; a code reference or a
            # non-zero flag value is forwarded unchanged.
            my $tmp = $GB->encode( $s, $chk || 0 );
            last if !defined $tmp;
            if ( length $tmp == 2 ) {    # maybe a valid GB char (XXX)
                if ($in_ascii) {
                    $ret .= "\x7E\x7B";    # '~{'
                    $in_ascii = 0;
                }
                $ret .= $tmp;
            }
            elsif ( length $tmp ) {        # maybe FALLBACK in ASCII (XXX)
                if ( !$in_ascii ) {
                    $ret .= "\x7E\x7D";    # '~}'
                    $in_ascii = 1;
                }
                $ret .= $tmp;
            }
        }
        else {    # if $str is malformed UTF8 *and* if length $str != 0.
            last;
        }
    }
    $_[1] = $str if $chk;

    # The state at the end of the chunk is discarded, even if in GB mode.
    # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
    # Perhaps it is harmless, but further investigation may be required...

    if ( !$in_ascii ) {
        $ret .= "\x7E\x7D";    # '~}'
        $in_ascii = 1;
    }
    return $ret;
}
187 | |
188 | 1; |
189 | __END__ |
67d7b5ef |
190 | |
67d7b5ef |
191 | =head1 NAME |
192 | |
193 | Encode::CN::HZ -- internally used by Encode::CN |
194 | |
195 | =cut |