Commit | Line | Data |
c0d88b76 |
1 | package Encode::CN::HZ; |
2 | |
00a464f7 |
3 | use strict; |
656ebd29 |
4 | use warnings; |
00a464f7 |
5 | |
eb042f38 |
6 | use vars qw($VERSION); |
656ebd29 |
7 | $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
eb042f38 |
8 | |
8676e7d3 |
9 | use Encode qw(:fallbacks); |
10c5ecbb |
10 | |
11 | use base qw(Encode::Encoding); |
12 | __PACKAGE__->Define('hz'); |
c0d88b76 |
13 | |
8676e7d3 |
14 | # HZ is a combination of ASCII and escaped GB, so we implement it |
15 | # with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843. |
10c5ecbb |
16 | |
8676e7d3 |
17 | # not ported for EBCDIC. Which should be used, "~" or "\x7E"? |
c0d88b76 |
18 | |
d1256cb1 |
19 | sub needs_lines { 1 } |
0ab8f81e |
20 | |
d1256cb1 |
21 | sub decode ($$;$) { |
22 | my ( $obj, $str, $chk ) = @_; |
8676e7d3 |
23 | |
d1256cb1 |
24 | my $GB = Encode::find_encoding('gb2312-raw'); |
8676e7d3 |
25 | my $ret = ''; |
d1256cb1 |
26 | my $in_ascii = 1; # default mode is ASCII. |
27 | |
28 | while ( length $str ) { |
29 | if ($in_ascii) { # ASCII mode |
30 | if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) { # no '~' => ASCII |
31 | $ret .= $1; |
32 | |
33 | # EBCDIC should need ascii2native, but not ported. |
34 | } |
35 | elsif ( $str =~ s/^\x7E\x7E// ) { # escaped tilde |
36 | $ret .= '~'; |
37 | } |
38 | elsif ( $str =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII |
39 | 1; # no-op |
40 | } |
41 | elsif ( $str =~ s/^\x7E\x7B// ) { # '~{' |
42 | $in_ascii = 0; # to GB |
43 | } |
44 | else { # encounters an invalid escape, \x80 or greater |
45 | last; |
46 | } |
47 | } |
48 | else { # GB mode; the byte ranges are as in RFC 1843. |
49 | no warnings 'uninitialized'; |
50 | if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) { |
51 | $ret .= $GB->decode( $1, $chk ); |
52 | } |
53 | elsif ( $str =~ s/^\x7E\x7D// ) { # '~}' |
54 | $in_ascii = 1; |
55 | } |
56 | else { # invalid |
57 | last; |
58 | } |
59 | } |
8676e7d3 |
60 | } |
d1256cb1 |
61 | $_[1] = '' if $chk; # needs_lines guarantees no partial character |
8676e7d3 |
62 | return $ret; |
63 | } |
64 | |
65 | sub cat_decode { |
d1256cb1 |
66 | my ( $obj, undef, $src, $pos, $trm, $chk ) = @_; |
67 | my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ]; |
8676e7d3 |
68 | |
d1256cb1 |
69 | my $GB = Encode::find_encoding('gb2312-raw'); |
8676e7d3 |
70 | my $ret = ''; |
d1256cb1 |
71 | my $in_ascii = 1; # default mode is ASCII. |
8676e7d3 |
72 | |
73 | my $ini_pos = pos($$rsrc); |
74 | |
d1256cb1 |
75 | substr( $src, 0, $pos ) = ''; |
8676e7d3 |
76 | |
77 | my $ini_len = bytes::length($src); |
78 | |
79 | # $trm is the first of the pair '~~', then 2nd tilde is to be removed. |
80 | # XXX: Is better C<$src =~ s/^\x7E// or die if ...>? |
81 | $src =~ s/^\x7E// if $trm eq "\x7E"; |
82 | |
d1256cb1 |
83 | while ( length $src ) { |
84 | my $now; |
85 | if ($in_ascii) { # ASCII mode |
86 | if ( $src =~ s/^([\x00-\x7D\x7F])// ) { # no '~' => ASCII |
87 | $now = $1; |
88 | } |
89 | elsif ( $src =~ s/^\x7E\x7E// ) { # escaped tilde |
90 | $now = '~'; |
91 | } |
92 | elsif ( $src =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII |
93 | next; |
94 | } |
95 | elsif ( $src =~ s/^\x7E\x7B// ) { # '~{' |
96 | $in_ascii = 0; # to GB |
97 | next; |
98 | } |
99 | else { # encounters an invalid escape, \x80 or greater |
100 | last; |
101 | } |
102 | } |
103 | else { # GB mode; the byte ranges are as in RFC 1843. |
104 | if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) { |
105 | $now = $GB->decode( $1, $chk ); |
106 | } |
107 | elsif ( $src =~ s/^\x7E\x7D// ) { # '~}' |
108 | $in_ascii = 1; |
109 | next; |
110 | } |
111 | else { # invalid |
112 | last; |
113 | } |
114 | } |
115 | |
116 | next if !defined $now; |
117 | |
118 | $ret .= $now; |
119 | |
120 | if ( $now eq $trm ) { |
121 | $$rdst .= $ret; |
122 | $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); |
123 | pos($$rsrc) = $ini_pos; |
124 | return 1; |
125 | } |
8676e7d3 |
126 | } |
127 | |
128 | $$rdst .= $ret; |
129 | $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); |
130 | pos($$rsrc) = $ini_pos; |
d1256cb1 |
131 | return ''; # terminator not found |
c0d88b76 |
132 | } |
133 | |
d1256cb1 |
134 | sub encode($$;$) { |
135 | my ( $obj, $str, $chk ) = @_; |
8676e7d3 |
136 | |
d1256cb1 |
137 | my $GB = Encode::find_encoding('gb2312-raw'); |
8676e7d3 |
138 | my $ret = ''; |
d1256cb1 |
139 | my $in_ascii = 1; # default mode is ASCII. |
140 | |
141 | no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk. |
142 | |
143 | while ( length $str ) { |
144 | if ( $str =~ s/^([[:ascii:]]+)// ) { |
145 | my $tmp = $1; |
146 | $tmp =~ s/~/~~/g; # escapes tildes |
147 | if ( !$in_ascii ) { |
148 | $ret .= "\x7E\x7D"; # '~}' |
149 | $in_ascii = 1; |
150 | } |
151 | $ret .= pack 'a*', $tmp; # remove UTF8 flag. |
152 | } |
153 | elsif ( $str =~ s/(.)// ) { |
154 | my $s = $1; |
155 | my $tmp = $GB->encode( $s, $chk ); |
156 | last if !defined $tmp; |
157 | if ( length $tmp == 2 ) { # maybe a valid GB char (XXX) |
158 | if ($in_ascii) { |
159 | $ret .= "\x7E\x7B"; # '~{' |
160 | $in_ascii = 0; |
161 | } |
162 | $ret .= $tmp; |
163 | } |
164 | elsif ( length $tmp ) { # maybe FALLBACK in ASCII (XXX) |
165 | if ( !$in_ascii ) { |
166 | $ret .= "\x7E\x7D"; # '~}' |
167 | $in_ascii = 1; |
168 | } |
169 | $ret .= $tmp; |
170 | } |
171 | } |
172 | else { # if $str is malformed UTF8 *and* if length $str != 0. |
173 | last; |
174 | } |
00a464f7 |
175 | } |
8676e7d3 |
176 | $_[1] = $str if $chk; |
00a464f7 |
177 | |
d1256cb1 |
178 | # The state at the end of the chunk is discarded, even if in GB mode. |
179 | # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{". |
180 | # Parhaps it is harmless, but further investigations may be required... |
00a464f7 |
181 | |
d1256cb1 |
182 | if ( !$in_ascii ) { |
183 | $ret .= "\x7E\x7D"; # '~}' |
184 | $in_ascii = 1; |
8676e7d3 |
185 | } |
186 | return $ret; |
c0d88b76 |
187 | } |
188 | |
189 | 1; |
190 | __END__ |
67d7b5ef |
191 | |
67d7b5ef |
192 | =head1 NAME |
193 | |
194 | Encode::CN::HZ -- internally used by Encode::CN |
195 | |
196 | =cut |