Upgrade to Encode 1.63.
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Encoding.pm
CommitLineData
18586f54 1package Encode::Encoding;
2# Base class for classes which implement encodings
3use strict;
10c5ecbb 4our $VERSION = do { my @r = (q$Revision: 1.29 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
18586f54 5
# Register an encoding under its canonical name.
#
# May be invoked on a class name or on an existing object:
#   * class name - a new object { Name => $canonical } is blessed into it;
#   * object     - the object is used as is.
# Any arguments after the canonical name (aliases, judging by the SYNOPSIS)
# are forwarded untouched to Encode::define_encoding(), whose result is
# returned to the caller.
sub Define {
    my ($proto, $canonical) = splice @_, 0, 2;
    my $encoding = ref($proto)
        ? $proto
        : bless({ Name => $canonical }, $proto);
    return Encode::define_encoding($encoding, $canonical, @_);
}
14
# Accessor: returns the value stored under the object's 'Name' key
# (the canonical name of the encoding, per the POD).
sub name {
    my ($self) = @_;
    return $self->{'Name'};
}
# Placeholder for encodings that keep state: this base version simply
# hands back the invocant itself.
sub new_sequence {
    my ($self) = @_;
    return $self;
}
17
# Whether the encoding needs line buffering when used with PerlIO.
# The base class answers false (0); encodings that need it override this.
sub needs_lines {
    return 0;
}
19
# Report whether PerlIO support is available for this encoding.
# Returns 1 if PerlIO::encoding can be loaded, 0 if loading it fails.
# The load happens inside eval so a failure is not fatal; $@ is left as
# set by that eval.
sub perlio_ok {
    eval { require PerlIO::encoding };
    return 0 if $@;
    return 1;
}
18586f54 24
25# Temporary legacy methods
# Legacy alias: delegates to the invocant's decode() method.  The remaining
# arguments are passed through as @_ itself, so in-place modification by
# decode() still reaches the caller's variables.
sub toUnicode {
    my $self = shift;
    return $self->decode(@_);
}
# Legacy alias: delegates to the invocant's encode() method.  The remaining
# arguments are passed through as @_ itself, so in-place modification by
# encode() still reaches the caller's variables.
sub fromUnicode {
    my $self = shift;
    return $self->encode(@_);
}
28
10c5ecbb 29#
# These must be overridden by subclasses; the versions here just croak
31#
18586f54 32
# Default encode(): always croaks, naming the class it was invoked on
# (the class of the object, or the class name itself when called as a
# class method).  Subclasses are expected to supply a real encode().
sub encode {
    require Carp;
    my $obj = shift;
    Carp::croak(ref($obj) || $obj, "->encode() not defined!");
}
0ab8f81e 39
# Default decode(): always croaks, naming the class it was invoked on
# (the class of the object, or the class name itself when called as a
# class method).  Subclasses are expected to supply a real decode().
#
# The message names decode(); it previously said "->encode() not defined!",
# which pointed users at the wrong missing method.
sub decode {
    require Carp;
    my $obj = shift;
    my $class = ref($obj) ? ref($obj) : $obj;
    Carp::croak($class, "->decode() not defined!");
}
6d1c0808 46
# Intentionally empty destructor: encoding objects need no cleanup here.
# NOTE(review): presumably defined so destruction never falls through to an
# AUTOLOAD in a subclass - confirm.
sub DESTROY {
    return;
}
48
18586f54 491;
50__END__
1b2c56c8 51
52=head1 NAME
53
54Encode::Encoding - Encode Implementation Base Class
55
56=head1 SYNOPSIS
57
58 package Encode::MyEncoding;
59 use base qw(Encode::Encoding);
60
61 __PACKAGE__->Define(qw(myCanonical myAlias));
62
5129552c 63=head1 DESCRIPTION
1b2c56c8 64
65As mentioned in L<Encode>, encodings are (in the current
10c5ecbb 66implementation at least) defined as objects. The mapping of encoding
67name to object is via the C<%Encode::Encoding> hash. Though you can
68directly manipulate this hash, it is strongly encouraged to use this
69base class module and add encode() and decode() methods.
1b2c56c8 70
10c5ecbb 71=head2 Methods you should implement
1b2c56c8 72
10c5ecbb 73You are strongly encouraged to implement methods below, at least
74either encode() or decode().
1b2c56c8 75
76=over 4
77
10c5ecbb 78=item -E<gt>encode($string [,$check])
1b2c56c8 79
0ab8f81e 80MUST return the octet sequence representing I<$string>.
81
82=over 2
83
84=item *
85
86If I<$check> is true, it SHOULD modify I<$string> in place to remove
87the converted part (i.e. the whole string unless there is an error).
88If perlio_ok() is true, SHOULD becomes MUST.
89
90=item *
91
92If an error occurs, it SHOULD return the octet sequence for the
93fragment of string that has been converted and modify $string in-place
94to remove the converted part leaving it starting with the problem
95fragment. If perlio_ok() is true, SHOULD becomes MUST.
96
97=item *
1b2c56c8 98
If I<$check> is false then C<encode> MUST make a "best effort" to
100convert the string - for example, by using a replacement character.
101
102=back
1b2c56c8 103
10c5ecbb 104=item -E<gt>decode($octets [,$check])
1b2c56c8 105
0ab8f81e 106MUST return the string that I<$octets> represents.
107
108=over 2
109
110=item *
111
112If I<$check> is true, it SHOULD modify I<$octets> in place to remove
113the converted part (i.e. the whole sequence unless there is an
114error). If perlio_ok() is true, SHOULD becomes MUST.
115
116=item *
1b2c56c8 117
0ab8f81e 118If an error occurs, it SHOULD return the fragment of string that has
119been converted and modify $octets in-place to remove the converted
120part leaving it starting with the problem fragment. If perlio_ok() is
121true, SHOULD becomes MUST.
122
123=item *
124
125If I<$check> is false then C<decode> should make a "best effort" to
1b2c56c8 126convert the string - for example by using Unicode's "\x{FFFD}" as a
127replacement character.
128
129=back
130
=head2 Other methods defined in Encode::Encoding
132
You do not need to override the methods shown below unless your encoding requires different behaviour.
134
135=over 4
136
137=item -E<gt>name
138
139Predefined As:
140
141 sub name { return shift->{'Name'} }
142
143MUST return the string representing the canonical name of the encoding.
144
145=item -E<gt>new_sequence
146
147Predefined As:
148
149 sub new_sequence { return $_[0] }
150
151This is a placeholder for encodings with state. It should return an
152object which implements this interface. All current implementations
153return the original object.
154
0ab8f81e 155=item -E<gt>perlio_ok()
156
10c5ecbb 157Predefined As:
011b2d2f 158
10c5ecbb 159 sub perlio_ok {
160 eval{ require PerlIO::encoding };
161 return $@ ? 0 : 1;
162 }
0ab8f81e 163
If your encoding does not support PerlIO for some reason, just define:
0ab8f81e 165
166 sub perlio_ok { 0 }
167
168=item -E<gt>needs_lines()
169
10c5ecbb 170Predefined As:
171
172 sub needs_lines { 0 };
173
0ab8f81e 174If your encoding can work with PerlIO but needs line buffering, you
175MUST define this method so it returns true. 7bit ISO-2022 encodings
176are one example that needs this. When this method is missing, false
177is assumed.
178
179=back
180
10c5ecbb 181=head2 Example: Encode::ROT13
182
183 package Encode::ROT13;
184 use strict;
185 use base qw(Encode::Encoding);
186
187 __PACKAGE__->Define('rot13');
188
189 sub encode($$;$){
190 my ($obj, $str, $chk) = @_;
191 $str =~ tr/A-Za-z/N-ZA-Mn-za-m/;
192 $_[1] = '' if $chk; # this is what in-place edit means
193 return $str;
194 }
195
196 # Jr pna or ynml yvxr guvf;
197 *decode = \&encode;
198
199 1;
200
201=head1 Why the heck Encode API is different?
202
0ab8f81e 203It should be noted that the I<$check> behaviour is different from the
1b2c56c8 204outer public API. The logic is that the "unchecked" case is useful
0ab8f81e 205when the encoding is part of a stream which may be reporting errors
206(e.g. STDERR). In such cases, it is desirable to get everything
1b2c56c8 207through somehow without causing additional errors which obscure the
0ab8f81e 208original one. Also, the encoding is best placed to know what the
1b2c56c8 209correct replacement character is, so if that is the desired behaviour
210then letting low level code do it is the most efficient.
211
0ab8f81e 212By contrast, if I<$check> is true, the scheme above allows the
213encoding to do as much as it can and tell the layer above how much
214that was. What is lacking at present is a mechanism to report what
215went wrong. The most likely interface will be an additional method
216call to the object, or perhaps (to avoid forcing per-stream objects
217on otherwise stateless encodings) an additional parameter.
1b2c56c8 218
219It is also highly desirable that encoding classes inherit from
220C<Encode::Encoding> as a base class. This allows that class to define
10c5ecbb 221additional behaviour for all encoding objects.
1b2c56c8 222
223 package Encode::MyEncoding;
224 use base qw(Encode::Encoding);
225
226 __PACKAGE__->Define(qw(myCanonical myAlias));
227
This creates an object with C<< bless {Name => ...}, $class >> and calls
define_encoding. Such classes inherit their C<name> method from
C<Encode::Encoding>.
231
232=head2 Compiled Encodings
233
0ab8f81e 234For the sake of speed and efficiency, most of the encodings are now
235supported via a I<compiled form>: XS modules generated from UCM
236files. Encode provides the enc2xs tool to achieve that. Please see
67d7b5ef 237L<enc2xs> for more details.
1b2c56c8 238
67d7b5ef 239=head1 SEE ALSO
1b2c56c8 240
67d7b5ef 241L<perlmod>, L<enc2xs>
1b2c56c8 242
0ab8f81e 243=begin future
f2a2953c 244
245=over 4
246
247=item Scheme 1
248
0ab8f81e 249The fixup routine gets passed the remaining fragment of string being
250processed. It modifies it in place to remove bytes/characters it can
251understand and returns a string used to represent them. For example:
f2a2953c 252
    sub fixup {
        my $ch = substr($_[0],0,1,'');
        return sprintf("\\x{%02X}",ord($ch));
    }
257
0ab8f81e 258This scheme is close to how the underlying C code for Encode works,
259but gives the fixup routine very little context.
f2a2953c 260
261=item Scheme 2
262
0ab8f81e 263The fixup routine gets passed the original string, an index into
264it of the problem area, and the output string so far. It appends
265what it wants to the output string and returns a new index into the
266original string. For example:
f2a2953c 267
    sub fixup {
        # my ($s,$i,$d) = @_;
        my $ch = substr($_[0],$_[1],1);
        $_[2] .= sprintf("\\x{%02X}",ord($ch));
        return $_[1]+1;
    }
274
275This scheme gives maximal control to the fixup routine but is more
0ab8f81e 276complicated to code, and may require that the internals of Encode be tweaked to
277keep the original string intact.
f2a2953c 278
279=item Other Schemes
280
0ab8f81e 281Hybrids of the above.
f2a2953c 282
283Multiple return values rather than in-place modifications.
284
285Index into the string could be C<pos($str)> allowing C<s/\G...//>.
286
287=back
288
0ab8f81e 289=end future
290
1b2c56c8 291=cut