File::Basename doesn't lazy load Carp right.
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Encoding.pm
CommitLineData
18586f54 1package Encode::Encoding;
2# Base class for classes which implement encodings
3use strict;
621b0f8d 4our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5
6require Encode;
18586f54 7
# Register an encoding object under its canonical name.
#
# Invocant:  a class name or an already-constructed encoding object.
# Arguments: the canonical name, followed by any further arguments,
#            which are handed on unchanged to Encode::define_encoding
#            (the SYNOPSIS passes an alias there).
# Returns:   whatever Encode::define_encoding returns.
sub Define {
    my ($invocant, $canonical) = splice @_, 0, 2;

    # Called as a class method: build a minimal object that remembers
    # its canonical name.  An existing object is registered as-is.
    if (!ref $invocant) {
        $invocant = bless { Name => $canonical }, $invocant;
    }

    # Debugging aid: warn "$canonical => $invocant\n";
    return Encode::define_encoding($invocant, $canonical, @_);
}
16
# Accessor: returns the value stored under the 'Name' key of the
# invocant (set to the canonical name by Define()).
sub name {
    return shift()->{Name};
}
# Placeholder for encodings with state: hands back the invocant itself.
sub new_sequence {
    my ($self) = @_;
    return $self;
}
19
# Default answer to "does this encoding need line buffering?": false.
sub needs_lines {
    return 0;
}
21
# Reports whether PerlIO::encoding can be loaded.
# Returns 1 when the require succeeds, 0 when it raises an error.
sub perlio_ok {
    eval { require PerlIO::encoding };
    return 0 if $@;    # the require died
    return 1;
}
18586f54 26
27# Temporary legacy methods
# Legacy alias: forwards all arguments to the invocant's decode().
sub toUnicode {
    my $self = shift;
    return $self->decode(@_);
}

# Legacy alias: forwards all arguments to the invocant's encode().
sub fromUnicode {
    my $self = shift;
    return $self->encode(@_);
}
30
10c5ecbb 31#
32# Needs to be overridden by subclasses; otherwise these just croak
33#
18586f54 34
# Default encode(): a stub that always croaks, naming the invoking
# class.  Works whether called on a class name or on an object.
sub encode {
    require Carp;    # loaded only when this error path is reached

    my $invocant = shift;
    my $class    = $invocant;
    $class = ref $invocant if ref $invocant;

    Carp::croak($class, "->encode() not defined!");
}
0ab8f81e 41
# Default decode(): a stub that always croaks, naming the invoking
# class.  Works whether called on a class name or on an object.
#
# Raises: a Carp::croak error of the form "<class>->decode() not defined!".
sub decode {
    require Carp;    # loaded only when this error path is reached
    my $obj   = shift;
    my $class = ref($obj) ? ref($obj) : $obj;

    # The message must name decode(), the method that is actually missing.
    Carp::croak($class, "->decode() not defined!");
}
6d1c0808 48
# No-op destructor: nothing is cleaned up here.
sub DESTROY {
    return;
}
50
18586f54 511;
52__END__
1b2c56c8 53
54=head1 NAME
55
56Encode::Encoding - Encode Implementation Base Class
57
58=head1 SYNOPSIS
59
60 package Encode::MyEncoding;
61 use base qw(Encode::Encoding);
62
63 __PACKAGE__->Define(qw(myCanonical myAlias));
64
5129552c 65=head1 DESCRIPTION
1b2c56c8 66
67As mentioned in L<Encode>, encodings are (in the current
10c5ecbb 68implementation at least) defined as objects. The mapping of encoding
69name to object is via the C<%Encode::Encoding> hash. Though you can
70directly manipulate this hash, it is strongly encouraged to use this
71base class module and add encode() and decode() methods.
1b2c56c8 72
10c5ecbb 73=head2 Methods you should implement
1b2c56c8 74
10c5ecbb 75You are strongly encouraged to implement methods below, at least
76either encode() or decode().
1b2c56c8 77
78=over 4
79
10c5ecbb 80=item -E<gt>encode($string [,$check])
1b2c56c8 81
0ab8f81e 82MUST return the octet sequence representing I<$string>.
83
84=over 2
85
86=item *
87
88If I<$check> is true, it SHOULD modify I<$string> in place to remove
89the converted part (i.e. the whole string unless there is an error).
90If perlio_ok() is true, SHOULD becomes MUST.
91
92=item *
93
94If an error occurs, it SHOULD return the octet sequence for the
95fragment of string that has been converted and modify $string in-place
96to remove the converted part leaving it starting with the problem
97fragment. If perlio_ok() is true, SHOULD becomes MUST.
98
99=item *
1b2c56c8 100
0ab8f81e 101If I<$check> is false then C<encode> MUST make a "best effort" to
102convert the string - for example, by using a replacement character.
103
104=back
1b2c56c8 105
10c5ecbb 106=item -E<gt>decode($octets [,$check])
1b2c56c8 107
0ab8f81e 108MUST return the string that I<$octets> represents.
109
110=over 2
111
112=item *
113
114If I<$check> is true, it SHOULD modify I<$octets> in place to remove
115the converted part (i.e. the whole sequence unless there is an
116error). If perlio_ok() is true, SHOULD becomes MUST.
117
118=item *
1b2c56c8 119
0ab8f81e 120If an error occurs, it SHOULD return the fragment of string that has
121been converted and modify $octets in-place to remove the converted
122part leaving it starting with the problem fragment. If perlio_ok() is
123true, SHOULD becomes MUST.
124
125=item *
126
127If I<$check> is false then C<decode> should make a "best effort" to
1b2c56c8 128convert the string - for example by using Unicode's "\x{FFFD}" as a
129replacement character.
130
131=back
132
10c5ecbb 133=head2 Other methods defined in Encode::Encoding
134
135You do not need to override the methods shown below unless your encoding requires it.
136
137=over 4
138
139=item -E<gt>name
140
141Predefined As:
142
143 sub name { return shift->{'Name'} }
144
145MUST return the string representing the canonical name of the encoding.
146
147=item -E<gt>new_sequence
148
149Predefined As:
150
151 sub new_sequence { return $_[0] }
152
153This is a placeholder for encodings with state. It should return an
154object which implements this interface. All current implementations
155return the original object.
156
0ab8f81e 157=item -E<gt>perlio_ok()
158
10c5ecbb 159Predefined As:
011b2d2f 160
10c5ecbb 161 sub perlio_ok {
162 eval{ require PerlIO::encoding };
163 return $@ ? 0 : 1;
164 }
0ab8f81e 165
10c5ecbb 166If your encoding does not support PerlIO for some reason, just define:
0ab8f81e 167
168 sub perlio_ok { 0 }
169
170=item -E<gt>needs_lines()
171
10c5ecbb 172Predefined As:
173
174 sub needs_lines { 0 };
175
0ab8f81e 176If your encoding can work with PerlIO but needs line buffering, you
177MUST define this method so it returns true. 7bit ISO-2022 encodings
178are one example that needs this. When this method is missing, false
179is assumed.
180
181=back
182
10c5ecbb 183=head2 Example: Encode::ROT13
184
185 package Encode::ROT13;
186 use strict;
187 use base qw(Encode::Encoding);
188
189 __PACKAGE__->Define('rot13');
190
191 sub encode($$;$){
192 my ($obj, $str, $chk) = @_;
193 $str =~ tr/A-Za-z/N-ZA-Mn-za-m/;
194 $_[1] = '' if $chk; # this is what in-place edit means
195 return $str;
196 }
197
198 # Jr pna or ynml yvxr guvf;
199 *decode = \&encode;
200
201 1;
202
203=head1 Why the heck is the Encode API different?
204
0ab8f81e 205It should be noted that the I<$check> behaviour is different from the
1b2c56c8 206outer public API. The logic is that the "unchecked" case is useful
0ab8f81e 207when the encoding is part of a stream which may be reporting errors
208(e.g. STDERR). In such cases, it is desirable to get everything
1b2c56c8 209through somehow without causing additional errors which obscure the
0ab8f81e 210original one. Also, the encoding is best placed to know what the
1b2c56c8 211correct replacement character is, so if that is the desired behaviour
212then letting low level code do it is the most efficient.
213
0ab8f81e 214By contrast, if I<$check> is true, the scheme above allows the
215encoding to do as much as it can and tell the layer above how much
216that was. What is lacking at present is a mechanism to report what
217went wrong. The most likely interface will be an additional method
218call to the object, or perhaps (to avoid forcing per-stream objects
219on otherwise stateless encodings) an additional parameter.
1b2c56c8 220
221It is also highly desirable that encoding classes inherit from
222C<Encode::Encoding> as a base class. This allows that class to define
10c5ecbb 223additional behaviour for all encoding objects.
1b2c56c8 224
225 package Encode::MyEncoding;
226 use base qw(Encode::Encoding);
227
228 __PACKAGE__->Define(qw(myCanonical myAlias));
229
0ab8f81e 230This creates an object with C<< bless {Name => ...}, $class >> and calls
1b2c56c8 231define_encoding. Such classes inherit their C<name> method from
232C<Encode::Encoding>.
233
234=head2 Compiled Encodings
235
0ab8f81e 236For the sake of speed and efficiency, most of the encodings are now
237supported via a I<compiled form>: XS modules generated from UCM
238files. Encode provides the enc2xs tool to achieve that. Please see
67d7b5ef 239L<enc2xs> for more details.
1b2c56c8 240
67d7b5ef 241=head1 SEE ALSO
1b2c56c8 242
67d7b5ef 243L<perlmod>, L<enc2xs>
1b2c56c8 244
0ab8f81e 245=begin future
f2a2953c 246
247=over 4
248
249=item Scheme 1
250
0ab8f81e 251The fixup routine gets passed the remaining fragment of string being
252processed. It modifies it in place to remove bytes/characters it can
253understand and returns a string used to represent them. For example:
f2a2953c 254
255 sub fixup {
256 my $ch = substr($_[0],0,1,'');
257 return sprintf("\\x{%02X}",ord($ch));
258 }
259
0ab8f81e 260This scheme is close to how the underlying C code for Encode works,
261but gives the fixup routine very little context.
f2a2953c 262
263=item Scheme 2
264
0ab8f81e 265The fixup routine gets passed the original string, an index into
266it of the problem area, and the output string so far. It appends
267what it wants to the output string and returns a new index into the
268original string. For example:
f2a2953c 269
270 sub fixup {
271 # my ($s,$i,$d) = @_;
272 my $ch = substr($_[0],$_[1],1);
273 $_[2] .= sprintf("\\x{%02X}",ord($ch));
274 return $_[1]+1;
275 }
276
277This scheme gives maximal control to the fixup routine but is more
0ab8f81e 278complicated to code, and may require that the internals of Encode be tweaked to
279keep the original string intact.
f2a2953c 280
281=item Other Schemes
282
0ab8f81e 283Hybrids of the above.
f2a2953c 284
285Multiple return values rather than in-place modifications.
286
287Index into the string could be C<pos($str)> allowing C<s/\G...//>.
288
289=back
290
0ab8f81e 291=end future
292
1b2c56c8 293=cut