ext/Encode/lib/Encode/Encoding.pm

   1 package Encode::Encoding;
   2 # Base class for classes which implement encodings
   3 use strict;
   4 our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   5
   6 require Encode;
   7
   8 sub Define
   9 {
  10     my $obj = shift;
  11     my $canonical = shift;
  12     $obj = bless { Name => $canonical },$obj unless ref $obj;
  13     # warn "$canonical => $obj\n";
  14     Encode::define_encoding($obj, $canonical, @_);
  15 }
  16
  17 sub name  { return shift->{'Name'} }
  18
  19 sub renew { return $_[0] }
  20 *new_sequence = \&renew;
  21
  22 sub needs_lines { 0 };
  23
  24 sub perlio_ok {
  25     eval{ require PerlIO::encoding };
  26     return $@ ? 0 : 1;
  27 }
  28
  29 # (Temporary|legacy) methods
  30
  31 sub toUnicode    { shift->decode(@_) }
  32 sub fromUnicode  { shift->encode(@_) }
  33
  34 #
  35 # Needs to be overloaded or just croak
  36 #
  37
  38 sub encode {
  39     require Carp;
  40     my $obj = shift;
  41     my $class = ref($obj) ? ref($obj) : $obj;
  42     Carp::croak $class, "->encode() not defined!";
  43 }
  44
  45 sub decode{
  46     require Carp;
  47     my $obj = shift;
  48     my $class = ref($obj) ? ref($obj) : $obj;
  49     Carp::croak $class, "->encode() not defined!";
  50 }
  51
  52 sub DESTROY {}
  53
  54 1;
  55 __END__
  56
  57 =head1 NAME
  58
  59 Encode::Encoding - Encode Implementation Base Class
  60
  61 =head1 SYNOPSIS
  62
  63   package Encode::MyEncoding;
  64   use base qw(Encode::Encoding);
  65
  66   __PACKAGE__->Define(qw(myCanonical myAlias));
  67
  68 =head1 DESCRIPTION
  69
  70 As mentioned in L<Encode>, encodings are (in the current
  71 implementation at least) defined as objects. The mapping of encoding
  72 name to object is via the C<%Encode::Encoding> hash.  Though you can
  73 directly manipulate this hash, it is strongly encouraged to use this
  74 base class module and add encode() and decode() methods.
  75
  76 =head2 Methods you should implement
  77
  78 You are strongly encouraged to implement methods below, at least
  79 either encode() or decode().
  80
  81 =over 4
  82
  83 =item -E<gt>encode($string [,$check])
  84
  85 MUST return the octet sequence representing I<$string>.
  86
  87 =over 2
  88
  89 =item *
  90
  91 If I<$check> is true, it SHOULD modify I<$string> in place to remove
  92 the converted part (i.e.  the whole string unless there is an error).
  93 If perlio_ok() is true, SHOULD becomes MUST.
  94
  95 =item *
  96
  97 If an error occurs, it SHOULD return the octet sequence for the
  98 fragment of string that has been converted and modify $string in-place
  99 to remove the converted part leaving it starting with the problem
 100 fragment.  If perlio_ok() is true, SHOULD becomes MUST.
 101
 102 =item *
 103
 104 If I<$check> is is false then C<encode> MUST  make a "best effort" to
 105 convert the string - for example, by using a replacement character.
 106
 107 =back
 108
 109 =item -E<gt>decode($octets [,$check])
 110
 111 MUST return the string that I<$octets> represents.
 112
 113 =over 2
 114
 115 =item *
 116
 117 If I<$check> is true, it SHOULD modify I<$octets> in place to remove
 118 the converted part (i.e.  the whole sequence unless there is an
 119 error).  If perlio_ok() is true, SHOULD becomes MUST.
 120
 121 =item *
 122
 123 If an error occurs, it SHOULD return the fragment of string that has
 124 been converted and modify $octets in-place to remove the converted
 125 part leaving it starting with the problem fragment.  If perlio_ok() is
 126 true, SHOULD becomes MUST.
 127
 128 =item *
 129
 130 If I<$check> is false then C<decode> should make a "best effort" to
 131 convert the string - for example by using Unicode's "\x{FFFD}" as a
 132 replacement character.
 133
 134 =back
 135
 136 =back
 137
 138 If you want your encoding to work with L<encoding> pragma, you should
 139 also implement the method below.
 140
 141 =over 4
 142
 143 =item -E<gt>cat_decode($destination, $octets, $offset, $terminator [,$check])
 144
 145 MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>.
 146 Decoding will terminate when $terminator (a string) appears in output.
 147 I<$offset> will be modified to the last $octets position at end of decode.
 148 Returns true if $terminator appears output, else returns false.
 149
 150 =back
 151
 152 =head2 Other methods defined in Encode::Encodings
 153
 154 You do not have to override methods shown below unless you have to.
 155
 156 =over 4
 157
 158 =item -E<gt>name
 159
 160 Predefined As:
 161
 162   sub name  { return shift->{'Name'} }
 163
 164 MUST return the string representing the canonical name of the encoding.
 165
 166 =item -E<gt>renew
 167
 168 Predefined As:
 169
 170   sub renew { return $_[0] }
 171
 172 This method reconstructs the encoding object if necessary.  If you need
 173 to store the state during encoding, this is where you clone your object.
 174 Here is an example:
 175
 176   sub renew {
 177       my $self = shift;
 178       my $clone = bless { %$self } => ref($self);
 179       $clone->{clone} = 1; # so the caller can see it
 180       return $clone;
 181   }
 182
 183 Since most encodings are stateless the default behavior is just return
 184 itself as shown above.
 185
 186 PerlIO ALWAYS calls this method to make sure it has its own private
 187 encoding object.
 188
 189 =item -E<gt>perlio_ok()
 190
 191 Predefined As:
 192
 193   sub perlio_ok {
 194       eval{ require PerlIO::encoding };
 195       return $@ ? 0 : 1;
 196   }
 197
 198 If your encoding does not support PerlIO for some reasons, just;
 199
 200  sub perlio_ok { 0 }
 201
 202 =item -E<gt>needs_lines()
 203
 204 Predefined As:
 205
 206   sub needs_lines { 0 };
 207
 208 If your encoding can work with PerlIO but needs line buffering, you
 209 MUST define this method so it returns true.  7bit ISO-2022 encodings
 210 are one example that needs this.  When this method is missing, false
 211 is assumed.
 212
 213 =back
 214
 215 =head2 Example: Encode::ROT13
 216
 217   package Encode::ROT13;
 218   use strict;
 219   use base qw(Encode::Encoding);
 220
 221   __PACKAGE__->Define('rot13');
 222
 223   sub encode($$;$){
 224       my ($obj, $str, $chk) = @_;
 225       $str =~ tr/A-Za-z/N-ZA-Mn-za-m/;
 226       $_[1] = '' if $chk; # this is what in-place edit means
 227       return $str;
 228   }
 229
 230   # Jr pna or ynml yvxr guvf;
 231   *decode = \&encode;
 232
 233   1;
 234
 235 =head1 Why the heck Encode API is different?
 236
 237 It should be noted that the I<$check> behaviour is different from the
 238 outer public API. The logic is that the "unchecked" case is useful
 239 when the encoding is part of a stream which may be reporting errors
 240 (e.g. STDERR).  In such cases, it is desirable to get everything
 241 through somehow without causing additional errors which obscure the
 242 original one. Also, the encoding is best placed to know what the
 243 correct replacement character is, so if that is the desired behaviour
 244 then letting low level code do it is the most efficient.
 245
 246 By contrast, if I<$check> is true, the scheme above allows the
 247 encoding to do as much as it can and tell the layer above how much
 248 that was. What is lacking at present is a mechanism to report what
 249 went wrong. The most likely interface will be an additional method
 250 call to the object, or perhaps (to avoid forcing per-stream objects
 251 on otherwise stateless encodings) an additional parameter.
 252
 253 It is also highly desirable that encoding classes inherit from
 254 C<Encode::Encoding> as a base class. This allows that class to define
 255 additional behaviour for all encoding objects.
 256
 257   package Encode::MyEncoding;
 258   use base qw(Encode::Encoding);
 259
 260   __PACKAGE__->Define(qw(myCanonical myAlias));
 261
 262 to create an object with C<< bless {Name => ...}, $class >>, and call
 263 define_encoding.  They inherit their C<name> method from
 264 C<Encode::Encoding>.
 265
 266 =head2 Compiled Encodings
 267
 268 For the sake of speed and efficiency, most of the encodings are now
 269 supported via a I<compiled form>: XS modules generated from UCM
 270 files.   Encode provides the enc2xs tool to achieve that.  Please see
 271 L<enc2xs> for more details.
 272
 273 =head1 SEE ALSO
 274
 275 L<perlmod>, L<enc2xs>
 276
 277 =begin future
 278
 279 =over 4
 280
 281 =item Scheme 1
 282
 283 The fixup routine gets passed the remaining fragment of string being
 284 processed.  It modifies it in place to remove bytes/characters it can
 285 understand and returns a string used to represent them.  For example:
 286
 287  sub fixup {
 288    my $ch = substr($_[0],0,1,'');
 289    return sprintf("\x{%02X}",ord($ch);
 290  }
 291
 292 This scheme is close to how the underlying C code for Encode works,
 293 but gives the fixup routine very little context.
 294
 295 =item Scheme 2
 296
 297 The fixup routine gets passed the original string, an index into
 298 it of the problem area, and the output string so far.  It appends
 299 what it wants to the output string and returns a new index into the
 300 original string.  For example:
 301
 302  sub fixup {
 303    # my ($s,$i,$d) = @_;
 304    my $ch = substr($_[0],$_[1],1);
 305    $_[2] .= sprintf("\x{%02X}",ord($ch);
 306    return $_[1]+1;
 307  }
 308
 309 This scheme gives maximal control to the fixup routine but is more
 310 complicated to code, and may require that the internals of Encode be tweaked to
 311 keep the original string intact.
 312
 313 =item Other Schemes
 314
 315 Hybrids of the above.
 316
 317 Multiple return values rather than in-place modifications.
 318
 319 Index into the string could be C<pos($str)> allowing C<s/\G...//>.
 320
 321 =back
 322
 323 =end future
 324
 325 =cut