ext/Encode/lib/Encode/Encoding.pm

   1 package Encode::Encoding;
   2 # Base class for classes which implement encodings
   3 use strict;
   4 our $VERSION = do { my @r = (q$Revision: 1.28 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   5
   6 sub Define   # Class/instance method: register an encoding under $canonical and any aliases.
   7 {
   8     my ($obj, $canonical, @aliases) = @_;
   9     # Load Encode on demand so this works even if the caller never loaded it.
  10     require Encode;
  11     $obj = bless { Name => $canonical }, $obj unless ref $obj;   # class name => new instance
  12     return Encode::define_encoding($obj, $canonical, @aliases);  # result passed through
  13 }
  14
  15 sub name { my ($self) = @_; return $self->{'Name'}; }   # value stored under the 'Name' key
  16
  17 # Temporary legacy methods
  18 sub toUnicode { my $self = shift; return $self->decode(@_); }   # legacy alias: forwards all arguments to decode()
  19 sub fromUnicode { my $self = shift; return $self->encode(@_); }   # legacy alias: forwards all arguments to encode()
  20
  21 sub new_sequence { my ($self) = @_; return $self; }   # hands back its invocant unchanged
  22
  23 sub perlio_ok { return 0; }   # base-class default: 0
  24
  25 sub needs_lines { return 0; }   # base-class default: 0
  26
  27 sub DESTROY { return; }   # intentionally does nothing
  28
  29 1;
  30 __END__
  31
  32 =head1 NAME
  33
  34 Encode::Encoding - Encode Implementation Base Class
  35
  36 =head1 SYNOPSIS
  37
  38   package Encode::MyEncoding;
  39   use base qw(Encode::Encoding);
  40
  41   __PACKAGE__->Define(qw(myCanonical myAlias));
  42
  43 =head1 DESCRIPTION
  44
  45 As mentioned in L<Encode>, encodings are (in the current
  46 implementation at least) defined by objects. The mapping of encoding
  47 name to object is via the C<%encodings> hash.
  48
  49 The values of the hash can currently be either strings or objects.
  50 The string form may go away in the future. The string form occurs
  51 when C<encodings()> has scanned C<@INC> for loadable encodings but has
  52 not actually loaded the encoding in question. This is because the
  53 current "loading" process is all Perl and a bit slow.
  54
  55 Once an encoding is loaded, the value of the hash is the object which
  56 implements the encoding. The object should provide the following
  57 interface:
  58
  59 =over 4
  60
  61 =item -E<gt>name
  62
  63 MUST return the string representing the canonical name of the encoding.
  64
  65 =item -E<gt>new_sequence
  66
  67 This is a placeholder for encodings with state. It should return an
  68 object which implements this interface.  All current implementations
  69 return the original object.
  70
  71 =item -E<gt>encode($string,$check)
  72
  73 MUST return the octet sequence representing I<$string>.
  74
  75 =over 2
  76
  77 =item *
  78
  79 If I<$check> is true, it SHOULD modify I<$string> in place to remove
  80 the converted part (i.e.  the whole string unless there is an error).
  81 If perlio_ok() is true, SHOULD becomes MUST.
  82
  83 =item *
  84
  85 If an error occurs, it SHOULD return the octet sequence for the
  86 fragment of string that has been converted and modify I<$string> in-place
  87 to remove the converted part leaving it starting with the problem
  88 fragment.  If perlio_ok() is true, SHOULD becomes MUST.
  89
  90 =item *
  91
  92 If I<$check> is false then C<encode> MUST make a "best effort" to
  93 convert the string - for example, by using a replacement character.
  94
  95 =back
  96
  97 =item -E<gt>decode($octets,$check)
  98
  99 MUST return the string that I<$octets> represents.
 100
 101 =over 2
 102
 103 =item *
 104
 105 If I<$check> is true, it SHOULD modify I<$octets> in place to remove
 106 the converted part (i.e.  the whole sequence unless there is an
 107 error).  If perlio_ok() is true, SHOULD becomes MUST.
 108
 109 =item *
 110
 111 If an error occurs, it SHOULD return the fragment of string that has
 112 been converted and modify I<$octets> in-place to remove the converted
 113 part leaving it starting with the problem fragment.  If perlio_ok() is
 114 true, SHOULD becomes MUST.
 115
 116 =item *
 117
 118 If I<$check> is false then C<decode> should make a "best effort" to
 119 convert the string - for example by using Unicode's "\x{FFFD}" as a
 120 replacement character.
 121
 122 =back
 123
 124 =item -E<gt>perlio_ok()
 125
 126 If you want your encoding to work with PerlIO, you MUST define this
 127 method so that it returns 1 when PerlIO is enabled.  Here is an
 128 example:
 129
 130  sub perlio_ok {
 131      eval { require PerlIO::encoding };
 132      if ($@){
 133          return 0;
 134      }else{
 135          return 1;
 136      }
 137  }
 138
 139
 140 By default, this method is defined as follows:
 141
 142  sub perlio_ok { 0 }
 143
 144 =item -E<gt>needs_lines()
 145
 146 If your encoding can work with PerlIO but needs line buffering, you
 147 MUST define this method so it returns true.  7bit ISO-2022 encodings
 148 are one example that needs this.  When this method is missing, false
 149 is assumed.
 150
 151 =back
 152
 153 It should be noted that the I<$check> behaviour is different from the
 154 outer public API. The logic is that the "unchecked" case is useful
 155 when the encoding is part of a stream which may be reporting errors
 156 (e.g. STDERR).  In such cases, it is desirable to get everything
 157 through somehow without causing additional errors which obscure the
 158 original one. Also, the encoding is best placed to know what the
 159 correct replacement character is, so if that is the desired behaviour
 160 then letting low level code do it is the most efficient.
 161
 162 By contrast, if I<$check> is true, the scheme above allows the
 163 encoding to do as much as it can and tell the layer above how much
 164 that was. What is lacking at present is a mechanism to report what
 165 went wrong. The most likely interface will be an additional method
 166 call to the object, or perhaps (to avoid forcing per-stream objects
 167 on otherwise stateless encodings) an additional parameter.
 168
 169 It is also highly desirable that encoding classes inherit from
 170 C<Encode::Encoding> as a base class. This allows that class to define
 171 additional behaviour for all encoding objects. For example, built-in
 172 Unicode, UCS-2, and UTF-8 classes use
 173
 174   package Encode::MyEncoding;
 175   use base qw(Encode::Encoding);
 176
 177   __PACKAGE__->Define(qw(myCanonical myAlias));
 178
 179 to create an object with C<< bless {Name => ...}, $class >>, and call
 180 define_encoding.  They inherit their C<name> method from
 181 C<Encode::Encoding>.
 182
 183 =head2 Compiled Encodings
 184
 185 For the sake of speed and efficiency, most of the encodings are now
 186 supported via a I<compiled form>: XS modules generated from UCM
 187 files.   Encode provides the enc2xs tool to achieve that.  Please see
 188 L<enc2xs> for more details.
 189
 190 =head1 SEE ALSO
 191
 192 L<perlmod>, L<enc2xs>
 193
 194 =begin future
 195
 196 =over 4
 197
 198 =item Scheme 1
 199
 200 The fixup routine gets passed the remaining fragment of string being
 201 processed.  It modifies it in place to remove bytes/characters it can
 202 understand and returns a string used to represent them.  For example:
 203
 204  sub fixup {
 205    my $ch = substr($_[0],0,1,'');
 206    return sprintf("\\x{%02X}",ord($ch));
 207  }
 208
 209 This scheme is close to how the underlying C code for Encode works,
 210 but gives the fixup routine very little context.
 211
 212 =item Scheme 2
 213
 214 The fixup routine gets passed the original string, an index into
 215 it of the problem area, and the output string so far.  It appends
 216 what it wants to the output string and returns a new index into the
 217 original string.  For example:
 218
 219  sub fixup {
 220    # my ($s,$i,$d) = @_;
 221    my $ch = substr($_[0],$_[1],1);
 222    $_[2] .= sprintf("\\x{%02X}",ord($ch));
 223    return $_[1]+1;
 224  }
 225
 226 This scheme gives maximal control to the fixup routine but is more
 227 complicated to code, and may require that the internals of Encode be tweaked to
 228 keep the original string intact.
 229
 230 =item Other Schemes
 231
 232 Hybrids of the above.
 233
 234 Multiple return values rather than in-place modifications.
 235
 236 Index into the string could be C<pos($str)> allowing C<s/\G...//>.
 237
 238 =back
 239
 240 =end future
 241
 242 =cut