X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2Flib%2FEncode%2FEncoding.pm;h=12f06557a55fb6e73038552da7ed8e5a322386c7;hb=011b2d2f95a2b6260e1a3409e652417bcc2b453d;hp=249b7fa2dd95b5c5797b43180b387a143fd98435;hpb=ee981de62537b59144b6bbbb2f3fb71c06a28fb8;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/lib/Encode/Encoding.pm b/ext/Encode/lib/Encode/Encoding.pm index 249b7fa..12f0655 100644 --- a/ext/Encode/lib/Encode/Encoding.pm +++ b/ext/Encode/lib/Encode/Encoding.pm @@ -1,7 +1,7 @@ package Encode::Encoding; # Base class for classes which implement encodings use strict; -our $VERSION = do { my @r = (q$Revision: 0.90 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.28 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; sub Define { @@ -9,7 +9,7 @@ sub Define my $canonical = shift; $obj = bless { Name => $canonical },$obj unless ref $obj; # warn "$canonical => $obj\n"; - Encode::define_encoding($obj, $canonical, @_); + Encode::define_encoding($obj, $canonical, @_); } sub name { shift->{'Name'} } @@ -20,7 +20,223 @@ sub fromUnicode { shift->encode(@_) } sub new_sequence { return $_[0] } +sub perlio_ok { 0 } + +sub needs_lines { 0 } + sub DESTROY {} 1; __END__ + +=head1 NAME + +Encode::Encoding - Encode Implementation Base Class + +=head1 SYNOPSIS + + package Encode::MyEncoding; + use base qw(Encode::Encoding); + + __PACKAGE__->Define(qw(myCanonical myAlias)); + +=head1 DESCRIPTION + +As mentioned in L<Encode>, encodings are (in the current +implementation at least) defined by objects. The mapping of encoding +name to object is via the C<%encodings> hash. + +The values of the hash can currently be either strings or objects. +The string form may go away in the future. The string form occurs +when C<encodings()> has scanned C<@INC> for loadable encodings but has +not actually loaded the encoding in question. This is because the +current "loading" process is all Perl and a bit slow.
+ +Once an encoding is loaded, the value of the hash is the object which +implements the encoding. The object should provide the following +interface: + +=over 4 + +=item -E<gt>name + +MUST return the string representing the canonical name of the encoding. + +=item -E<gt>new_sequence + +This is a placeholder for encodings with state. It should return an +object which implements this interface. All current implementations +return the original object. + +=item -E<gt>encode($string,$check) + +MUST return the octet sequence representing I<$string>. + +=over 2 + +=item * + +If I<$check> is true, it SHOULD modify I<$string> in place to remove +the converted part (i.e. the whole string unless there is an error). +If perlio_ok() is true, SHOULD becomes MUST. + +=item * + +If an error occurs, it SHOULD return the octet sequence for the +fragment of string that has been converted and modify $string in-place +to remove the converted part leaving it starting with the problem +fragment. If perlio_ok() is true, SHOULD becomes MUST. + +=item * + +If I<$check> is false then C<encode> MUST make a "best effort" to +convert the string - for example, by using a replacement character. + +=back + +=item -E<gt>decode($octets,$check) + +MUST return the string that I<$octets> represents. + +=over 2 + +=item * + +If I<$check> is true, it SHOULD modify I<$octets> in place to remove +the converted part (i.e. the whole sequence unless there is an +error). If perlio_ok() is true, SHOULD becomes MUST. + +=item * + +If an error occurs, it SHOULD return the fragment of string that has +been converted and modify $octets in-place to remove the converted +part leaving it starting with the problem fragment. If perlio_ok() is +true, SHOULD becomes MUST. + +=item * + +If I<$check> is false then C<decode> should make a "best effort" to +convert the string - for example by using Unicode's "\x{FFFD}" as a +replacement character.
+ +=back + +=item -E<gt>perlio_ok() + +If you want your encoding to work with PerlIO, you MUST define this +method so that it returns 1 when PerlIO is enabled. Here is an +example; + + sub perlio_ok { + eval { require PerlIO::encoding }; + if ($@){ + return 0; + }else{ + return 1; + } + } + + +By default, this method is defined as follows; + + sub perlio_ok { 0 } + +=item -E<gt>needs_lines() + +If your encoding can work with PerlIO but needs line buffering, you +MUST define this method so it returns true. 7bit ISO-2022 encodings +are one example that needs this. When this method is missing, false +is assumed. + +=back + +It should be noted that the I<$check> behaviour is different from the +outer public API. The logic is that the "unchecked" case is useful +when the encoding is part of a stream which may be reporting errors +(e.g. STDERR). In such cases, it is desirable to get everything +through somehow without causing additional errors which obscure the +original one. Also, the encoding is best placed to know what the +correct replacement character is, so if that is the desired behaviour +then letting low level code do it is the most efficient. + +By contrast, if I<$check> is true, the scheme above allows the +encoding to do as much as it can and tell the layer above how much +that was. What is lacking at present is a mechanism to report what +went wrong. The most likely interface will be an additional method +call to the object, or perhaps (to avoid forcing per-stream objects +on otherwise stateless encodings) an additional parameter. + +It is also highly desirable that encoding classes inherit from +C<Encode::Encoding> as a base class. This allows that class to define +additional behaviour for all encoding objects. For example, built-in +Unicode, UCS-2, and UTF-8 classes use + + package Encode::MyEncoding; + use base qw(Encode::Encoding); + + __PACKAGE__->Define(qw(myCanonical myAlias)); + +to create an object with C<< bless {Name => ...}, $class >>, and call +define_encoding.
They inherit their C<name> method from +C<Encode::Encoding>. + +=head2 Compiled Encodings + +For the sake of speed and efficiency, most of the encodings are now +supported via a I<compiled form>: XS modules generated from UCM +files. Encode provides the enc2xs tool to achieve that. Please see +L<enc2xs> for more details. + +=head1 SEE ALSO + +L<perlmod>, L<enc2xs> + +=begin future + +=over 4 + +=item Scheme 1 + +The fixup routine gets passed the remaining fragment of string being +processed. It modifies it in place to remove bytes/characters it can +understand and returns a string used to represent them. For example: + + sub fixup { + my $ch = substr($_[0],0,1,''); + return sprintf("\x{%02X}",ord($ch)); + } + +This scheme is close to how the underlying C code for Encode works, +but gives the fixup routine very little context. + +=item Scheme 2 + +The fixup routine gets passed the original string, an index into +it of the problem area, and the output string so far. It appends +what it wants to the output string and returns a new index into the +original string. For example: + + sub fixup { + # my ($s,$i,$d) = @_; + my $ch = substr($_[0],$_[1],1); + $_[2] .= sprintf("\x{%02X}",ord($ch)); + return $_[1]+1; + } + +This scheme gives maximal control to the fixup routine but is more +complicated to code, and may require that the internals of Encode be tweaked to +keep the original string intact. + +=item Other Schemes + +Hybrids of the above. + +Multiple return values rather than in-place modifications. + +Index into the string could be C<pos($str)> allowing C<s/\G...//>. + +=back + +=end future + +=cut