Fix a little syntax error, reported by Marcus Holland-Moritz
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
10c5ecbb 1#
b786ee6f 2# $Id: Encode.pm,v 1.98 2003/08/20 11:16:34 dankogai Exp dankogai $
10c5ecbb 3#
2c674647 4package Encode;
51ef4e11 5use strict;
ac6c51b8 6our $VERSION = do { my @r = (q$Revision: 1.9801 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
8f139f4c 7sub DEBUG () { 0 }
6d1c0808 8use XSLoader ();
10c5ecbb 9XSLoader::load(__PACKAGE__, $VERSION);
2c674647 10
2c674647 11require Exporter;
7e19fb92 12use base qw/Exporter/;
2c674647 13
4411f3b6 14# Public, encouraged API is exported by default
85982a32 15
16our @EXPORT = qw(
17 decode decode_utf8 encode encode_utf8
a0d8a30e 18 encodings find_encoding clone_encoding
4411f3b6 19);
20
b7a5c9de 21our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
af1f55d9 22 PERLQQ HTMLCREF XMLCREF);
b7a5c9de 23our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
af1f55d9 24 FB_PERLQQ FB_HTMLCREF FB_XMLCREF);
85982a32 25
51ef4e11 26our @EXPORT_OK =
6d1c0808 27 (
85982a32 28 qw(
29 _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
30 is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
31 ),
32 @FB_FLAGS, @FB_CONSTS,
33 );
34
6d1c0808 35our %EXPORT_TAGS =
85982a32 36 (
37 all => [ @EXPORT, @EXPORT_OK ],
38 fallbacks => [ @FB_CONSTS ],
39 fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
40 );
41
4411f3b6 42# Documentation moved after __END__ for speed - NI-S
2c674647 43
a63c962f 44our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 45
5d030b67 46use Encode::Alias;
47
5129552c 48# Make a %Encoding package variable to allow a certain amount of cheating
49our %Encoding;
aae85ceb 50our %ExtModule;
51require Encode::Config;
52eval { require Encode::ConfigLocal };
5129552c 53
# List encoding names, sorted case-insensitively.
#   Encode->encodings()           keys of %Encoding
#   Encode->encodings(":all")     keys of %Encoding and of %ExtModule
#   Encode->encodings("JP", ...)  keys of %Encoding plus the %ExtModule keys
#                                 whose module equals the argument
#                                 ("Encode::" is prepended when the argument
#                                 contains no "::")
# The names Internal, Unicode and Guess are never returned.
sub encodings
{
    my $class = shift;
    my %known = %Encoding;
    if (@_ and $_[0] eq ":all"){
        %known = ( %known, %ExtModule );
    }else{
        for my $arg (@_){
            my $module = $arg =~ m/::/o ? $arg : "Encode::$arg";
            DEBUG and warn $module;
            my @hits = grep { $ExtModule{$_} eq $module } keys %ExtModule;
            @known{@hits} = ($module) x @hits;
        }
    }
    my @names = grep { !/^(?:Internal|Unicode|Guess)$/o } keys %known;
    return sort { lc $a cmp lc $b } @names;
}
73
# Report whether an encoding can be used through PerlIO.
# Takes an encoding object, or a name that is looked up with
# find_encoding().  Returns whatever the object's perlio_ok() method
# returns when it has one, and 0 otherwise.  A name for which
# find_encoding() returns nothing also yields 0; previously that case
# died on the ->can call before the safety net could be reached.
sub perlio_ok{
    my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);
    defined $obj or return 0; # unknown encoding name
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0; # safety net
}
79
# Register an encoding object.
#   define_encoding($obj, $name, @aliases)
# Stores $obj in %Encoding under $name, aliases the lower-cased name to it
# when that differs from $name, then defines every further argument as an
# alias for $obj.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    define_alias($lc => $obj) unless $lc eq $name;
    for my $alias (@aliases){
        define_alias($alias, $obj);
    }
    return $obj;
}
93
# Look up an encoding object; this is the class-method form that
# find_encoding() calls.
#   $class          class on which find_alias() is called
#   $name           encoding name or alias, or an object
#   $skip_external  when true, modules listed in %ExtModule are not loaded
# Returns the encoding object, or nothing when none is found.
sub getEncoding
{
    my ($class, $name, $skip_external) = @_;

    # a reference that can renew() is handed straight back
    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external)
    {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
            $mod =~ s,::,/,g ; $mod .= '.pm';
            eval{ require $mod; };  # errors from require are ignored
            exists $Encoding{$name} and return $Encoding{$name};
            # The module may have been picked through the lower-cased key;
            # presumably it then registers under that spelling, so look
            # there too instead of failing the first lookup.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}
118
# Functional front end to getEncoding().
#   find_encoding($name [, $skip_external])
# Returns what __PACKAGE__->getEncoding($name, $skip_external) returns.
sub find_encoding($;$)
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}
124
# Map a name or alias to the name of the encoding object it resolves to.
# Returns the object's ->name, or nothing when find_encoding() finds no
# encoding.
sub resolve_alias($){
    my ($alias) = @_;
    my $enc = find_encoding($alias);
    return unless defined $enc;
    return $enc->name;
}
130
# Return a deep copy (Storable::dclone) of the encoding object found for
# the given name.  Returns nothing when the lookup does not yield a
# reference or when Storable cannot be loaded.
sub clone_encoding($){
    my ($name) = @_;
    my $enc = find_encoding($name);
    return unless ref $enc;
    eval { require Storable };
    return if $@;
    return Storable::dclone($enc);
}
138
# $octets = encode(ENCODING, $string [, CHECK])
# Looks ENCODING up with find_encoding() and returns the result of its
# ->encode method applied to a copy of $string.  Returns undef when
# $string is undefined; croaks when the encoding is unknown.  When CHECK
# is true, the caller's $string is overwritten with whatever ->encode
# left in the copy.
sub encode($$;$)
{
    my ($name, $string, $check) = @_;
    defined $string or return undef;
    $check ||= 0;
    my $enc = find_encoding($name);
    if (!defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $octets = $enc->encode($string,$check);
    $check and $_[1] = $string;
    return $octets;
}
153
# $string = decode(ENCODING, $octets [, CHECK])
# Looks ENCODING up with find_encoding() and returns the result of its
# ->decode method applied to a copy of $octets.  Returns undef when
# $octets is undefined; croaks when the encoding is unknown.  When CHECK
# is true, the caller's $octets is overwritten with whatever ->decode
# left in the copy.
sub decode($$;$)
{
    my ($name, $octets, $check) = @_;
    defined $octets or return undef;
    $check ||= 0;
    my $enc = find_encoding($name);
    if (!defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $string = $enc->decode($octets,$check);
    $check and $_[1] = $octets;
    return $string;
}
168
# [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
# Converts the caller's first argument in place: decodes a copy with
# FROM_ENC, encodes the result with TO_ENC and stores it back into $_[0].
# Returns the length of the converted string, or undef when the input is
# undefined, when CHECK is true and either step leaves data unprocessed,
# or when the converted string is undefined.  Croaks on an unknown
# encoding name (FROM_ENC is looked up first).
sub from_to($$$;$)
{
    my ($string, $from, $to, $check) = @_;
    defined $string or return undef;
    $check ||= 0;

    my $f = find_encoding($from);
    if (!defined $f){
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $t = find_encoding($to);
    if (!defined $t){
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }

    my $uni = $f->decode($string,$check);
    # leftover input after a checked decode means the conversion failed
    if ($check && length($string)){
        return undef;
    }
    $string = $t->encode($uni,$check);
    if ($check && length($uni)){
        return undef;
    }
    $_[0] = $string;
    return defined $_[0] ? length($string) : undef;
}
190
# $octets = encode_utf8($string)
# Returns a copy of $string after utf8::encode() has been applied to it;
# the argument itself is not modified.
sub encode_utf8($)
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}
197
# $string = decode_utf8($octets)
# Returns a copy of $octets after utf8::decode() has been applied to it,
# or undef when utf8::decode() reports failure.  The argument itself is
# not modified.
sub decode_utf8($)
{
    my $chars = $_[0];
    utf8::decode($chars) or return undef;
    return $chars;
}
204
b536bf57 205predefine_encodings(1);
f2a2953c 206
207#
208# This is to restore %Encoding if really needed;
209#
10c5ecbb 210
# (Re)install the built-in encodings into %Encode::Encoding.
#   predefine_encodings($use_xs)
# Installs the "Unicode" entry (Encode::UTF_EBCDIC when $ON_EBCDIC is
# true, Encode::Internal otherwise) and the "utf8" entry (Encode::utf8).
# With a true $use_xs, Encode::utf8's decode/encode are pointed at
# decode_xs/encode_xs; otherwise Perl closures built on
# Encode::decode_utf8/Encode::encode_utf8 are installed.
sub predefine_encodings{
    use Encode::Encoding;
    no warnings 'redefine';
    my $use_xs = shift;
    if ($ON_EBCDIC) {
        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            # map every character through utf8::unicode_to_native
            my $res = join '',
                map { chr(utf8::unicode_to_native(ord($_))) } split //, $str;
            $_[1] = '' if $chk;
            return $res;
        };
        *encode = sub{
            my ($obj,$str,$chk) = @_;
            # map every character through utf8::native_to_unicode
            my $res = join '',
                map { chr(utf8::native_to_unicode(ord($_))) } split //, $str;
            $_[1] = '' if $chk;
            return $res;
        };
        $Encode::Encoding{Unicode} =
            bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    } else {
        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;
            return $str;
        };
        # encoding and decoding are the same operation here
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
            bless {Name => "Internal"} => "Encode::Internal";
    }

    {
        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';
        #
        if ($use_xs){
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }else{
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            *decode = sub{
                my ($obj,$octets,$chk) = @_;
                my $str = Encode::decode_utf8($octets);
                defined $str or return undef;
                $_[1] = '' if $chk;
                return $str;
            };
            *encode = sub {
                my ($obj,$string,$chk) = @_;
                my $octets = Encode::encode_utf8($string);
                $_[1] = '' if $chk;
                return $octets;
            };
        }
        *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
            my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
            # references to the caller's $dst, $src and $pos
            my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
            use bytes;
            my $found = index($$rsrc, $trm, $pos);
            if ($found < 0) {
                # terminator absent: take everything from $pos onward
                $$rdst .= substr($$rsrc, $pos);
                $$rpos = length($$rsrc);
                return '';
            }
            # terminator present: take up to and including it
            my $end = $found + length($trm);
            $$rdst .= substr($$rsrc, $pos, $end - $pos);
            $$rpos = $end;
            return 1;
        };
        $Encode::Encoding{utf8} =
            bless {Name => "utf8"} => "Encode::utf8";
    }
}
299
656753f8 3001;
301
2a936312 302__END__
303
4411f3b6 304=head1 NAME
305
306Encode - character encodings
307
308=head1 SYNOPSIS
309
310 use Encode;
311
67d7b5ef 312=head2 Table of Contents
313
0ab8f81e 314Encode consists of a collection of modules whose details are too big
67d7b5ef 315to fit in one document. This POD itself explains the top-level APIs
6d1c0808 316and general topics at a glance. For other topics and more details,
0ab8f81e 317see the PODs below:
67d7b5ef 318
319 Name Description
320 --------------------------------------------------------
6d1c0808 321 Encode::Alias Alias definitions to encodings
67d7b5ef 322 Encode::Encoding Encode Implementation Base Class
323 Encode::Supported List of Supported Encodings
324 Encode::CN Simplified Chinese Encodings
325 Encode::JP Japanese Encodings
326 Encode::KR Korean Encodings
327 Encode::TW Traditional Chinese Encodings
328 --------------------------------------------------------
329
4411f3b6 330=head1 DESCRIPTION
331
47bfe92f 332The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 333and the rest of the system. Perl strings are sequences of
334B<characters>.
335
336The repertoire of characters that Perl can represent is at least that
337defined by the Unicode Consortium. On most platforms the ordinal
338value of a character (as returned by C<ord(ch)>) is the "Unicode
339codepoint" for the character (the exceptions are those platforms where
340the legacy encoding is some variant of EBCDIC rather than a super-set
341of ASCII - see L<perlebcdic>).
342
0ab8f81e 343Traditionally, computer data has been moved around in 8-bit chunks
67d7b5ef 344often called "bytes". These chunks are also known as "octets" in
345networking standards. Perl is widely used to manipulate data of many
346types - not only strings of characters representing human or computer
0ab8f81e 347languages but also "binary" data being the machine's representation of
67d7b5ef 348numbers, pixels in an image - or just about anything.
349
0ab8f81e 350When Perl is processing "binary data", the programmer wants Perl to
67d7b5ef 351process "sequences of bytes". This is not a problem for Perl - as a
0ab8f81e 352byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef 353"logical character".
354
355=head2 TERMINOLOGY
4411f3b6 356
7e19fb92 357=over 2
21938dfa 358
67d7b5ef 359=item *
360
361I<character>: a character in the range 0..(2**32-1) (or more).
362(What Perl's strings are made of.)
363
364=item *
365
366I<byte>: a character in the range 0..255
367(A special case of a Perl character.)
368
369=item *
370
371I<octet>: 8 bits of data, with ordinal values 0..255
0ab8f81e 372(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
67d7b5ef 373
374=back
4411f3b6 375
67d7b5ef 376=head1 PERL ENCODING API
4411f3b6 377
7e19fb92 378=over 2
4411f3b6 379
b7a5c9de 380=item $octets = encode(ENCODING, $string [, CHECK])
4411f3b6 381
0ab8f81e 382Encodes a string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 383a sequence of octets. ENCODING can be either a canonical name or
0ab8f81e 384an alias. For encoding names and aliases, see L</"Defining Aliases">.
385For CHECK, see L</"Handling Malformed Data">.
4411f3b6 386
b7a5c9de 387For example, to convert a string from Perl's internal format to
6d1c0808 388iso-8859-1 (also known as Latin1),
681a7c68 389
b7a5c9de 390 $octets = encode("iso-8859-1", $string);
7e19fb92 391
b7a5c9de 392B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
393B<may not be equal to> $string. Though they both contain the same data, the utf8 flag
7e19fb92 394for $octets is B<always> off. When you encode anything, utf8 flag of
395the result is always off, even when it contains completely valid utf8
396string. See L</"The UTF-8 flag"> below.
681a7c68 397
4089adc4 398encode($valid_encoding, undef) is harmless but warns you for
399C<Use of uninitialized value in subroutine entry>.
400encode($valid_encoding, '') is harmless and warnless.
401
b7a5c9de 402=item $string = decode(ENCODING, $octets [, CHECK])
4411f3b6 403
0ab8f81e 404Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
405internal form and returns the resulting string. As in encode(),
406ENCODING can be either a canonical name or an alias. For encoding names
407and aliases, see L</"Defining Aliases">. For CHECK, see
47bfe92f 408L</"Handling Malformed Data">.
409
b7a5c9de 410For example, to convert ISO-8859-1 data to a string in Perl's internal format:
681a7c68 411
b7a5c9de 412 $string = decode("iso-8859-1", $octets);
681a7c68 413
b7a5c9de 414B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
415B<may not be equal to> $octets. Though they both contain the same data,
416the utf8 flag for $string is on unless $octets entirely consists of
7e19fb92 417ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
418below.
47bfe92f 419
4089adc4 420decode($valid_encoding, undef) is harmless but warns you for
421C<Use of uninitialized value in subroutine entry>.
422decode($valid_encoding, '') is harmless and warnless.
423
b7a5c9de 424=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
7e19fb92 425
b7a5c9de 426Converts B<in-place> data between two encodings. The data in $octets
427must be encoded as octets and not as characters in Perl's internal
428format. For example, to convert ISO-8859-1 data to Microsoft's CP1250 encoding:
2b106fbe 429
b7a5c9de 430 from_to($octets, "iso-8859-1", "cp1250");
2b106fbe 431
432and to convert it back:
433
b7a5c9de 434 from_to($octets, "cp1250", "iso-8859-1");
4411f3b6 435
ab97ca19 436Note that because the conversion happens in place, the data to be
0ab8f81e 437converted cannot be a string constant; it must be a scalar variable.
ab97ca19 438
b7a5c9de 439from_to() returns the length of the converted string in octets on success, undef
3ef515df 440otherwise.
441
b7a5c9de 442B<CAVEAT>: The following operations look the same but are not quite so;
7e19fb92 443
b7a5c9de 444 from_to($data, "iso-8859-1", "utf8"); #1
7e19fb92 445 $data = decode("iso-8859-1", $data); #2
4411f3b6 446
b7a5c9de 447Both #1 and #2 make $data consist of a completely valid UTF-8 string
7e19fb92 448but only #2 turns utf8 flag on. #1 is equivalent to
f2a2953c 449
7e19fb92 450 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 451
7e19fb92 452See L</"The UTF-8 flag"> below.
f2a2953c 453
454=item $octets = encode_utf8($string);
455
7e19fb92 456Equivalent to C<$octets = encode("utf8", $string);> The characters
b7a5c9de 457that comprise $string are encoded in Perl's internal format and the
458result is returned as a sequence of octets. All possible
7e19fb92 459characters have a UTF-8 representation so this function cannot fail.
460
f2a2953c 461
462=item $string = decode_utf8($octets [, CHECK]);
463
7e19fb92 464Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
b7a5c9de 465The sequence of octets represented by
7e19fb92 466$octets is decoded from UTF-8 into a sequence of logical
467characters. Not all sequences of octets form valid UTF-8 encodings, so
468it is possible for this call to fail. For CHECK, see
469L</"Handling Malformed Data">.
f2a2953c 470
471=back
472
51ef4e11 473=head2 Listing available encodings
474
5129552c 475 use Encode;
476 @list = Encode->encodings();
477
478Returns a list of the canonical names of the available encodings that
479are loaded. To get a list of all available encodings including the
480ones that are not loaded yet, say
481
482 @all_encodings = Encode->encodings(":all");
483
0ab8f81e 484Or you can give the name of a specific module.
5129552c 485
c731e18e 486 @with_jp = Encode->encodings("Encode::JP");
487
488When "::" is not in the name, "Encode::" is assumed.
51ef4e11 489
c731e18e 490 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 491
0ab8f81e 492To find out in detail which encodings are supported by this package,
5d030b67 493see L<Encode::Supported>.
51ef4e11 494
495=head2 Defining Aliases
496
0ab8f81e 497To add a new alias to a given encoding, use:
67d7b5ef 498
5129552c 499 use Encode;
500 use Encode::Alias;
a63c962f 501 define_alias(newName => ENCODING);
51ef4e11 502
3ef515df 503After that, newName can be used as an alias for ENCODING.
f2a2953c 504ENCODING may be either the name of an encoding or an
505I<encoding object>
51ef4e11 506
fcb875d4 507But before you do so, make sure the alias is nonexistent with
508C<resolve_alias()>, which returns the canonical name thereof.
509i.e.
510
511 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
512 Encode::resolve_alias("iso-8859-12") # false; nonexistent
513 Encode::resolve_alias($name) eq $name # true if $name is canonical
514
0ab8f81e 515resolve_alias() does not need C<use Encode::Alias>; it can be
516exported via C<use Encode qw(resolve_alias)>.
fcb875d4 517
0ab8f81e 518See L<Encode::Alias> for details.
51ef4e11 519
85982a32 520=head1 Encoding via PerlIO
4411f3b6 521
b7a5c9de 522If your perl supports I<PerlIO> (which is the default), you can use a PerlIO layer to decode
0ab8f81e 523and encode directly via a filehandle. The following two examples
524are totally identical in their functionality.
4411f3b6 525
85982a32 526 # via PerlIO
527 open my $in, "<:encoding(shiftjis)", $infile or die;
528 open my $out, ">:encoding(euc-jp)", $outfile or die;
b7a5c9de 529 while(<$in>){ print $out $_; }
8e86646e 530
85982a32 531 # via from_to
0ab8f81e 532 open my $in, "<", $infile or die;
533 open my $out, ">", $outfile or die;
b7a5c9de 534 while(<$in>){
0ab8f81e 535 from_to($_, "shiftjis", "euc-jp", 1);
b7a5c9de 536 print $out $_;
85982a32 537 }
4411f3b6 538
b7a5c9de 539Unfortunately, it may be that encodings are not PerlIO-savvy. You can check
0ab8f81e 540if your encoding is supported by PerlIO by calling the C<perlio_ok>
541method.
542
543 Encode::perlio_ok("hz"); # False
544 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
545
546 use Encode qw(perlio_ok); # exported upon request
547 perlio_ok("euc-jp")
4411f3b6 548
0ab8f81e 549Fortunately, all encodings that come with Encode core are PerlIO-savvy
b7a5c9de 550except for hz and ISO-2022-kr. For gory details, see L<Encode::Encoding> and L<Encode::PerlIO>.
4411f3b6 551
85982a32 552=head1 Handling Malformed Data
4411f3b6 553
0ab8f81e 554The I<CHECK> argument is used as follows. When you omit it,
555the behaviour is the same as if you had passed a value of 0 for
556I<CHECK>.
47bfe92f 557
151b5d36 558=over 2
559
85982a32 560=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 561
0ab8f81e 562If I<CHECK> is 0, (en|de)code will put a I<substitution character>
563in place of a malformed character. For UCM-based encodings,
b7a5c9de 564E<lt>subcharE<gt> will be used. For Unicode, the code point C<0xFFFD> is used.
0ab8f81e 565If the data is supposed to be UTF-8, an optional lexical warning
566(category utf8) is given.
e9692b5b 567
7e19fb92 568=item I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 569
b7a5c9de 570If I<CHECK> is 1, methods will die on error immediately with an error
0ab8f81e 571message. Therefore, when I<CHECK> is set to 1, you should trap the
572fatal error with eval{} unless you really want to let it die on error.
47bfe92f 573
85982a32 574=item I<CHECK> = Encode::FB_QUIET
47bfe92f 575
85982a32 576If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
0ab8f81e 577return the portion of the data that has been processed so far when
578an error occurs. The data argument will be overwritten with
579everything after that point (that is, the unprocessed part of data).
580This is handy when you have to call decode repeatedly in the case
581where your source data may contain partial multi-byte character
582sequences, for example because you are reading with a fixed-width
583buffer. Here is some sample code that does exactly this:
4411f3b6 584
b7a5c9de 585 my $data = ''; my $utf8 = '';
85982a32 586 while(defined(read $fh, $buffer, 256)){
0ab8f81e 587 # buffer may end in a partial character so we append
85982a32 588 $data .= $buffer;
ee269af2 589 $utf8 .= decode($encoding, $data, Encode::FB_QUIET);
0ab8f81e 590 # $data now contains the unprocessed partial character
85982a32 591 }
1768d7eb 592
85982a32 593=item I<CHECK> = Encode::FB_WARN
67d7b5ef 594
0ab8f81e 595This is the same as above, except that it warns on error. Handy when
596you are debugging the mode above.
85982a32 597
598=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
599
af1f55d9 600=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
601
602=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
603
85982a32 604For encodings that are implemented by Encode::XS, CHECK ==
605Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
606
b7a5c9de 607When you decode, C<\xI<HH>> will be inserted for a malformed character,
608where I<HH> is the hex representation of the octet that could not be
609decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted,
610where I<HHHH> is the Unicode ID of the character that cannot be found
0ab8f81e 611in the character repertoire of the encoding.
85982a32 612
af1f55d9 613HTML/XML character reference modes are about the same: in place of
b7a5c9de 614C<\x{I<HHHH>}>, HTML uses C<&#I<NNNN>;> where I<NNNN> is a decimal number, and
615XML uses C<&#xI<HHHH>;> where I<HHHH> is a hexadecimal number.
af1f55d9 616
85982a32 617=item The bitmask
618
0ab8f81e 619These modes are actually set via a bitmask. Here is how the FB_XX
620constants are laid out. You can import the FB_XX constants via
621C<use Encode qw(:fallbacks)>; you can import the generic bitmask
622constants via C<use Encode qw(:fallback_all)>.
85982a32 623
b0b300a3 624 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
625 DIE_ON_ERR 0x0001 X
4089adc4 626 WARN_ON_ERR 0x0002 X
b0b300a3 627 RETURN_ON_ERR 0x0004 X X
628 LEAVE_SRC 0x0008
629 PERLQQ 0x0100 X
b7a5c9de 630 HTMLCREF 0x0200
631 XMLCREF 0x0400
67d7b5ef 632
151b5d36 633=back
634
0ab8f81e 635=head2 Unimplemented fallback schemes
67d7b5ef 636
0ab8f81e 637In the future, you will be able to use a code reference to a callback
f2a2953c 638function for the value of I<CHECK> but its API is still undecided.
67d7b5ef 639
982a4085 640The fallback scheme does not work on EBCDIC platforms.
641
67d7b5ef 642=head1 Defining Encodings
643
644To define a new encoding, use:
645
b7a5c9de 646 use Encode qw(define_encoding);
67d7b5ef 647 define_encoding($object, 'canonicalName' [, alias...]);
648
649I<canonicalName> will be associated with I<$object>. The object
0ab8f81e 650should provide the interface described in L<Encode::Encoding>.
67d7b5ef 651If more than two arguments are provided then additional
b7a5c9de 652arguments are taken as aliases for I<$object>.
67d7b5ef 653
f2a2953c 654See L<Encode::Encoding> for more details.
655
7e19fb92 656=head1 The UTF-8 flag
657
658Before the introduction of utf8 support in perl, the C<eq> operator
b7a5c9de 659just compared the strings represented by two scalars. Beginning with
660perl 5.8, C<eq> compares two strings with simultaneous consideration
661of I<the utf8 flag>. To explain why we made it so, I will quote page
662402 of C<Programming Perl, 3rd ed.>
7e19fb92 663
664=over 2
665
666=item Goal #1:
667
668Old byte-oriented programs should not spontaneously break on the old
669byte-oriented data they used to work on.
670
671=item Goal #2:
672
673Old byte-oriented programs should magically start working on the new
674character-oriented data when appropriate.
675
676=item Goal #3:
677
678Programs should run just as fast in the new character-oriented mode
679as in the old byte-oriented mode.
680
681=item Goal #4:
682
683Perl should remain one language, rather than forking into a
684byte-oriented Perl and a character-oriented Perl.
685
686=back
687
688Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
689was born and many features documented in the book remained
b7a5c9de 690unimplemented for a long time. Perl 5.8 corrected this, and the introduction
691of the UTF-8 flag is one of those corrections. You can think of perl as having a
692byte-oriented mode (utf8 flag off) and a character-oriented mode (utf8
7e19fb92 693flag on).
694
695Here is how Encode takes care of the utf8 flag.
696
4bdf5738 697=over 2
7e19fb92 698
699=item *
700
701When you encode, the resulting utf8 flag is always off.
702
151b5d36 703=item *
7e19fb92 704
b7a5c9de 705When you decode, the resulting utf8 flag is on unless you can
7e19fb92 706unambiguously represent data. Here is the definition of
707dis-ambiguity.
708
b7a5c9de 709After C<$utf8 = decode('foo', $octet);>,
7e19fb92 710
711 When $octet is... The utf8 flag in $utf8 is
712 ---------------------------------------------
713 In ASCII only (or EBCDIC only) OFF
714 In ISO-8859-1 ON
715 In any other Encoding ON
716 ---------------------------------------------
717
718As you see, there is one exception: In ASCII. That way you can assume
719Goal #1. And with Encode, Goal #2 is assumed, but you still have to be
720careful in the cases mentioned in the B<CAVEAT> paragraphs.
721
722This utf8 flag is not visible in perl scripts, exactly for the same
723reason you cannot (or you I<don't have to>) see if a scalar contains a
724string, integer, or floating point number. But you can still peek
725and poke these if you will. See the section below.
726
727=back
728
729=head2 Messing with Perl's Internals
4411f3b6 730
47bfe92f 731The following API uses parts of Perl's internals in the current
0ab8f81e 732implementation. As such, they are efficient but may change.
4411f3b6 733
7e19fb92 734=over 2
4411f3b6 735
a63c962f 736=item is_utf8(STRING [, CHECK])
4411f3b6 737
0ab8f81e 738[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
47bfe92f 739If CHECK is true, also checks the data in STRING for being well-formed
740UTF-8. Returns true if successful, false otherwise.
4411f3b6 741
b5ab1f6f 742As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
743
a63c962f 744=item _utf8_on(STRING)
4411f3b6 745
0ab8f81e 746[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
4411f3b6 747B<not> checked for being well-formed UTF-8. Do not use unless you
748B<know> that the STRING is well-formed UTF-8. Returns the previous
0ab8f81e 749state of the UTF-8 flag (so please don't treat the return value as
750indicating success or failure), or C<undef> if STRING is not a string.
4411f3b6 751
a63c962f 752=item _utf8_off(STRING)
4411f3b6 753
0ab8f81e 754[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
755Returns the previous state of the UTF-8 flag (so please don't treat the
756return value as indicating success or failure), or C<undef> if STRING is
4411f3b6 757not a string.
758
759=back
760
761=head1 SEE ALSO
762
5d030b67 763L<Encode::Encoding>,
764L<Encode::Supported>,
6d1c0808 765L<Encode::PerlIO>,
5d030b67 766L<encoding>,
6d1c0808 767L<perlebcdic>,
768L<perlfunc/open>,
769L<perlunicode>,
770L<utf8>,
5d030b67 771the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 772
85982a32 773=head1 MAINTAINER
aae85ceb 774
775This project was originated by Nick Ing-Simmons and later maintained
7e19fb92 776by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
777list of people involved. For any questions, use
b7a5c9de 778E<lt>perl-unicode@perl.orgE<gt> so we can all share.
aae85ceb 779
4411f3b6 780=cut