Upgrade to Encode 2.08.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
10c5ecbb 1#
cc836e95 2# $Id: Encode.pm,v 2.8 2004/10/24 12:32:06 dankogai Exp $
10c5ecbb 3#
2c674647 4package Encode;
51ef4e11 5use strict;
cc836e95 6our $VERSION = do { my @r = (q$Revision: 2.8 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
8f139f4c 7sub DEBUG () { 0 }
6d1c0808 8use XSLoader ();
10c5ecbb 9XSLoader::load(__PACKAGE__, $VERSION);
2c674647 10
2c674647 11require Exporter;
7e19fb92 12use base qw/Exporter/;
2c674647 13
4411f3b6 14# Public, encouraged API is exported by default
85982a32 15
16our @EXPORT = qw(
17 decode decode_utf8 encode encode_utf8
a0d8a30e 18 encodings find_encoding clone_encoding
4411f3b6 19);
20
b7a5c9de 21our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
af1f55d9 22 PERLQQ HTMLCREF XMLCREF);
b7a5c9de 23our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
af1f55d9 24 FB_PERLQQ FB_HTMLCREF FB_XMLCREF);
85982a32 25
51ef4e11 26our @EXPORT_OK =
6d1c0808 27 (
85982a32 28 qw(
29 _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
30 is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
31 ),
32 @FB_FLAGS, @FB_CONSTS,
33 );
34
6d1c0808 35our %EXPORT_TAGS =
85982a32 36 (
37 all => [ @EXPORT, @EXPORT_OK ],
38 fallbacks => [ @FB_CONSTS ],
39 fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
40 );
41
4411f3b6 42# Documentation moved after __END__ for speed - NI-S
2c674647 43
a63c962f 44our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 45
5d030b67 46use Encode::Alias;
47
5129552c 48# Make a %Encoding package variable to allow a certain amount of cheating
49our %Encoding;
aae85ceb 50our %ExtModule;
51require Encode::Config;
52eval { require Encode::ConfigLocal };
5129552c 53
# List the names of the available encodings, sorted case-insensitively.
#
# Called as a class method: Encode->encodings(), Encode->encodings(":all")
# or Encode->encodings(MODULE, ...).
#   * ":all" as the first argument reports every name found in %Encoding
#     and in %ExtModule.
#   * Otherwise the names in %Encoding are reported, together with each
#     %ExtModule entry provided by one of the named modules.  A module name
#     without "::" is taken to live under "Encode::".
# The pseudo-encodings Internal, Unicode and Guess are always left out.
sub encodings
{
    my ($class, @request) = @_;
    my %listed;
    if (@request and $request[0] eq ":all"){
        %listed = ( %Encoding, %ExtModule );
    }
    else{
        %listed = %Encoding;
        for my $module (map {m/::/o ? $_ : "Encode::$_" } @request){
            DEBUG and warn $module;
            # Add every on-demand encoding that this module provides.
            my @provided = grep { $ExtModule{$_} eq $module } keys %ExtModule;
            @listed{@provided} = ($module) x @provided;
        }
    }
    return
        sort { lc $a cmp lc $b }
        grep {!/^(?:Internal|Unicode|Guess)$/o} keys %listed;
}
73
# Report whether an encoding can be used as a PerlIO layer.
#
# Accepts either an encoding object or an encoding name; a name is resolved
# through find_encoding().  Returns whatever the object's own perlio_ok()
# method returns, or 0 when the name is unknown or the object does not
# implement perlio_ok().
sub perlio_ok{
    my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);
    # find_encoding() returns nothing for an unknown name; answer "no"
    # rather than dying on a method call against undef.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0; # safety net
}
79
# Register an encoding object under its canonical name.
#
#   define_encoding($object, 'canonicalName' [, alias...]);
#
# The object is stored in %Encoding under the canonical name.  When the
# canonical name is not already all lower case, its lower-cased form is
# defined as an alias; every further argument is defined as an alias too.
# Returns the encoding object.
sub define_encoding
{
    my ($encoding, $canonical, @aliases) = @_;
    $Encoding{$canonical} = $encoding;

    my $lowered = lc($canonical);
    if ($lowered ne $canonical){
        define_alias($lowered => $encoding);
    }
    for my $alias (@aliases){
        define_alias($alias, $encoding);
    }
    return $encoding;
}
93
# Look up an encoding object by name; this is the method behind
# find_encoding().
#
#   $class          - class used to reach find_alias()
#   $name           - encoding name, alias, or an encoding object
#   $skip_external  - true to refrain from loading modules from %ExtModule
#
# Lookup order: an object that can('renew') is returned unchanged; then
# %Encoding by exact and by lower-cased name; then find_alias() by exact and
# by lower-cased name; finally, unless $skip_external is set, the module
# registered for the name in %ExtModule is loaded and %Encoding is consulted
# once more.  Returns the encoding object, or nothing if the name is unknown.
sub getEncoding
{
    my ($class, $name, $skip_external) = @_;

    # An undefined name cannot match anything; bail out before it triggers
    # "uninitialized" warnings in the lookups below.
    defined $name or return;

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external)
    {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
            $mod =~ s,::,/,g ; $mod .= '.pm';
            # Best effort: a module that fails to load simply leaves the
            # encoding undefined.
            eval{ require $mod; };
            exists $Encoding{$name} and return $Encoding{$name};
            # The module may have been selected through the lower-cased
            # name, so accept an encoding registered under that spelling.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}
118
# Functional front end to getEncoding(): resolve an encoding name (or
# alias, or encoding object) to its encoding object.  A true second argument
# is passed on as getEncoding()'s $skip_external flag.
sub find_encoding($;$)
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}
124
# Map an encoding name or alias to the name() reported by the encoding
# object that find_encoding() resolves it to.  Returns nothing when no such
# encoding is found.
sub resolve_alias($){
    my ($alias) = @_;
    my $encoding = find_encoding($alias);
    return unless defined $encoding;
    return $encoding->name;
}
130
# Return a deep copy (Storable::dclone) of the encoding object that the
# given name resolves to.  Returns nothing when the name does not resolve
# to a reference or when Storable cannot be loaded.
sub clone_encoding($){
    my ($name) = @_;
    my $original = find_encoding($name);
    return unless ref $original;
    eval { require Storable };
    return if $@;
    return Storable::dclone($original);
}
138
# $octets = encode(ENCODING, $string [, CHECK])
#
# Encode $string with the named encoding and return the result.
# Returns undef when $string is undefined or is a reference; croaks with
# "Unknown encoding '...'" when ENCODING cannot be resolved.  CHECK defaults
# to 0.  With a true CHECK the caller's string variable is overwritten with
# whatever the encoder left in its source argument.
sub encode($$;$)
{
    my ($name, $string, $check) = @_;
    return undef if !defined($string);
    return undef if ref($string);
    $check = 0 unless $check;

    my $enc = find_encoding($name);
    if (!defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $octets = $enc->encode($string, $check);
    if ($check){
        # Write the (possibly shortened) source back through the alias.
        $_[1] = $string;
    }
    return $octets;
}
154
# $string = decode(ENCODING, $octets [, CHECK])
#
# Decode $octets with the named encoding and return the result.
# Returns undef when $octets is undefined or is a reference; croaks with
# "Unknown encoding '...'" when ENCODING cannot be resolved.  CHECK defaults
# to 0.  With a true CHECK the caller's octets variable is overwritten with
# whatever the decoder left in its source argument.
sub decode($$;$)
{
    my ($name, $octets, $check) = @_;
    return undef if !defined($octets);
    return undef if ref($octets);
    $check = 0 unless $check;

    my $enc = find_encoding($name);
    if (!defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $string = $enc->decode($octets, $check);
    if ($check){
        # Write the (possibly shortened) source back through the alias.
        $_[1] = $octets;
    }
    return $string;
}
170
# [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
#
# Convert the caller's variable in place from one encoding to another by
# decoding with FROM_ENC and re-encoding with TO_ENC.
#
# Returns the length of the converted data on success.  Returns undef when
# the input is undefined, or when CHECK is true and either step left part of
# its source unprocessed; in that case the caller's variable is not
# modified.  Croaks with "Unknown encoding '...'" when FROM_ENC or TO_ENC
# cannot be resolved.
sub from_to($$$;$)
{
    my ($string,$from,$to,$check) = @_;
    return undef unless defined $string;
    $check ||=0;
    # Success is detected below by the codecs having consumed their whole
    # source.  With the LEAVE_SRC bit set they do not consume it, so every
    # non-empty conversion would be reported as a failure.  Both codecs only
    # ever see private copies here, so the bit can safely be dropped.
    $check &= ~Encode::LEAVE_SRC() if $check;
    my $f = find_encoding($from);
    unless (defined $f){
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $t = find_encoding($to);
    unless (defined $t){
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }
    my $uni = $f->decode($string,$check);
    # Leftover input means the decoder stopped at malformed data.
    return undef if ($check && length($string));
    $string = $t->encode($uni,$check);
    # Leftover characters mean the encoder could not map all of them.
    return undef if ($check && length($uni));
    return defined($_[0] = $string) ? length($string) : undef ;
}
192
# $octets = encode_utf8($string)
#
# Return a copy of $string converted to UTF-8 octets by utf8::encode().
# The caller's variable is left untouched.
sub encode_utf8($)
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}
199
# $string = decode_utf8($octets [, CHECK])
#
# With a true CHECK the work is delegated to decode("utf8", ...) on a copy
# of $octets.  Otherwise the copy is decoded with utf8::decode(); the
# decoded string is returned on success and undef when utf8::decode()
# reports failure.
sub decode_utf8($;$)
{
    my ($octets, $check) = @_;
    return decode("utf8", $octets, $check) if $check;
    return utf8::decode($octets) ? $octets : undef;
}
210
b536bf57 211predefine_encodings(1);
f2a2953c 212
213#
214# This is to restore %Encoding if really needed;
215#
10c5ecbb 216
# (Re)install the encodings that are implemented directly in this file.
#
# Registers in %Encode::Encoding:
#   * "Unicode" - an Encode::UTF_EBCDIC object when $ON_EBCDIC is true,
#                 otherwise an Encode::Internal object;
#   * "utf8"    - an Encode::utf8 object.
# A true argument binds Encode::utf8's decode/encode to decode_xs/encode_xs;
# a false one installs Perl closures built on Encode::decode_utf8 and
# Encode::encode_utf8 instead.
sub predefine_encodings{
    use Encode::Encoding;
    no warnings 'redefine';
    my ($want_xs) = @_;
    if (!$ON_EBCDIC) {
        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';
        # Decoding and encoding are the same operation here: return an
        # upgraded copy and, under CHECK, empty the caller's source.
        *decode = sub{
            my (undef, $text, $check) = @_;
            utf8::upgrade($text);
            $_[1] = '' if $check;
            return $text;
        };
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
            bless {Name => "Internal"} => "Encode::Internal";
    } else {
        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
        *decode = sub{
            my (undef, $text, $check) = @_;
            my $converted = '';
            # Map one character at a time through unicode_to_native.
            for my $offset (0 .. length($text) - 1) {
                my $code = ord(substr($text, $offset, 1));
                $converted .= chr(utf8::unicode_to_native($code));
            }
            $_[1] = '' if $check;
            return $converted;
        };
        *encode = sub{
            my (undef, $text, $check) = @_;
            my $converted = '';
            # Map one character at a time through native_to_unicode.
            for my $offset (0 .. length($text) - 1) {
                my $code = ord(substr($text, $offset, 1));
                $converted .= chr(utf8::native_to_unicode($code));
            }
            $_[1] = '' if $check;
            return $converted;
        };
        $Encode::Encoding{Unicode} =
            bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    }

    {
        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';
        #
        if (!$want_xs){
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            *decode = sub{
                my (undef, $octets, $check) = @_;
                my $decoded = Encode::decode_utf8($octets);
                return undef unless defined $decoded;
                $_[1] = '' if $check;
                return $decoded;
            };
            *encode = sub {
                my (undef, $string, $check) = @_;
                my $encoded = Encode::encode_utf8($string);
                $_[1] = '' if $check;
                return $encoded;
            };
        }else{
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }
        # Append to the destination everything in the source from $pos up
        # to and including the next terminator (or to the end of the source
        # when there is none), and advance the caller's position.
        *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
            my $pos = $_[3];                      # currently ignores $chk
            my $trm = $_[4];
            # References into @_ so that the caller's variables are updated.
            my ($dst_ref, $src_ref, $pos_ref) = (\$_[1], \$_[2], \$_[3]);
            use bytes;
            my $found = index($$src_ref, $trm, $pos);
            if ($found >= 0) {
                $$dst_ref .= substr($$src_ref, $pos, $found - $pos + length($trm));
                $$pos_ref = $found + length($trm);
                return 1;
            }
            $$dst_ref .= substr($$src_ref, $pos);
            $$pos_ref = length($$src_ref);
            return '';
        };
        $Encode::Encoding{utf8} =
            bless {Name => "utf8"} => "Encode::utf8";
    }
}
305
656753f8 3061;
307
2a936312 308__END__
309
4411f3b6 310=head1 NAME
311
312Encode - character encodings
313
314=head1 SYNOPSIS
315
316 use Encode;
317
67d7b5ef 318=head2 Table of Contents
319
0ab8f81e 320Encode consists of a collection of modules whose details are too big
67d7b5ef 321to fit in one document. This POD itself explains the top-level APIs
6d1c0808 322and general topics at a glance. For other topics and more details,
0ab8f81e 323see the PODs below:
67d7b5ef 324
325 Name Description
326 --------------------------------------------------------
6d1c0808 327 Encode::Alias Alias definitions to encodings
67d7b5ef 328 Encode::Encoding Encode Implementation Base Class
329 Encode::Supported List of Supported Encodings
330 Encode::CN Simplified Chinese Encodings
331 Encode::JP Japanese Encodings
332 Encode::KR Korean Encodings
333 Encode::TW Traditional Chinese Encodings
334 --------------------------------------------------------
335
4411f3b6 336=head1 DESCRIPTION
337
47bfe92f 338The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 339and the rest of the system. Perl strings are sequences of
340B<characters>.
341
342The repertoire of characters that Perl can represent is at least that
343defined by the Unicode Consortium. On most platforms the ordinal
344values of the characters (as returned by C<ord(ch)>) is the "Unicode
345codepoint" for the character (the exceptions are those platforms where
346the legacy encoding is some variant of EBCDIC rather than a super-set
347of ASCII - see L<perlebcdic>).
348
0ab8f81e 349Traditionally, computer data has been moved around in 8-bit chunks
67d7b5ef 350often called "bytes". These chunks are also known as "octets" in
351networking standards. Perl is widely used to manipulate data of many
352types - not only strings of characters representing human or computer
0ab8f81e 353languages but also "binary" data being the machine's representation of
67d7b5ef 354numbers, pixels in an image - or just about anything.
355
0ab8f81e 356When Perl is processing "binary data", the programmer wants Perl to
67d7b5ef 357process "sequences of bytes". This is not a problem for Perl - as a
0ab8f81e 358byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef 359"logical character".
360
361=head2 TERMINOLOGY
4411f3b6 362
7e19fb92 363=over 2
21938dfa 364
67d7b5ef 365=item *
366
367I<character>: a character in the range 0..(2**32-1) (or more).
368(What Perl's strings are made of.)
369
370=item *
371
372I<byte>: a character in the range 0..255
373(A special case of a Perl character.)
374
375=item *
376
377I<octet>: 8 bits of data, with ordinal values 0..255
0ab8f81e 378(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
67d7b5ef 379
380=back
4411f3b6 381
67d7b5ef 382=head1 PERL ENCODING API
4411f3b6 383
7e19fb92 384=over 2
4411f3b6 385
b7a5c9de 386=item $octets = encode(ENCODING, $string [, CHECK])
4411f3b6 387
0ab8f81e 388Encodes a string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 389a sequence of octets. ENCODING can be either a canonical name or
0ab8f81e 390an alias. For encoding names and aliases, see L</"Defining Aliases">.
391For CHECK, see L</"Handling Malformed Data">.
4411f3b6 392
b7a5c9de 393For example, to convert a string from Perl's internal format to
6d1c0808 394iso-8859-1 (also known as Latin1),
681a7c68 395
b7a5c9de 396 $octets = encode("iso-8859-1", $string);
7e19fb92 397
b7a5c9de 398B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
399B<may not be equal to> $string. Though they both contain the same data, the utf8 flag
7e19fb92 400for $octets is B<always> off. When you encode anything, the utf8 flag of
401the result is always off, even when it contains a completely valid utf8
402string. See L</"The UTF-8 flag"> below.
681a7c68 403
4089adc4 404encode($valid_encoding, undef) is harmless: it returns C<undef>
405without issuing a warning.
406encode($valid_encoding, '') is harmless and issues no warning either.
407
b7a5c9de 408=item $string = decode(ENCODING, $octets [, CHECK])
4411f3b6 409
0ab8f81e 410Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
411internal form and returns the resulting string. As in encode(),
412ENCODING can be either a canonical name or an alias. For encoding names
413and aliases, see L</"Defining Aliases">. For CHECK, see
47bfe92f 414L</"Handling Malformed Data">.
415
b7a5c9de 416For example, to convert ISO-8859-1 data to a string in Perl's internal format:
681a7c68 417
b7a5c9de 418 $string = decode("iso-8859-1", $octets);
681a7c68 419
b7a5c9de 420B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
421B<may not be equal to> $octets. Though they both contain the same data,
422the utf8 flag for $string is on unless $octets entirely consists of
7e19fb92 423ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
424below.
47bfe92f 425
4089adc4 426decode($valid_encoding, undef) is harmless: it returns C<undef>
427without issuing a warning.
428decode($valid_encoding, '') is harmless and issues no warning either.
429
b7a5c9de 430=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
7e19fb92 431
b7a5c9de 432Converts B<in-place> data between two encodings. The data in $octets
433must be encoded as octets and not as characters in Perl's internal
f9d05ba3 434format. For example, to convert ISO-8859-1 data to Microsoft's CP1250
435encoding:
2b106fbe 436
b7a5c9de 437 from_to($octets, "iso-8859-1", "cp1250");
2b106fbe 438
439and to convert it back:
440
b7a5c9de 441 from_to($octets, "cp1250", "iso-8859-1");
4411f3b6 442
ab97ca19 443Note that because the conversion happens in place, the data to be
0ab8f81e 444converted cannot be a string constant; it must be a scalar variable.
ab97ca19 445
f9d05ba3 446from_to() returns the length of the converted string in octets on
447success, I<undef> on error.
3ef515df 448
b7a5c9de 449B<CAVEAT>: The following operations look the same but are not quite so;
7e19fb92 450
b7a5c9de 451 from_to($data, "iso-8859-1", "utf8"); #1
7e19fb92 452 $data = decode("iso-8859-1", $data); #2
4411f3b6 453
b7a5c9de 454Both #1 and #2 make $data consist of a completely valid UTF-8 string
7e19fb92 455but only #2 turns utf8 flag on. #1 is equivalent to
f2a2953c 456
7e19fb92 457 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 458
7e19fb92 459See L</"The UTF-8 flag"> below.
f2a2953c 460
461=item $octets = encode_utf8($string);
462
7e19fb92 463Equivalent to C<$octets = encode("utf8", $string);> The characters
b7a5c9de 464that comprise $string are encoded in Perl's internal format and the
465result is returned as a sequence of octets. All possible
7e19fb92 466characters have a UTF-8 representation so this function cannot fail.
467
f2a2953c 468
469=item $string = decode_utf8($octets [, CHECK]);
470
7e19fb92 471Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
b7a5c9de 472The sequence of octets represented by
7e19fb92 473$octets is decoded from UTF-8 into a sequence of logical
474characters. Not all sequences of octets form valid UTF-8 encodings, so
475it is possible for this call to fail. For CHECK, see
476L</"Handling Malformed Data">.
f2a2953c 477
478=back
479
51ef4e11 480=head2 Listing available encodings
481
5129552c 482 use Encode;
483 @list = Encode->encodings();
484
485Returns a list of the canonical names of the available encodings that
486are loaded. To get a list of all available encodings including the
487ones that are not loaded yet, say
488
489 @all_encodings = Encode->encodings(":all");
490
0ab8f81e 491Or you can give the name of a specific module.
5129552c 492
c731e18e 493 @with_jp = Encode->encodings("Encode::JP");
494
495When "::" is not in the name, "Encode::" is assumed.
51ef4e11 496
c731e18e 497 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 498
0ab8f81e 499To find out in detail which encodings are supported by this package,
5d030b67 500see L<Encode::Supported>.
51ef4e11 501
502=head2 Defining Aliases
503
0ab8f81e 504To add a new alias to a given encoding, use:
67d7b5ef 505
5129552c 506 use Encode;
507 use Encode::Alias;
a63c962f 508 define_alias(newName => ENCODING);
51ef4e11 509
3ef515df 510After that, newName can be used as an alias for ENCODING.
f2a2953c 511ENCODING may be either the name of an encoding or an
512I<encoding object>.
51ef4e11 513
fcb875d4 514But before you do so, make sure the alias is nonexistent with
515C<resolve_alias()>, which returns the canonical name thereof.
516i.e.
517
518 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
519 Encode::resolve_alias("iso-8859-12") # false; nonexistent
520 Encode::resolve_alias($name) eq $name # true if $name is canonical
521
0ab8f81e 522resolve_alias() does not need C<use Encode::Alias>; it can be
523exported via C<use Encode qw(resolve_alias)>.
fcb875d4 524
0ab8f81e 525See L<Encode::Alias> for details.
51ef4e11 526
85982a32 527=head1 Encoding via PerlIO
4411f3b6 528
b7a5c9de 529If your perl supports I<PerlIO> (which is the default), you can use a PerlIO layer to decode
0ab8f81e 530and encode directly via a filehandle. The following two examples
531are totally identical in their functionality.
4411f3b6 532
85982a32 533 # via PerlIO
534 open my $in, "<:encoding(shiftjis)", $infile or die;
535 open my $out, ">:encoding(euc-jp)", $outfile or die;
b7a5c9de 536 while(<$in>){ print $out $_; }
8e86646e 537
85982a32 538 # via from_to
0ab8f81e 539 open my $in, "<", $infile or die;
540 open my $out, ">", $outfile or die;
b7a5c9de 541 while(<$in>){
0ab8f81e 542 from_to($_, "shiftjis", "euc-jp", 1);
b7a5c9de 543 print $out $_;
85982a32 544 }
4411f3b6 545
b7a5c9de 546Unfortunately, not all encodings are PerlIO-savvy. You can check
0ab8f81e 547if your encoding is supported by PerlIO by calling the C<perlio_ok>
548method.
549
550 Encode::perlio_ok("hz"); # False
551 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
552
553 use Encode qw(perlio_ok); # exported upon request
554 perlio_ok("euc-jp")
4411f3b6 555
0ab8f81e 556Fortunately, all encodings that come with Encode core are PerlIO-savvy
f9d05ba3 557except for hz and ISO-2022-kr. For gory details, see
558L<Encode::Encoding> and L<Encode::PerlIO>.
4411f3b6 559
85982a32 560=head1 Handling Malformed Data
4411f3b6 561
f9d05ba3 562The optional I<CHECK> argument is used as follows. When you omit it,
563Encode::FB_DEFAULT ( == 0 ) is assumed.
564
565=over 2
566
567=item B<NOTE:> Not all encodings support this feature
568
569Some encodings ignore the I<CHECK> argument. For example,
570L<Encode::Unicode> ignores I<CHECK> and always croaks on error.
571
572=back
573
574Here is the list of available I<CHECK> values:
47bfe92f 575
151b5d36 576=over 2
577
85982a32 578=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 579
f9d05ba3 580If I<CHECK> is 0, (en|de)code will put a I<substitution character> in
581place of a malformed character. When you encode to UCM-based encodings,
582E<lt>subcharE<gt> will be used. When you decode from UCM-based
583encodings, the code point C<0xFFFD> is used. If the data is supposed
584to be UTF-8, an optional lexical warning (category utf8) is given.
e9692b5b 585
7e19fb92 586=item I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 587
b7a5c9de 588If I<CHECK> is 1, methods will die on error immediately with an error
0ab8f81e 589message. Therefore, when I<CHECK> is set to 1, you should trap the
f9d05ba3 590error with eval{} unless you really want to let it die.
47bfe92f 591
85982a32 592=item I<CHECK> = Encode::FB_QUIET
47bfe92f 593
85982a32 594If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
f9d05ba3 595return the portion of the data that has been processed so far when an
596error occurs. The data argument will be overwritten with everything
597after that point (that is, the unprocessed part of data). This is
598handy when you have to call decode repeatedly in the case where your
599source data may contain partial multi-byte character sequences,
600(i.e. you are reading with a fixed-width buffer). Here is some sample
601code that does exactly this:
4411f3b6 602
b7a5c9de 603 my $data = ''; my $utf8 = '';
85982a32 604 while(read $fh, $buffer, 256){
0ab8f81e 605 # buffer may end in a partial character so we append
85982a32 606 $data .= $buffer;
ee269af2 607 $utf8 .= decode($encoding, $data, Encode::FB_QUIET);
0ab8f81e 608 # $data now contains the unprocessed partial character
85982a32 609 }
1768d7eb 610
85982a32 611=item I<CHECK> = Encode::FB_WARN
67d7b5ef 612
0ab8f81e 613This is the same as above, except that it warns on error. Handy when
614you are debugging the mode above.
85982a32 615
616=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
617
af1f55d9 618=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
619
620=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
621
85982a32 622For encodings that are implemented by Encode::XS, CHECK ==
623Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
624
b7a5c9de 625When you decode, C<\xI<HH>> will be inserted for a malformed character,
626where I<HH> is the hex representation of the octet that could not be
627decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted,
628where I<HHHH> is the Unicode ID of the character that cannot be found
0ab8f81e 629in the character repertoire of the encoding.
85982a32 630
af1f55d9 631The HTML/XML character reference modes are about the same: in place of
f9d05ba3 632C<\x{I<HHHH>}>, HTML uses C<&#I<NNNN>;> where I<NNNN> is a decimal number and
633XML uses C<&#xI<HHHH>;> where I<HHHH> is a hexadecimal number.
af1f55d9 634
85982a32 635=item The bitmask
636
0ab8f81e 637These modes are actually set via a bitmask. Here is how the FB_XX
638constants are laid out. You can import the FB_XX constants via
639C<use Encode qw(:fallbacks)>; you can import the generic bitmask
640constants via C<use Encode qw(:fallback_all)>.
85982a32 641
b0b300a3 642 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
643 DIE_ON_ERR 0x0001 X
4089adc4 644 WARN_ON_ERR 0x0002 X
b0b300a3 645 RETURN_ON_ERR 0x0004 X X
646 LEAVE_SRC 0x0008
647 PERLQQ 0x0100 X
b7a5c9de 648 HTMLCREF 0x0200
649 XMLCREF 0x0400
67d7b5ef 650
151b5d36 651=back
652
0ab8f81e 653=head2 Unimplemented fallback schemes
67d7b5ef 654
0ab8f81e 655In the future, you will be able to use a code reference to a callback
f2a2953c 656function for the value of I<CHECK> but its API is still undecided.
67d7b5ef 657
982a4085 658The fallback scheme does not work on EBCDIC platforms.
659
67d7b5ef 660=head1 Defining Encodings
661
662To define a new encoding, use:
663
b7a5c9de 664 use Encode qw(define_encoding);
67d7b5ef 665 define_encoding($object, 'canonicalName' [, alias...]);
666
667I<canonicalName> will be associated with I<$object>. The object
0ab8f81e 668should provide the interface described in L<Encode::Encoding>.
67d7b5ef 669If more than two arguments are provided then additional
b7a5c9de 670arguments are taken as aliases for I<$object>.
67d7b5ef 671
f2a2953c 672See L<Encode::Encoding> for more details.
673
7e19fb92 674=head1 The UTF-8 flag
675
676Before the introduction of utf8 support in perl, the C<eq> operator
b7a5c9de 677just compared the strings represented by two scalars. Beginning with
678perl 5.8, C<eq> compares two strings with simultaneous consideration
679of I<the utf8 flag>. To explain why we made it so, I will quote page
680402 of C<Programming Perl, 3rd ed.>
7e19fb92 681
682=over 2
683
684=item Goal #1:
685
686Old byte-oriented programs should not spontaneously break on the old
687byte-oriented data they used to work on.
688
689=item Goal #2:
690
691Old byte-oriented programs should magically start working on the new
692character-oriented data when appropriate.
693
694=item Goal #3:
695
696Programs should run just as fast in the new character-oriented mode
697as in the old byte-oriented mode.
698
699=item Goal #4:
700
701Perl should remain one language, rather than forking into a
702byte-oriented Perl and a character-oriented Perl.
703
704=back
705
706Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
707had been released, and many features documented in the book remained
b7a5c9de 708unimplemented for a long time. Perl 5.8 corrected this, and the introduction
709of the UTF-8 flag is one of those corrections. You can think of the flag as
710selecting between a byte-oriented mode (utf8 flag off) and a
7e19fb92 711character-oriented mode (utf8 flag on).
712
713Here is how Encode takes care of the utf8 flag.
714
4bdf5738 715=over 2
7e19fb92 716
717=item *
718
719When you encode, the resulting utf8 flag is always off.
720
151b5d36 721=item *
7e19fb92 722
b7a5c9de 723When you decode, the resulting utf8 flag is on unless the data can be
7e19fb92 724represented unambiguously without it. Here is what "unambiguously"
725means.
726
b7a5c9de 727After C<$utf8 = decode('foo', $octet);>,
7e19fb92 728
729 When $octet is... The utf8 flag in $utf8 is
730 ---------------------------------------------
731 In ASCII only (or EBCDIC only) OFF
732 In ISO-8859-1 ON
733 In any other Encoding ON
734 ---------------------------------------------
735
736As you see, there is one exception: ASCII-only data. That way you can
737assume Goal #1. And with Encode, Goal #2 is assumed, but you still have
738to be careful in the cases mentioned in the B<CAVEAT> paragraphs above.
739
740This utf8 flag is not visible in perl scripts, exactly for the same
741reason you cannot (or you I<don't have to>) see if a scalar contains a
742string, integer, or floating point number. But you can still peek
743and poke these if you will. See the section below.
744
745=back
746
747=head2 Messing with Perl's Internals
4411f3b6 748
47bfe92f 749The following API uses parts of Perl's internals in the current
0ab8f81e 750implementation. As such, they are efficient but may change.
4411f3b6 751
7e19fb92 752=over 2
4411f3b6 753
a63c962f 754=item is_utf8(STRING [, CHECK])
4411f3b6 755
0ab8f81e 756[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
47bfe92f 757If CHECK is true, also checks the data in STRING for being well-formed
758UTF-8. Returns true if successful, false otherwise.
4411f3b6 759
2c246b25 760As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
b5ab1f6f 761
a63c962f 762=item _utf8_on(STRING)
4411f3b6 763
0ab8f81e 764[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
4411f3b6 765B<not> checked for being well-formed UTF-8. Do not use unless you
766B<know> that the STRING is well-formed UTF-8. Returns the previous
0ab8f81e 767state of the UTF-8 flag (so please don't treat the return value as
768indicating success or failure), or C<undef> if STRING is not a string.
4411f3b6 769
a63c962f 770=item _utf8_off(STRING)
4411f3b6 771
0ab8f81e 772[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
773Returns the previous state of the UTF-8 flag (so please don't treat the
774return value as indicating success or failure), or C<undef> if STRING is
4411f3b6 775not a string.
776
777=back
778
779=head1 SEE ALSO
780
5d030b67 781L<Encode::Encoding>,
782L<Encode::Supported>,
6d1c0808 783L<Encode::PerlIO>,
5d030b67 784L<encoding>,
6d1c0808 785L<perlebcdic>,
786L<perlfunc/open>,
787L<perlunicode>,
788L<utf8>,
5d030b67 789the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 790
85982a32 791=head1 MAINTAINER
aae85ceb 792
793This project was originated by Nick Ing-Simmons and later maintained
7e19fb92 794by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
795list of people involved. For any questions, use
b7a5c9de 796E<lt>perl-unicode@perl.orgE<gt> so we can all share.
aae85ceb 797
4411f3b6 798=cut