Missing semi-colon
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
#
# $Id: Encode.pm,v 1.99 2003/12/29 02:47:16 dankogai Exp dankogai $
#
package Encode;
use strict;
our $VERSION = do { my @r = (q$Revision: 1.99 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# Compile-time constant; the "DEBUG and warn ..." statements below are
# only active when this returns a true value.
sub DEBUG () { 0 }
use XSLoader ();
XSLoader::load(__PACKAGE__, $VERSION);

require Exporter;
use base qw/Exporter/;

# Public, encouraged API is exported by default

our @EXPORT = qw(
  decode  decode_utf8  encode  encode_utf8
  encodings  find_encoding clone_encoding
);

# Names of the individual CHECK bitmask flags and of the ready-made
# FB_* combinations; both lists are exportable on request.
our @FB_FLAGS  = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
                    PERLQQ HTMLCREF XMLCREF);
our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
                    FB_PERLQQ FB_HTMLCREF FB_XMLCREF);

our @EXPORT_OK =
    (
     qw(
       _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
       is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
     @FB_FLAGS, @FB_CONSTS,
    );

our %EXPORT_TAGS =
    (
     all          =>  [ @EXPORT, @EXPORT_OK ],
     fallbacks    =>  [ @FB_CONSTS ],
     fallback_all =>  [ @FB_CONSTS, @FB_FLAGS ],
    );

# Documentation moved after __END__ for speed - NI-S

# True when the native character set is EBCDIC ("A" is 193 there).
our $ON_EBCDIC = (ord("A") == 193);

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;
our %ExtModule;
require Encode::Config;
# The local configuration is optional; a missing file is not an error.
eval { require Encode::ConfigLocal };
5129552c 53
# List encoding names, sorted case-insensitively (class method).
#
#   Encode->encodings()          - names currently registered in %Encoding
#   Encode->encodings(":all")    - the above plus every name in %ExtModule
#   Encode->encodings(MODULE...) - the registered names plus the names that
#                                  %ExtModule maps to one of the given
#                                  modules; "Encode::" is prepended to a
#                                  module name that contains no "::"
#
# The names Internal, Unicode and Guess are always filtered out.
sub encodings
{
    my $class = shift;
    my %enc;
    if (@_ and $_[0] eq ":all"){
        %enc = ( %Encoding, %ExtModule );
    }else{
        %enc = %Encoding;
        for my $mod (map {m/::/o ? $_ : "Encode::$_" } @_){
            DEBUG and warn $mod;
            for my $enc (keys %ExtModule){
                $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
            }
        }
    }
    return
        sort { lc $a cmp lc $b }
            grep {!/^(?:Internal|Unicode|Guess)$/o} keys %enc;
}
73
# Report whether an encoding can be used as a PerlIO layer.
#
# Accepts either an encoding object (any reference) or an encoding name,
# which is resolved through find_encoding().  Returns whatever the
# object's own perlio_ok() method returns, or 0 when the name cannot be
# resolved or the object has no such method.
sub perlio_ok{
    my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);
    # An unknown name yields undef; calling ->can on it would die
    # instead of reaching the safety net below.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0; # safety net
}
79
# Register an encoding object under its canonical name.
#
#   $obj   - the encoding object
#   $name  - canonical name; stored as a key of %Encoding
#   @_     - any further arguments are registered as aliases for $obj
#
# The lower-cased form of $name is also defined as an alias when it
# differs from $name.  Returns $obj.
sub define_encoding
{
    my $obj  = shift;
    my $name = shift;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    define_alias($lc => $obj) unless $lc eq $name;
    while (@_){
        my $alias = shift;
        define_alias($alias, $obj);
    }
    return $obj;
}
93
# Resolve a name to an encoding object (class method).
#
#   $class         - class on which find_alias() is called
#   $name          - encoding name, alias, or an object that can('renew')
#   $skip_external - when true, never try to load a module from %ExtModule
#
# Lookup order: the object itself, %Encoding by exact then lower-cased
# name, aliases by exact then lower-cased name, and finally the module
# that %ExtModule lists for the name.  Returns the encoding object, or
# an empty list / undef when nothing matches.
sub getEncoding
{
    my ($class, $name, $skip_external) = @_;

    # An undefined name can never match anything; stop before it is used
    # as a hash key or passed to lc().
    defined $name or return;

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external)
    {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
            $mod =~ s,::,/,g ; $mod .= '.pm';
            # Best effort: a module that fails to load simply leaves
            # %Encoding unchanged.
            eval{ require $mod; };
            # The module may have been selected through either spelling,
            # so look the result up under both.
            exists $Encoding{$name} and return $Encoding{$name};
            exists $Encoding{$lc}   and return $Encoding{$lc};
        }
    }
    return;
}
118
# Functional front end to getEncoding().
#
#   $name          - encoding name, alias or encoding object
#   $skip_external - optional; passed through to getEncoding()
#
# Returns whatever getEncoding() returns for this package.
sub find_encoding($;$)
{
    my ($name, $skip_external) = @_;
    return __PACKAGE__->getEncoding($name,$skip_external);
}
124
# Map a name or alias to the name reported by its encoding object.
# Returns the result of the object's name() method, or nothing when
# find_encoding() cannot resolve the argument.
sub resolve_alias($){
    my $obj = find_encoding(shift);
    defined $obj and return $obj->name;
    return;
}
130
# Return a deep copy (Storable::dclone) of the encoding object found for
# the given name.  Returns nothing when the name does not resolve to a
# reference or when Storable cannot be loaded.
sub clone_encoding($){
    my $obj = find_encoding(shift);
    ref $obj or return;
    # Storable is loaded on demand.  The trailing 1 makes the eval's own
    # value the success indicator, so $@ need not be inspected.
    eval { require Storable; 1 } or return;
    return Storable::dclone($obj);
}
138
# Encode $string with the encoding called $name.
#
#   $name   - encoding name, alias or encoding object
#   $string - the string to encode
#   $check  - optional CHECK value; defaults to 0
#
# Returns the result of the encoding object's encode() method, or undef
# when $string is undefined.  Croaks with "Unknown encoding '...'" when
# $name cannot be resolved.  When $check is true, the local copy of
# $string, as left by the encoding object, is written back to the
# caller's second argument.
sub encode($$;$)
{
    my ($name, $string, $check) = @_;
    return undef unless defined $string;
    $check ||=0;
    my $enc = find_encoding($name);
    unless(defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $octets = $enc->encode($string,$check);
    $_[1] = $string if $check;
    return $octets;
}
153
# Decode $octets with the encoding called $name.
#
#   $name   - encoding name, alias or encoding object
#   $octets - the data to decode
#   $check  - optional CHECK value; defaults to 0
#
# Returns the result of the encoding object's decode() method, or undef
# when $octets is undefined.  Croaks with "Unknown encoding '...'" when
# $name cannot be resolved.  When $check is true, the local copy of
# $octets, as left by the encoding object, is written back to the
# caller's second argument.
sub decode($$;$)
{
    my ($name,$octets,$check) = @_;
    return undef unless defined $octets;
    $check ||=0;
    my $enc = find_encoding($name);
    unless(defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $string = $enc->decode($octets,$check);
    $_[1] = $octets if $check;
    return $string;
}
168
# Convert the caller's first argument in place from one encoding to
# another.
#
#   $string - the data to convert (the caller's variable is overwritten)
#   $from   - name of the source encoding
#   $to     - name of the target encoding
#   $check  - optional CHECK value; defaults to 0
#
# Returns the length of the converted data, or undef when $string is
# undefined, when the result is undefined, or when $check is true and
# either step left part of its input unprocessed.  Croaks with
# "Unknown encoding '...'" when either name cannot be resolved.
sub from_to($$$;$)
{
    my ($string,$from,$to,$check) = @_;
    return undef unless defined $string;
    $check ||=0;
    my $f = find_encoding($from);
    unless (defined $f){
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $t = find_encoding($to);
    unless (defined $t){
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }
    my $uni = $f->decode($string,$check);
    # With CHECK set, anything still left in $string was not decoded.
    return undef if ($check && length($string));
    $string = $t->encode($uni,$check);
    # Likewise, anything still left in $uni was not encoded.
    return undef if ($check && length($uni));
    return defined($_[0] = $string) ? length($string) : undef ;
}
190
# Return the UTF-8 encoded octets of $str.  The work is done by
# utf8::encode() on a private copy, so the caller's variable is not
# modified.
sub encode_utf8($)
{
    my ($str) = @_;
    utf8::encode($str);
    return $str;
}
197
# Decode UTF-8 octets into a character string.
#
#   $str   - the octets to decode
#   $check - optional CHECK value
#
# Without CHECK the work is done by utf8::decode() on a private copy and
# undef is returned when that call reports failure.  With CHECK the call
# is delegated to decode("utf8", ...), and the data that decode() leaves
# in the buffer is written back to the caller's first argument, exactly
# as decode() does for its own caller.  The write-back is skipped when
# LEAVE_SRC is set in CHECK, which also allows a read-only argument.
sub decode_utf8($;$)
{
    my ($str, $check) = @_;
    if ($check){
        my $string = decode("utf8", $str, $check);
        # decode() only updated our private copy; pass the unprocessed
        # remainder on to the caller's variable.
        $_[0] = $str unless $check & LEAVE_SRC();
        return $string;
    }else{
        return undef unless utf8::decode($str);
        return $str;
    }
}
208
predefine_encodings(1);

#
# This is to restore %Encoding if really needed;
#

# Install the encodings that are implemented in this file:
#   - $Encoding{Unicode}: an Encode::UTF_EBCDIC object when $ON_EBCDIC is
#     true, otherwise an Encode::Internal object;
#   - $Encoding{utf8}: an Encode::utf8 object.
#
#   $use_xs - when true, Encode::utf8 uses decode_xs/encode_xs; otherwise
#             pure-Perl wrappers around decode_utf8()/encode_utf8().
sub predefine_encodings{
    use Encode::Encoding;
    # The glob assignments below replace subs when this is called again.
    no warnings 'redefine';
    my $use_xs = shift;
    if ($ON_EBCDIC) {
        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            my $res = '';
            # Convert one character at a time.
            for (my $i = 0; $i < length($str); $i++) {
                $res .=
                    chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
            }
            # With CHECK set, the caller's buffer is emptied.
            $_[1] = '' if $chk;
            return $res;
        };
        *encode = sub{
            my ($obj,$str,$chk) = @_;
            my $res = '';
            for (my $i = 0; $i < length($str); $i++) {
                $res .=
                    chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
            }
            $_[1] = '' if $chk;
            return $res;
        };
        $Encode::Encoding{Unicode} =
            bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    } else {
        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;
            return $str;
        };
        # Encoding and decoding are the same operation here.
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
            bless {Name => "Internal"} => "Encode::Internal";
    }

    {
        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';
        #
        if ($use_xs){
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }else{
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            *decode = sub{
                my ($obj,$octets,$chk) = @_;
                my $str = Encode::decode_utf8($octets);
                if (defined $str) {
                    $_[1] = '' if $chk;
                    return $str;
                }
                return undef;
            };
            *encode = sub {
                my ($obj,$string,$chk) = @_;
                my $octets = Encode::encode_utf8($string);
                $_[1] = '' if $chk;
                return $octets;
            };
        }
        *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
            my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
            # References to the caller's $dst, $src and $pos, so that
            # they can be updated in place.
            my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
            use bytes;
            if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) {
                # Terminator found: append everything up to and including
                # it, and move the position just past it.
                $$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm));
                $$rpos = $npos + length($trm);
                return 1;
            }
            # No terminator: append the rest of the source.
            $$rdst .= substr($$rsrc, $pos);
            $$rpos = length($$rsrc);
            return '';
        };
        $Encode::Encoding{utf8} =
            bless {Name => "utf8"} => "Encode::utf8";
    }
}
303
656753f8 3041;
305
2a936312 306__END__
307
4411f3b6 308=head1 NAME
309
310Encode - character encodings
311
312=head1 SYNOPSIS
313
314 use Encode;
315
67d7b5ef 316=head2 Table of Contents
317
0ab8f81e 318Encode consists of a collection of modules whose details are too big
67d7b5ef 319to fit in one document. This POD itself explains the top-level APIs
6d1c0808 320and general topics at a glance. For other topics and more details,
0ab8f81e 321see the PODs below:
67d7b5ef 322
323 Name Description
324 --------------------------------------------------------
6d1c0808 325 Encode::Alias Alias definitions to encodings
67d7b5ef 326 Encode::Encoding Encode Implementation Base Class
327 Encode::Supported List of Supported Encodings
328 Encode::CN Simplified Chinese Encodings
329 Encode::JP Japanese Encodings
330 Encode::KR Korean Encodings
331 Encode::TW Traditional Chinese Encodings
332 --------------------------------------------------------
333
4411f3b6 334=head1 DESCRIPTION
335
47bfe92f 336The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 337and the rest of the system. Perl strings are sequences of
338B<characters>.
339
340The repertoire of characters that Perl can represent is at least that
341defined by the Unicode Consortium. On most platforms the ordinal
342values of the characters (as returned by C<ord(ch)>) is the "Unicode
343codepoint" for the character (the exceptions are those platforms where
344the legacy encoding is some variant of EBCDIC rather than a super-set
345of ASCII - see L<perlebcdic>).
346
0ab8f81e 347Traditionally, computer data has been moved around in 8-bit chunks
67d7b5ef 348often called "bytes". These chunks are also known as "octets" in
349networking standards. Perl is widely used to manipulate data of many
350types - not only strings of characters representing human or computer
0ab8f81e 351languages but also "binary" data being the machine's representation of
67d7b5ef 352numbers, pixels in an image - or just about anything.
353
0ab8f81e 354When Perl is processing "binary data", the programmer wants Perl to
67d7b5ef 355process "sequences of bytes". This is not a problem for Perl - as a
0ab8f81e 356byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef 357"logical character".
358
359=head2 TERMINOLOGY
4411f3b6 360
7e19fb92 361=over 2
21938dfa 362
67d7b5ef 363=item *
364
365I<character>: a character in the range 0..(2**32-1) (or more).
366(What Perl's strings are made of.)
367
368=item *
369
370I<byte>: a character in the range 0..255
371(A special case of a Perl character.)
372
373=item *
374
375I<octet>: 8 bits of data, with ordinal values 0..255
0ab8f81e 376(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
67d7b5ef 377
378=back
4411f3b6 379
67d7b5ef 380=head1 PERL ENCODING API
4411f3b6 381
7e19fb92 382=over 2
4411f3b6 383
b7a5c9de 384=item $octets = encode(ENCODING, $string [, CHECK])
4411f3b6 385
0ab8f81e 386Encodes a string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 387a sequence of octets. ENCODING can be either a canonical name or
0ab8f81e 388an alias. For encoding names and aliases, see L</"Defining Aliases">.
389For CHECK, see L</"Handling Malformed Data">.
4411f3b6 390
b7a5c9de 391For example, to convert a string from Perl's internal format to
6d1c0808 392iso-8859-1 (also known as Latin1),
681a7c68 393
b7a5c9de 394 $octets = encode("iso-8859-1", $string);
7e19fb92 395
b7a5c9de 396B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
397B<may not be equal to> $string. Though they both contain the same data, the utf8 flag
7e19fb92 398for $octets is B<always> off. When you encode anything, utf8 flag of
399the result is always off, even when it contains completely valid utf8
400string. See L</"The UTF-8 flag"> below.
681a7c68 401
4089adc4 402encode($valid_encoding, undef) is harmless but warns you for
403C<Use of uninitialized value in subroutine entry>.
404encode($valid_encoding, '') is harmless and warnless.
405
b7a5c9de 406=item $string = decode(ENCODING, $octets [, CHECK])
4411f3b6 407
0ab8f81e 408Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
409internal form and returns the resulting string. As in encode(),
410ENCODING can be either a canonical name or an alias. For encoding names
411and aliases, see L</"Defining Aliases">. For CHECK, see
47bfe92f 412L</"Handling Malformed Data">.
413
b7a5c9de 414For example, to convert ISO-8859-1 data to a string in Perl's internal format:
681a7c68 415
b7a5c9de 416 $string = decode("iso-8859-1", $octets);
681a7c68 417
b7a5c9de 418B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
419B<may not be equal to> $octets. Though they both contain the same data,
420the utf8 flag for $string is on unless $octets entirely consists of
7e19fb92 421ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
422below.
47bfe92f 423
4089adc4 424decode($valid_encoding, undef) is harmless but warns you for
425C<Use of uninitialized value in subroutine entry>.
426decode($valid_encoding, '') is harmless and warnless.
427
b7a5c9de 428=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
7e19fb92 429
b7a5c9de 430Converts B<in-place> data between two encodings. The data in $octets
431must be encoded as octets and not as characters in Perl's internal
432format. For example, to convert ISO-8859-1 data to Microsoft's CP1250 encoding:
2b106fbe 433
b7a5c9de 434 from_to($octets, "iso-8859-1", "cp1250");
2b106fbe 435
436and to convert it back:
437
b7a5c9de 438 from_to($octets, "cp1250", "iso-8859-1");
4411f3b6 439
ab97ca19 440Note that because the conversion happens in place, the data to be
0ab8f81e 441converted cannot be a string constant; it must be a scalar variable.
ab97ca19 442
b7a5c9de 443from_to() returns the length of the converted string in octets on success, undef
3ef515df 444otherwise.
445
b7a5c9de 446B<CAVEAT>: The following operations look the same but are not quite so;
7e19fb92 447
b7a5c9de 448 from_to($data, "iso-8859-1", "utf8"); #1
7e19fb92 449 $data = decode("iso-8859-1", $data); #2
4411f3b6 450
b7a5c9de 451Both #1 and #2 make $data consist of a completely valid UTF-8 string
7e19fb92 452but only #2 turns utf8 flag on. #1 is equivalent to
f2a2953c 453
7e19fb92 454 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 455
7e19fb92 456See L</"The UTF-8 flag"> below.
f2a2953c 457
458=item $octets = encode_utf8($string);
459
7e19fb92 460Equivalent to C<$octets = encode("utf8", $string);> The characters
b7a5c9de 461that comprise $string are encoded in Perl's internal format and the
462result is returned as a sequence of octets. All possible
7e19fb92 463characters have a UTF-8 representation so this function cannot fail.
464
f2a2953c 465
466=item $string = decode_utf8($octets [, CHECK]);
467
7e19fb92 468equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
b7a5c9de 469The sequence of octets represented by
7e19fb92 470$octets is decoded from UTF-8 into a sequence of logical
471characters. Not all sequences of octets form valid UTF-8 encodings, so
472it is possible for this call to fail. For CHECK, see
473L</"Handling Malformed Data">.
f2a2953c 474
475=back
476
51ef4e11 477=head2 Listing available encodings
478
5129552c 479 use Encode;
480 @list = Encode->encodings();
481
482Returns a list of the canonical names of the available encodings that
483are loaded. To get a list of all available encodings including the
484ones that are not loaded yet, say
485
486 @all_encodings = Encode->encodings(":all");
487
0ab8f81e 488Or you can give the name of a specific module.
5129552c 489
c731e18e 490 @with_jp = Encode->encodings("Encode::JP");
491
492When "::" is not in the name, "Encode::" is assumed.
51ef4e11 493
c731e18e 494 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 495
0ab8f81e 496To find out in detail which encodings are supported by this package,
5d030b67 497see L<Encode::Supported>.
51ef4e11 498
499=head2 Defining Aliases
500
0ab8f81e 501To add a new alias to a given encoding, use:
67d7b5ef 502
5129552c 503 use Encode;
504 use Encode::Alias;
a63c962f 505 define_alias(newName => ENCODING);
51ef4e11 506
3ef515df 507After that, newName can be used as an alias for ENCODING.
f2a2953c 508ENCODING may be either the name of an encoding or an
509I<encoding object>
51ef4e11 510
fcb875d4 511But before you do so, make sure the alias is nonexistent with
512C<resolve_alias()>, which returns the canonical name thereof.
513i.e.
514
515 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
516 Encode::resolve_alias("iso-8859-12") # false; nonexistent
517 Encode::resolve_alias($name) eq $name # true if $name is canonical
518
0ab8f81e 519resolve_alias() does not need C<use Encode::Alias>; it can be
520exported via C<use Encode qw(resolve_alias)>.
fcb875d4 521
0ab8f81e 522See L<Encode::Alias> for details.
51ef4e11 523
85982a32 524=head1 Encoding via PerlIO
4411f3b6 525
b7a5c9de 526If your perl supports I<PerlIO> (which is the default), you can use a PerlIO layer to decode
0ab8f81e 527and encode directly via a filehandle. The following two examples
528are totally identical in their functionality.
4411f3b6 529
85982a32 530 # via PerlIO
531 open my $in, "<:encoding(shiftjis)", $infile or die;
532 open my $out, ">:encoding(euc-jp)", $outfile or die;
b7a5c9de 533 while(<$in>){ print $out $_; }
8e86646e 534
85982a32 535 # via from_to
0ab8f81e 536 open my $in, "<", $infile or die;
537 open my $out, ">", $outfile or die;
b7a5c9de 538 while(<$in>){
0ab8f81e 539 from_to($_, "shiftjis", "euc-jp", 1);
b7a5c9de 540 print $out $_;
85982a32 541 }
4411f3b6 542
Unfortunately, not all encodings are PerlIO-savvy.  You can check
0ab8f81e 544if your encoding is supported by PerlIO by calling the C<perlio_ok>
545method.
546
547 Encode::perlio_ok("hz"); # False
548 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
549
550 use Encode qw(perlio_ok); # exported upon request
551 perlio_ok("euc-jp")
4411f3b6 552
0ab8f81e 553Fortunately, all encodings that come with Encode core are PerlIO-savvy
b7a5c9de 554except for hz and ISO-2022-kr. For gory details, see L<Encode::Encoding> and L<Encode::PerlIO>.
4411f3b6 555
85982a32 556=head1 Handling Malformed Data
4411f3b6 557
0ab8f81e 558The I<CHECK> argument is used as follows. When you omit it,
559the behaviour is the same as if you had passed a value of 0 for
560I<CHECK>.
47bfe92f 561
151b5d36 562=over 2
563
85982a32 564=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 565
0ab8f81e 566If I<CHECK> is 0, (en|de)code will put a I<substitution character>
567in place of a malformed character. For UCM-based encodings,
b7a5c9de 568E<lt>subcharE<gt> will be used. For Unicode, the code point C<0xFFFD> is used.
0ab8f81e 569If the data is supposed to be UTF-8, an optional lexical warning
570(category utf8) is given.
e9692b5b 571
7e19fb92 572=item I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 573
b7a5c9de 574If I<CHECK> is 1, methods will die on error immediately with an error
0ab8f81e 575message. Therefore, when I<CHECK> is set to 1, you should trap the
576fatal error with eval{} unless you really want to let it die on error.
47bfe92f 577
85982a32 578=item I<CHECK> = Encode::FB_QUIET
47bfe92f 579
85982a32 580If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
0ab8f81e 581return the portion of the data that has been processed so far when
582an error occurs. The data argument will be overwritten with
583everything after that point (that is, the unprocessed part of data).
584This is handy when you have to call decode repeatedly in the case
585where your source data may contain partial multi-byte character
586sequences, for example because you are reading with a fixed-width
587buffer. Here is some sample code that does exactly this:
4411f3b6 588
b7a5c9de 589 my $data = ''; my $utf8 = '';
  while(read $fh, $buffer, 256){
0ab8f81e 591 # buffer may end in a partial character so we append
85982a32 592 $data .= $buffer;
ee269af2 593 $utf8 .= decode($encoding, $data, Encode::FB_QUIET);
0ab8f81e 594 # $data now contains the unprocessed partial character
85982a32 595 }
1768d7eb 596
85982a32 597=item I<CHECK> = Encode::FB_WARN
67d7b5ef 598
0ab8f81e 599This is the same as above, except that it warns on error. Handy when
600you are debugging the mode above.
85982a32 601
602=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
603
af1f55d9 604=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
605
606=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
607
85982a32 608For encodings that are implemented by Encode::XS, CHECK ==
609Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
610
b7a5c9de 611When you decode, C<\xI<HH>> will be inserted for a malformed character,
612where I<HH> is the hex representation of the octet that could not be
613decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted,
614where I<HHHH> is the Unicode ID of the character that cannot be found
0ab8f81e 615in the character repertoire of the encoding.
85982a32 616
HTML/XML character reference modes are about the same; in place of
C<\x{I<HHHH>}>, HTML uses C<&#I<NNNN>;> where I<NNNN> is a decimal number, and
XML uses C<&#xI<HHHH>;> where I<HHHH> is a hexadecimal number.
af1f55d9 620
85982a32 621=item The bitmask
622
0ab8f81e 623These modes are actually set via a bitmask. Here is how the FB_XX
624constants are laid out. You can import the FB_XX constants via
625C<use Encode qw(:fallbacks)>; you can import the generic bitmask
626constants via C<use Encode qw(:fallback_all)>.
85982a32 627
b0b300a3 628 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
629 DIE_ON_ERR 0x0001 X
4089adc4 630 WARN_ON_ERR 0x0002 X
b0b300a3 631 RETURN_ON_ERR 0x0004 X X
632 LEAVE_SRC 0x0008
633 PERLQQ 0x0100 X
b7a5c9de 634 HTMLCREF 0x0200
635 XMLCREF 0x0400
67d7b5ef 636
151b5d36 637=back
638
0ab8f81e 639=head2 Unimplemented fallback schemes
67d7b5ef 640
0ab8f81e 641In the future, you will be able to use a code reference to a callback
f2a2953c 642function for the value of I<CHECK> but its API is still undecided.
67d7b5ef 643
982a4085 644The fallback scheme does not work on EBCDIC platforms.
645
67d7b5ef 646=head1 Defining Encodings
647
648To define a new encoding, use:
649
b7a5c9de 650 use Encode qw(define_encoding);
67d7b5ef 651 define_encoding($object, 'canonicalName' [, alias...]);
652
653I<canonicalName> will be associated with I<$object>. The object
0ab8f81e 654should provide the interface described in L<Encode::Encoding>.
67d7b5ef 655If more than two arguments are provided then additional
b7a5c9de 656arguments are taken as aliases for I<$object>.
67d7b5ef 657
f2a2953c 658See L<Encode::Encoding> for more details.
659
7e19fb92 660=head1 The UTF-8 flag
661
Before the introduction of utf8 support in perl, the C<eq> operator
b7a5c9de 663just compared the strings represented by two scalars. Beginning with
664perl 5.8, C<eq> compares two strings with simultaneous consideration
665of I<the utf8 flag>. To explain why we made it so, I will quote page
666402 of C<Programming Perl, 3rd ed.>
7e19fb92 667
668=over 2
669
670=item Goal #1:
671
672Old byte-oriented programs should not spontaneously break on the old
673byte-oriented data they used to work on.
674
675=item Goal #2:
676
677Old byte-oriented programs should magically start working on the new
678character-oriented data when appropriate.
679
680=item Goal #3:
681
682Programs should run just as fast in the new character-oriented mode
683as in the old byte-oriented mode.
684
685=item Goal #4:
686
687Perl should remain one language, rather than forking into a
688byte-oriented Perl and a character-oriented Perl.
689
690=back
691
692Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
693was born and many features documented in the book remained
b7a5c9de 694unimplemented for a long time. Perl 5.8 corrected this and the introduction
695of the UTF-8 flag is one of them. You can think of this perl notion as of a
696byte-oriented mode (utf8 flag off) and a character-oriented mode (utf8
7e19fb92 697flag on).
698
699Here is how Encode takes care of the utf8 flag.
700
4bdf5738 701=over 2
7e19fb92 702
703=item *
704
705When you encode, the resulting utf8 flag is always off.
706
151b5d36 707=item *
7e19fb92 708
b7a5c9de 709When you decode, the resulting utf8 flag is on unless you can
7e19fb92 710unambiguously represent data. Here is the definition of
711dis-ambiguity.
712
b7a5c9de 713After C<$utf8 = decode('foo', $octet);>,
7e19fb92 714
715 When $octet is... The utf8 flag in $utf8 is
716 ---------------------------------------------
717 In ASCII only (or EBCDIC only) OFF
718 In ISO-8859-1 ON
719 In any other Encoding ON
720 ---------------------------------------------
721
As you see, there is one exception: ASCII.  That way you can assume
Goal #1.  And with Encode, Goal #2 is assumed, but you still have to be
careful in the cases mentioned in the B<CAVEAT> paragraphs.
725
726This utf8 flag is not visible in perl scripts, exactly for the same
727reason you cannot (or you I<don't have to>) see if a scalar contains a
728string, integer, or floating point number. But you can still peek
729and poke these if you will. See the section below.
730
731=back
732
733=head2 Messing with Perl's Internals
4411f3b6 734
47bfe92f 735The following API uses parts of Perl's internals in the current
0ab8f81e 736implementation. As such, they are efficient but may change.
4411f3b6 737
7e19fb92 738=over 2
4411f3b6 739
a63c962f 740=item is_utf8(STRING [, CHECK])
4411f3b6 741
0ab8f81e 742[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
47bfe92f 743If CHECK is true, also checks the data in STRING for being well-formed
744UTF-8. Returns true if successful, false otherwise.
4411f3b6 745
As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
747
a63c962f 748=item _utf8_on(STRING)
4411f3b6 749
0ab8f81e 750[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
4411f3b6 751B<not> checked for being well-formed UTF-8. Do not use unless you
752B<know> that the STRING is well-formed UTF-8. Returns the previous
0ab8f81e 753state of the UTF-8 flag (so please don't treat the return value as
754indicating success or failure), or C<undef> if STRING is not a string.
4411f3b6 755
a63c962f 756=item _utf8_off(STRING)
4411f3b6 757
0ab8f81e 758[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
759Returns the previous state of the UTF-8 flag (so please don't treat the
760return value as indicating success or failure), or C<undef> if STRING is
4411f3b6 761not a string.
762
763=back
764
765=head1 SEE ALSO
766
5d030b67 767L<Encode::Encoding>,
768L<Encode::Supported>,
6d1c0808 769L<Encode::PerlIO>,
5d030b67 770L<encoding>,
6d1c0808 771L<perlebcdic>,
772L<perlfunc/open>,
773L<perlunicode>,
774L<utf8>,
5d030b67 775the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 776
85982a32 777=head1 MAINTAINER
aae85ceb 778
779This project was originated by Nick Ing-Simmons and later maintained
7e19fb92 780by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
781list of people involved. For any questions, use
b7a5c9de 782E<lt>perl-unicode@perl.orgE<gt> so we can all share.
aae85ceb 783
4411f3b6 784=cut