Subject: Re: Change 16122: Try to be clearer about perlio.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
7e19fb92 3our $VERSION = do { my @r = (q$Revision: 1.61 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
6d1c0808 5use XSLoader ();
6XSLoader::load 'Encode';
2c674647 7
2c674647 8require Exporter;
7e19fb92 9use base qw/Exporter/;
2c674647 10
# Export tables consumed by Exporter.
#
# @EXPORT      - the public, encouraged API; exported by default.
# @FB_FLAGS    - names of the individual CHECK bitmask flags.
# @FB_CONSTS   - names of the ready-made FB_* CHECK values.
# @EXPORT_OK   - everything that is exported only on request.
# %EXPORT_TAGS - the :all, :fallbacks and :fallback_all groups.

our @EXPORT = qw(decode decode_utf8 encode encode_utf8 encodings find_encoding);

our @FB_FLAGS = qw(
    DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ HTMLCREF XMLCREF
);
our @FB_CONSTS = qw(
    FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);

our @EXPORT_OK = (
    qw(_utf8_off _utf8_on define_encoding from_to is_16bit is_8bit),
    qw(is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade),
    @FB_FLAGS,
    @FB_CONSTS,
);

our %EXPORT_TAGS = (
    all          => [ @EXPORT, @EXPORT_OK ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);
38
4411f3b6 39# Documentation moved after __END__ for speed - NI-S
2c674647 40
bf230f3d 41use Carp;
42
a63c962f 43our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 44
5d030b67 45use Encode::Alias;
46
5129552c 47# Make a %Encoding package variable to allow a certain amount of cheating
48our %Encoding;
aae85ceb 49our %ExtModule;
50require Encode::Config;
51eval { require Encode::ConfigLocal };
5129552c 52
# encodings - list the names of the encodings registered in %Encoding.
#
# Written to be called as a class method, e.g. Encode->encodings(),
# Encode->encodings(":all") or Encode->encodings("Encode::JP", "EBCDIC").
# The first argument (the invocant) is always discarded.
#
# If the first remaining argument is ":all", every module listed in
# %ExtModule is loaded first; otherwise each remaining argument names a
# module to load, with "Encode/" prepended when the name has no "::".
# Loading is best-effort: a module that fails to load is skipped.
#
# Returns the keys of %Encoding, except "Internal" and "Unicode",
# sorted case-insensitively.
sub encodings
{
    my $class = shift;
    my @wanted = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;

    # Several encodings can map to the same module; handle each name once.
    my %seen;
    for my $mod (grep { !$seen{$_}++ } @wanted) {
        $mod =~ s,::,/,g or $mod = "Encode/$mod";
        $mod .= '.pm';
        $DEBUG and warn "about to require $mod;";
        # Still best-effort, but say why a module was skipped when debugging.
        eval { require $mod; 1 }
            or $DEBUG and warn "could not load $mod: $@";
    }

    return
        sort { lc $a cmp lc $b }
        grep { !/^(?:Internal|Unicode)$/ } keys %Encoding;
}
68
# perlio_ok - report whether an encoding can be used as a PerlIO layer.
#
# Takes either an encoding object (any reference is used as-is) or an
# encoding name, which is resolved with find_encoding().
# Returns whatever the object's own perlio_ok() method returns, or 0 when
# the name is unknown or the object has no such method.
sub perlio_ok{
    my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);
    # find_encoding() yields undef for an unknown name; calling ->can on
    # that would die, so answer "not ok" instead.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0; # safety net
}
74
# define_encoding($obj, $name [, @aliases])
#
# Registers $obj in %Encoding under $name.  When $name is not already
# all lower case, its lower-cased form is defined as an alias, and every
# further argument is defined as an alias for $obj as well.
# Returns $obj.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;

    $Encoding{$name} = $obj;

    my $folded = lc($name);
    if ($folded ne $name) {
        define_alias($folded => $obj);
    }

    for my $alias (@aliases) {
        define_alias($alias, $obj);
    }

    return $obj;
}
89
# getEncoding($class, $name [, $skip_external])
#
# Resolves $name to an encoding object.  Lookup order:
#   1. $name itself, if it is a reference that can('new_sequence');
#   2. %Encoding under $name, then under lc($name);
#   3. $class->find_alias() for $name, then for lc($name);
#   4. unless $skip_external is true, the module that %ExtModule lists
#      for $name or lc($name) is loaded and steps 1-3 are tried again.
# Returns the object, or nothing (empty list / undef) when not found.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;

    # An encoding object is passed straight through.
    return $name if ref($name) && $name->can('new_sequence');

    my $lc = lc $name;
    return $Encoding{$name} if exists $Encoding{$name};
    return $Encoding{$lc}   if exists $Encoding{$lc};

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    unless ($skip_external)
    {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
            $mod =~ s,::,/,g ; $mod .= '.pm';
            eval{ require $mod; };
            # The module may have been found via the lower-cased name and
            # may register its encoding under a differently-cased name or
            # an alias, so repeat the whole internal lookup rather than
            # only checking $Encoding{$name}.  Passing a true
            # $skip_external keeps this to a single retry.
            return $class->getEncoding($name, 1);
        }
    }
    return;
}
124
# find_encoding($name [, $skip_external])
#
# Function-style front end to getEncoding(): forwards both arguments to
# it, using this package as the class, and returns its result.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}
130
# resolve_alias($name)
#
# Looks $name up with find_encoding() and returns the name() of the
# encoding found, or nothing when no encoding is found.
sub resolve_alias {
    my $enc = find_encoding(shift);
    return unless defined $enc;
    return $enc->name;
}
136
# encode(ENCODING, $string [, CHECK])
#
# Resolves ENCODING with find_encoding() and asks the encoding object to
# encode $string.  A false CHECK is normalised to 0.
# Croaks when ENCODING is unknown.  Returns the octets, or undef when
# CHECK is true and the encoder left part of its input unconsumed.
sub encode($$;$)
{
    my ($name, $string, $check) = @_;
    $check = 0 unless $check;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    # The encoder works on our private copy of the string; whatever is
    # still in it afterwards was not converted.
    my $octets = $enc->encode($string, $check);
    return ($check && length($string)) ? undef : $octets;
}
147
# decode(ENCODING, $octets [, CHECK])
#
# Resolves ENCODING with find_encoding() and asks the encoding object to
# decode $octets.  A false CHECK is normalised to 0.
# Croaks when ENCODING is unknown.  Returns the decoded string.  When
# CHECK is true, the caller's second argument is overwritten with what
# the decoder left in its input.
sub decode($$;$)
{
    my ($name, $octets, $check) = @_;
    $check = 0 unless $check;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);
    if ($check) {
        # hand the remaining input back through the caller's variable
        $_[1] = $octets;
    }
    return $string;
}
158
# from_to($string, FROM_ENC, TO_ENC [, CHECK])
#
# Converts the caller's $string in place: decodes it with FROM_ENC, then
# encodes the result with TO_ENC.  A false CHECK is normalised to 0.
# Croaks when either encoding is unknown.  Returns undef when CHECK is
# true and either step left input unconsumed (the caller's variable is
# then left alone); otherwise stores the converted data in the caller's
# variable and returns its length, or undef if that data is undefined.
sub from_to($$$;$)
{
    my ($string, $from, $to, $check) = @_;
    $check = 0 unless $check;

    my $f = find_encoding($from);
    defined $f or croak("Unknown encoding '$from'");
    my $t = find_encoding($to);
    defined $t or croak("Unknown encoding '$to'");

    my $uni = $f->decode($string, $check);
    return undef if $check and length($string);

    $string = $t->encode($uni, $check);
    return undef if $check and length($uni);

    $_[0] = $string;
    return defined($_[0]) ? length($string) : undef;
}
173
# encode_utf8($string)
#
# Returns a copy of $string converted to UTF-8 octets with
# utf8::encode(); the caller's variable is not modified.
sub encode_utf8($)
{
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}
180
# decode_utf8($octets [, CHECK])
#
# Returns a copy of $octets decoded from UTF-8 with utf8::decode(), or
# undef when utf8::decode() reports failure.  The caller's variable is
# not modified.
#
# The prototype allows the optional CHECK argument so that the
# two-argument call form compiles; CHECK itself is currently not
# consulted - failure yields undef whatever its value.
sub decode_utf8($;$)
{
    my ($str) = @_;    # $_[1] (CHECK) is accepted but unused
    return undef unless utf8::decode($str);
    return $str;
}
187
predefine_encodings();

#
# predefine_encodings - install the built-in encoding classes and
# register one instance of each in %Encode::Encoding.  It is a sub so
# that it can be called again to restore %Encoding if really needed.
#
# With a true $ON_EBCDIC the "Unicode" entry is an Encode::UTF_EBCDIC
# object, otherwise an Encode::Internal object; the "utf8" entry is an
# Encode::utf8 object in both cases.
#
sub predefine_encodings{
    # Methods that are the same for every built-in class.
    my %common = (
        name         => sub{ shift->{'Name'} },
        new_sequence => sub{ return $_[0] },
        needs_lines  => sub{ 0 },
        perlio_ok    => sub {
            # usable as a layer exactly when PerlIO::encoding loads
            eval{ require PerlIO::encoding };
            return $@ ? 0 : 1;
        },
    );

    # Installs name => coderef pairs as subs of the given package.
    my $install = sub {
        my ($package, %method) = @_;
        no strict 'refs';
        *{"${package}::$_"} = $method{$_} for keys %method;
    };

    if ($ON_EBCDIC) {
        # Character-by-character mapping through the utf8:: helpers.
        $install->(
            'Encode::UTF_EBCDIC',
            %common,
            decode => sub{
                my ($obj,$str,$chk) = @_;
                my $res = join '',
                    map { chr(utf8::unicode_to_native(ord($_))) }
                    split //, $str;
                $_[1] = '' if $chk;    # all input consumed
                return $res;
            },
            encode => sub{
                my ($obj,$str,$chk) = @_;
                my $res = join '',
                    map { chr(utf8::native_to_unicode(ord($_))) }
                    split //, $str;
                $_[1] = '' if $chk;    # all input consumed
                return $res;
            },
        );
        $Encode::Encoding{Unicode} =
            bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    }
    else {
        # Both directions are the same operation: upgrade a copy of the
        # string and hand it back.
        my $upgrade = sub{
            my ($obj,$str,$chk) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;        # all input consumed
            return $str;
        };
        $install->(
            'Encode::Internal',
            %common,
            decode => $upgrade,
            encode => $upgrade,
        );
        $Encode::Encoding{Unicode} =
            bless {Name => "Internal"} => "Encode::Internal";
    }

    # utf8 is defined on every platform, on top of the Encode::*_utf8
    # functions.
    $install->(
        'Encode::utf8',
        %common,
        decode => sub{
            my ($obj,$octets,$chk) = @_;
            my $str = Encode::decode_utf8($octets);
            return undef unless defined $str;
            $_[1] = '' if $chk;        # all input consumed
            return $str;
        },
        encode => sub {
            my ($obj,$string,$chk) = @_;
            my $octets = Encode::encode_utf8($string);
            $_[1] = '' if $chk;        # all input consumed
            return $octets;
        },
    );
    $Encode::Encoding{utf8} =
        bless {Name => "utf8"} => "Encode::utf8";
}
276
656753f8 2771;
278
2a936312 279__END__
280
4411f3b6 281=head1 NAME
282
283Encode - character encodings
284
285=head1 SYNOPSIS
286
287 use Encode;
288
67d7b5ef 289=head2 Table of Contents
290
0ab8f81e 291Encode consists of a collection of modules whose details are too big
67d7b5ef 292to fit in one document. This POD itself explains the top-level APIs
6d1c0808 293and general topics at a glance. For other topics and more details,
0ab8f81e 294see the PODs below:
67d7b5ef 295
296 Name Description
297 --------------------------------------------------------
6d1c0808 298 Encode::Alias Alias definitions to encodings
67d7b5ef 299 Encode::Encoding Encode Implementation Base Class
300 Encode::Supported List of Supported Encodings
301 Encode::CN Simplified Chinese Encodings
302 Encode::JP Japanese Encodings
303 Encode::KR Korean Encodings
304 Encode::TW Traditional Chinese Encodings
305 --------------------------------------------------------
306
4411f3b6 307=head1 DESCRIPTION
308
47bfe92f 309The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 310and the rest of the system. Perl strings are sequences of
311B<characters>.
312
313The repertoire of characters that Perl can represent is at least that
314defined by the Unicode Consortium. On most platforms the ordinal
315values of the characters (as returned by C<ord(ch)>) is the "Unicode
316codepoint" for the character (the exceptions are those platforms where
317the legacy encoding is some variant of EBCDIC rather than a super-set
318of ASCII - see L<perlebcdic>).
319
0ab8f81e 320Traditionally, computer data has been moved around in 8-bit chunks
67d7b5ef 321often called "bytes". These chunks are also known as "octets" in
322networking standards. Perl is widely used to manipulate data of many
323types - not only strings of characters representing human or computer
0ab8f81e 324languages but also "binary" data being the machine's representation of
67d7b5ef 325numbers, pixels in an image - or just about anything.
326
0ab8f81e 327When Perl is processing "binary data", the programmer wants Perl to
67d7b5ef 328process "sequences of bytes". This is not a problem for Perl - as a
0ab8f81e 329byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef 330"logical character".
331
332=head2 TERMINOLOGY
4411f3b6 333
7e19fb92 334=over 2
21938dfa 335
67d7b5ef 336=item *
337
338I<character>: a character in the range 0..(2**32-1) (or more).
339(What Perl's strings are made of.)
340
341=item *
342
343I<byte>: a character in the range 0..255
344(A special case of a Perl character.)
345
346=item *
347
348I<octet>: 8 bits of data, with ordinal values 0..255
0ab8f81e 349(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
67d7b5ef 350
351=back
4411f3b6 352
67d7b5ef 353The marker [INTERNAL] marks Internal Implementation Details, in
354general meant only for those who think they know what they are doing,
355and such details may change in future releases.
356
357=head1 PERL ENCODING API
4411f3b6 358
7e19fb92 359=over 2
4411f3b6 360
f2a2953c 361=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6 362
0ab8f81e 363Encodes a string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 364a sequence of octets. ENCODING can be either a canonical name or
0ab8f81e 365an alias. For encoding names and aliases, see L</"Defining Aliases">.
366For CHECK, see L</"Handling Malformed Data">.
4411f3b6 367
0ab8f81e 368For example, to convert (internally UTF-8 encoded) Unicode string to
6d1c0808 369iso-8859-1 (also known as Latin1),
681a7c68 370
7e19fb92 371 $octets = encode("iso-8859-1", $utf8);
372
373B<CAVEAT>: When you C<$octets = encode("utf8", $utf8)>, then $octets
374B<ne> $utf8. Though they both contain the same data, the utf8 flag
375for $octets is B<always> off. When you encode anything, utf8 flag of
376the result is always off, even when it contains completely valid utf8
377string. See L</"The UTF-8 flag"> below.
681a7c68 378
f2a2953c 379=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6 380
0ab8f81e 381Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
382internal form and returns the resulting string. As in encode(),
383ENCODING can be either a canonical name or an alias. For encoding names
384and aliases, see L</"Defining Aliases">. For CHECK, see
47bfe92f 385L</"Handling Malformed Data">.
386
0ab8f81e 387For example, to convert ISO-8859-1 data to UTF-8:
681a7c68 388
67d7b5ef 389 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 390
7e19fb92 391B<CAVEAT>: When you C<$utf8 = decode("utf8", $octets)>, then $utf8
392B<may not be equal to> $octets. Though they both contain the same data,
393the utf8 flag for $utf8 is on unless $octets entirely consists of
394ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
395below.
47bfe92f 396
7e19fb92 397=item [$length =] from_to($string, FROM_ENC, TO_ENC [, CHECK])
398
399Converts B<in-place> data between two encodings. For example, to
400convert ISO-8859-1 data to UTF-8:
2b106fbe 401
7e19fb92 402 from_to($data, "iso-8859-1", "utf8");
2b106fbe 403
404and to convert it back:
405
7e19fb92 406 from_to($data, "utf8", "iso-8859-1");
4411f3b6 407
ab97ca19 408Note that because the conversion happens in place, the data to be
0ab8f81e 409converted cannot be a string constant; it must be a scalar variable.
ab97ca19 410
0ab8f81e 411from_to() returns the length of the converted string on success, undef
3ef515df 412otherwise.
413
7e19fb92 414B<CAVEAT>: The following operations look the same but are not quite so:
415
416 from_to($data, "iso-8859-1", "utf8"); #1
417 $data = decode("iso-8859-1", $data); #2
4411f3b6 418
7e19fb92 419Both #1 and #2 make $data consist of a completely valid UTF-8 string,
420but only #2 turns the utf8 flag on. #1 is equivalent to
f2a2953c 421
7e19fb92 422 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 423
7e19fb92 424See L</"The UTF-8 flag"> below.
f2a2953c 425
426=item $octets = encode_utf8($string);
427
7e19fb92 428Equivalent to C<$octets = encode("utf8", $string);> The characters
429that comprise $string are encoded in Perl's superset of UTF-8 and the
430resulting octets are returned as a sequence of bytes. All possible
431characters have a UTF-8 representation so this function cannot fail.
432
f2a2953c 433
434=item $string = decode_utf8($octets [, CHECK]);
435
7e19fb92 436Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
437The sequence of octets represented by
438$octets is decoded from UTF-8 into a sequence of logical
439characters. Not all sequences of octets form valid UTF-8 encodings, so
440it is possible for this call to fail. For CHECK, see
441L</"Handling Malformed Data">.
f2a2953c 442
443=back
444
51ef4e11 445=head2 Listing available encodings
446
5129552c 447 use Encode;
448 @list = Encode->encodings();
449
450Returns a list of the canonical names of the available encodings that
451are loaded. To get a list of all available encodings including the
452ones that are not loaded yet, say
453
454 @all_encodings = Encode->encodings(":all");
455
0ab8f81e 456Or you can give the name of a specific module.
5129552c 457
c731e18e 458 @with_jp = Encode->encodings("Encode::JP");
459
460When "::" is not in the name, "Encode::" is assumed.
51ef4e11 461
c731e18e 462 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 463
0ab8f81e 464To find out in detail which encodings are supported by this package,
5d030b67 465see L<Encode::Supported>.
51ef4e11 466
467=head2 Defining Aliases
468
0ab8f81e 469To add a new alias to a given encoding, use:
67d7b5ef 470
5129552c 471 use Encode;
472 use Encode::Alias;
a63c962f 473 define_alias(newName => ENCODING);
51ef4e11 474
3ef515df 475After that, newName can be used as an alias for ENCODING.
f2a2953c 476ENCODING may be either the name of an encoding or an
477I<encoding object>
51ef4e11 478
fcb875d4 479But before you do so, make sure the alias is nonexistent with
480C<resolve_alias()>, which returns the canonical name thereof.
481i.e.
482
483 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
484 Encode::resolve_alias("iso-8859-12") # false; nonexistent
485 Encode::resolve_alias($name) eq $name # true if $name is canonical
486
0ab8f81e 487resolve_alias() does not need C<use Encode::Alias>; it can be
488exported via C<use Encode qw(resolve_alias)>.
fcb875d4 489
0ab8f81e 490See L<Encode::Alias> for details.
51ef4e11 491
85982a32 492=head1 Encoding via PerlIO
4411f3b6 493
0ab8f81e 494If your perl supports I<PerlIO>, you can use a PerlIO layer to decode
495and encode directly via a filehandle. The following two examples
496are totally identical in their functionality.
4411f3b6 497
85982a32 498 # via PerlIO
499 open my $in, "<:encoding(shiftjis)", $infile or die;
500 open my $out, ">:encoding(euc-jp)", $outfile or die;
501 while(<$in>){ print $out $_; }
8e86646e 502
85982a32 503 # via from_to
0ab8f81e 504 open my $in, "<", $infile or die;
505 open my $out, ">", $outfile or die;
6d1c0808 506 while(<$in>){
0ab8f81e 507 from_to($_, "shiftjis", "euc-jp", 1); print $out $_;
85982a32 508 }
4411f3b6 509
0ab8f81e 510Unfortunately, there may be encodings that are not PerlIO-savvy. You can check
511if your encoding is supported by PerlIO by calling the C<perlio_ok>
512method.
513
514 Encode::perlio_ok("hz"); # False
515 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
516
517 use Encode qw(perlio_ok); # exported upon request
518 perlio_ok("euc-jp")
4411f3b6 519
0ab8f81e 520Fortunately, all encodings that come with Encode core are PerlIO-savvy
521except for hz and ISO-2022-kr. See L<Encode::Encoding> for details.
4411f3b6 522
0ab8f81e 523For gory details, see L<Encode::PerlIO>.
4411f3b6 524
85982a32 525=head1 Handling Malformed Data
4411f3b6 526
7e19fb92 527=over 2
47bfe92f 528
0ab8f81e 529The I<CHECK> argument is used as follows. When you omit it,
530the behaviour is the same as if you had passed a value of 0 for
531I<CHECK>.
47bfe92f 532
85982a32 533=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 534
0ab8f81e 535If I<CHECK> is 0, (en|de)code will put a I<substitution character>
536in place of a malformed character. For UCM-based encodings,
537E<lt>subcharE<gt> will be used. For Unicode, "\x{FFFD}" is used.
538If the data is supposed to be UTF-8, an optional lexical warning
539(category utf8) is given.
e9692b5b 540
7e19fb92 541=item I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 542
0ab8f81e 543If I<CHECK> is 1, methods will die immediately with an error
544message. Therefore, when I<CHECK> is set to 1, you should trap the
545fatal error with eval{} unless you really want to let it die on error.
47bfe92f 546
85982a32 547=item I<CHECK> = Encode::FB_QUIET
47bfe92f 548
85982a32 549If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
0ab8f81e 550return the portion of the data that has been processed so far when
551an error occurs. The data argument will be overwritten with
552everything after that point (that is, the unprocessed part of data).
553This is handy when you have to call decode repeatedly in the case
554where your source data may contain partial multi-byte character
555sequences, for example because you are reading with a fixed-width
556buffer. Here is some sample code that does exactly this:
4411f3b6 557
85982a32 558 my $data = '';
559 while(read $fh, $buffer, 256){
0ab8f81e 560 # buffer may end in a partial character so we append
85982a32 561 $data .= $buffer;
562 $utf8 .= decode($encoding, $data, Encode::FB_QUIET);
0ab8f81e 563 # $data now contains the unprocessed partial character
85982a32 564 }
1768d7eb 565
85982a32 566=item I<CHECK> = Encode::FB_WARN
67d7b5ef 567
0ab8f81e 568This is the same as above, except that it warns on error. Handy when
569you are debugging the mode above.
85982a32 570
571=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
572
af1f55d9 573=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
574
575=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
576
85982a32 577For encodings that are implemented by Encode::XS, CHECK ==
578Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
579
0ab8f81e 580When you decode, '\xI<XX>' will be inserted for a malformed character,
581where I<XX> is the hex representation of the octet that could not be
582decoded to utf8. And when you encode, '\x{I<xxxx>}' will be inserted,
583where I<xxxx> is the Unicode ID of the character that cannot be found
584in the character repertoire of the encoding.
85982a32 585
af1f55d9 586HTML/XML character reference modes are about the same; in place of
587\x{I<xxxx>}, HTML uses &#I<1234>; where I<1234> is a decimal number and
588XML uses &#xI<abcd>; where I<abcd> is a hexadecimal number.
589
85982a32 590=item The bitmask
591
0ab8f81e 592These modes are actually set via a bitmask. Here is how the FB_XX
593constants are laid out. You can import the FB_XX constants via
594C<use Encode qw(:fallbacks)>; you can import the generic bitmask
595constants via C<use Encode qw(:fallback_all)>.
85982a32 596
b0b300a3 597 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
598 DIE_ON_ERR 0x0001 X
599 WARN_ON_ERR 0x0002 X
600 RETURN_ON_ERR 0x0004 X X
601 LEAVE_SRC 0x0008
602 PERLQQ 0x0100 X
af1f55d9 603 HTMLCREF 0x0200
604 XMLCREF 0x0400
67d7b5ef 605
0ab8f81e 606=head2 Unimplemented fallback schemes
67d7b5ef 607
0ab8f81e 608In the future, you will be able to use a code reference to a callback
f2a2953c 609function for the value of I<CHECK> but its API is still undecided.
67d7b5ef 610
611=head1 Defining Encodings
612
613To define a new encoding, use:
614
615 use Encode qw(define_encoding);
616 define_encoding($object, 'canonicalName' [, alias...]);
617
618I<canonicalName> will be associated with I<$object>. The object
0ab8f81e 619should provide the interface described in L<Encode::Encoding>.
67d7b5ef 620If more than two arguments are provided then additional
0ab8f81e 621arguments are taken as aliases for I<$object>, as for C<define_alias>.
67d7b5ef 622
f2a2953c 623See L<Encode::Encoding> for more details.
624
7e19fb92 625=head1 The UTF-8 flag
626
627Before the introduction of utf8 support in perl, the C<eq> operator
628just compared the internal data of the scalars. Now C<eq> means internal
629data equality AND I<the utf8 flag>. To explain why we made it so, I
630will quote page 402 of C<Programming Perl, 3rd ed.>
631
632=over 2
633
634=item Goal #1:
635
636Old byte-oriented programs should not spontaneously break on the old
637byte-oriented data they used to work on.
638
639=item Goal #2:
640
641Old byte-oriented programs should magically start working on the new
642character-oriented data when appropriate.
643
644=item Goal #3:
645
646Programs should run just as fast in the new character-oriented mode
647as in the old byte-oriented mode.
648
649=item Goal #4:
650
651Perl should remain one language, rather than forking into a
652byte-oriented Perl and a character-oriented Perl.
653
654=back
655
656Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
657had been born, and many features documented in the book remained
658unimplemented. Perl 5.8 hopefully corrects this, and the introduction
659of the UTF-8 flag is one of those corrections. You can think of this as
660perl's notion of a byte-oriented mode (utf8 flag off) and a
661character-oriented mode (utf8 flag on).
662
663Here is how Encode takes care of the utf8 flag.
664
4bdf5738 665=over 2
7e19fb92 666
667=item *
668
669When you encode, the resulting utf8 flag is always off.
670
671=item *
672
673When you decode, the resulting utf8 flag is on unless you can
674unambiguously represent data. Here is the definition of
675dis-ambiguity.
676
677 After C<$utf8 = decode('foo', $octet);>,
678
679 When $octet is... The utf8 flag in $utf8 is
680 ---------------------------------------------
681 In ASCII only (or EBCDIC only) OFF
682 In ISO-8859-1 ON
683 In any other Encoding ON
684 ---------------------------------------------
685
686As you see, there is one exception: ASCII. That way you can assume
687Goal #1. And with Encode, Goal #2 is assumed, but you still have to be
688careful in the cases mentioned in the B<CAVEAT> paragraphs.
689
690This utf8 flag is not visible in perl scripts, exactly for the same
691reason you cannot (or you I<don't have to>) see if a scalar contains a
692string, integer, or floating point number. But you can still peek
693and poke these if you will. See the section below.
694
695=back
696
697=head2 Messing with Perl's Internals
4411f3b6 698
47bfe92f 699The following API uses parts of Perl's internals in the current
0ab8f81e 700implementation. As such, they are efficient but may change.
4411f3b6 701
7e19fb92 702=over 2
4411f3b6 703
a63c962f 704=item is_utf8(STRING [, CHECK])
4411f3b6 705
0ab8f81e 706[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
47bfe92f 707If CHECK is true, also checks the data in STRING for being well-formed
708UTF-8. Returns true if successful, false otherwise.
4411f3b6 709
a63c962f 710=item _utf8_on(STRING)
4411f3b6 711
0ab8f81e 712[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
4411f3b6 713B<not> checked for being well-formed UTF-8. Do not use unless you
714B<know> that the STRING is well-formed UTF-8. Returns the previous
0ab8f81e 715state of the UTF-8 flag (so please don't treat the return value as
716indicating success or failure), or C<undef> if STRING is not a string.
4411f3b6 717
a63c962f 718=item _utf8_off(STRING)
4411f3b6 719
0ab8f81e 720[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
721Returns the previous state of the UTF-8 flag (so please don't treat the
722return value as indicating success or failure), or C<undef> if STRING is
4411f3b6 723not a string.
724
725=back
726
727=head1 SEE ALSO
728
5d030b67 729L<Encode::Encoding>,
730L<Encode::Supported>,
6d1c0808 731L<Encode::PerlIO>,
5d030b67 732L<encoding>,
6d1c0808 733L<perlebcdic>,
734L<perlfunc/open>,
735L<perlunicode>,
736L<utf8>,
5d030b67 737the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 738
85982a32 739=head1 MAINTAINER
aae85ceb 740
741This project was originated by Nick Ing-Simmons and later maintained
7e19fb92 742by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
743list of people involved. For any questions, use
744E<lt>perl-unicode@perl.orgE<gt> so we can all share.
aae85ceb 745
4411f3b6 746=cut