Upgrade to Encode-2.17
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
10c5ecbb 1#
5a1dbf39 2# $Id: Encode.pm,v 2.17 2006/05/09 17:10:42 dankogai Exp dankogai $
10c5ecbb 3#
package Encode;
use strict;

# The version number is derived from the RCS revision keyword.
our $VERSION = sprintf "%d.%02d", q$Revision: 2.17 $ =~ /(\d+)/g;

# Compile-time debugging switch; flip to 1 to get diagnostic warnings.
sub DEBUG () { 0 }

# Load the XS half of the module, checking it against $VERSION.
use XSLoader ();
XSLoader::load( __PACKAGE__, $VERSION );

require Exporter;
use base 'Exporter';

# Public, encouraged API is exported by default.
our @EXPORT = qw(
  decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
  encodings  find_encoding clone_encoding
);

# Individual bits of the CHECK bitmask.
our @FB_FLAGS = qw(
  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);

# Ready-made CHECK values (combinations of the bits above).
our @FB_CONSTS = qw(
  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);

# Everything that is available on request only.
our @EXPORT_OK = (
    qw(
      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
    @FB_FLAGS,
    @FB_CONSTS,
);

our %EXPORT_TAGS = (
    all          => [ @EXPORT,    @EXPORT_OK ],
    fallbacks    => [@FB_CONSTS],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);

# Documentation moved after __END__ for speed - NI-S

# True when the native character set is EBCDIC ("A" is 193 there).
our $ON_EBCDIC = ( ord("A") == 193 );

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating.
# %Encoding maps names to encoding objects; %ExtModule maps names to the
# module that provides them (compared against module names in encodings()
# and require'd on demand in getEncoding()).
our %Encoding;
our %ExtModule;
require Encode::Config;

# A site-local configuration is optional, so a failure to load is ignored.
eval { require Encode::ConfigLocal };
5129552c 53
# Class method: list the names of the known encodings, sorted without
# regard to case.
#
#   Encode->encodings()          names registered in %Encoding
#   Encode->encodings(":all")    the above plus every name in %ExtModule
#   Encode->encodings(@modules)  the above plus the names provided by the
#                                given modules; "Encode::" is prepended to
#                                any module name that contains no "::"
#
# The pseudo-encodings Internal, Unicode and Guess are never listed.
sub encodings {
    my $class = shift;
    my %found;
    if ( @_ and $_[0] eq ":all" ) {
        %found = ( %Encoding, %ExtModule );
    }
    else {
        %found = %Encoding;
        my @modules = map { m/::/o ? $_ : "Encode::$_" } @_;
        for my $module (@modules) {
            DEBUG and warn $module;
            for my $name ( keys %ExtModule ) {
                $found{$name} = $module if $ExtModule{$name} eq $module;
            }
        }
    }
    return sort { lc $a cmp lc $b }
      grep { !/^(?:Internal|Unicode|Guess)$/o } keys %found;
}
72
# Report whether an encoding can be used as a PerlIO layer.
#
# The argument is either an encoding object or an encoding name/alias.
# Returns whatever the object's own perlio_ok() method returns, or 0 when
# the encoding is unknown or the object has no such method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # An unknown name yields no object at all; answer "no" instead of dying
    # on a method call against undef.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
78
# Register an encoding object under its canonical name.
#
#   define_encoding($obj, $name, @aliases)
#
# The object is stored in %Encoding under $name.  When $name is not already
# all lower case, its lower-cased form is defined as an alias, and so is
# every additional argument.  Returns the object.
sub define_encoding {
    my ( $obj, $name, @aliases ) = @_;
    $Encoding{$name} = $obj;

    my $lc = lc($name);
    define_alias( $lc => $obj ) if $lc ne $name;

    for my $alias (@aliases) {
        define_alias( $alias, $obj );
    }
    return $obj;
}
91
# Class method: resolve a name (or object) to an encoding object.
#
#   $class->getEncoding($name [, $skip_external])
#
# Lookup order:
#   1. $name is already an object that can 'renew' - returned as is
#   2. %Encoding, by exact name and then by lower-cased name
#   3. aliases, by exact name and then by lower-cased name
#   4. unless $skip_external is true, the module listed in %ExtModule is
#      loaded and %Encoding is consulted again
#
# Returns the encoding object, or nothing (empty list / undef) when the
# name is undefined or cannot be resolved.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    # Nothing can be looked up without a name.
    defined $name or return;

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # A module that fails to load simply leaves the name unresolved.
            eval { require $mod; };
            exists $Encoding{$name} and return $Encoding{$name};

            # The module may have been found through the lower-cased key, in
            # which case the encoding is registered under that spelling.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}
115
# Functional front end to Encode->getEncoding().
#
#   find_encoding($name [, $skip_external])
#
# Returns whatever getEncoding() returns for the same two arguments.
sub find_encoding($;$) {
    my ( $wanted, $no_external ) = @_;
    return __PACKAGE__->getEncoding( $wanted, $no_external );
}
120
# Map an encoding name or alias to the name reported by its encoding object.
# Returns nothing when find_encoding() cannot resolve the argument.
sub resolve_alias($) {
    my $enc = find_encoding(shift);
    return unless defined $enc;
    return $enc->name;
}
126
# Return a deep copy (via Storable::dclone) of the encoding object found for
# the given name.  Returns nothing when the lookup does not yield a
# reference or when Storable cannot be loaded.
sub clone_encoding($) {
    my $enc = find_encoding(shift);
    return unless ref $enc;
    eval { require Storable };
    return if $@;
    return Storable::dclone($enc);
}
134
# Encode a Perl string into octets.
#
#   $octets = encode(ENCODING, $string [, CHECK])
#
# ENCODING is a name, an alias or an encoding object.  CHECK is either a
# bitmask built from the FB_* / *_ON_ERR constants or a code reference that
# supplies the fallback for unmappable characters; it defaults to 0.
#
# Returns undef when $string is undefined and croaks on an unknown encoding.
# With a true, non-reference CHECK that lacks the LEAVE_SRC bit, the
# caller's $string is overwritten with what the encoder left of its input.
sub encode($$;$) {
    my ( $name, $string, $check ) = @_;
    return undef unless defined $string;
    $string .= '' if ref $string;    # stringify;
    $check ||= 0;
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $octets = $enc->encode( $string, $check );

    # A coderef CHECK is not a bitmask: masking it would test bits of the
    # reference's address, so only numeric CHECK values are examined here.
    $_[1] = $string
      if $check
      and !ref $check
      and !( $check & LEAVE_SRC() );
    return $octets;
}
*str2bytes = \&encode;
4411f3b6 150
# Decode octets into a Perl string.
#
#   $string = decode(ENCODING, $octets [, CHECK])
#
# ENCODING is a name, an alias or an encoding object.  CHECK is either a
# bitmask built from the FB_* / *_ON_ERR constants or a code reference that
# supplies the fallback for malformed input; it defaults to 0.
#
# Returns undef when $octets is undefined and croaks on an unknown encoding.
# With a true, non-reference CHECK that lacks the LEAVE_SRC bit, the
# caller's $octets is overwritten with what the decoder left of its input.
sub decode($$;$) {
    my ( $name, $octets, $check ) = @_;
    return undef unless defined $octets;
    $octets .= '' if ref $octets;    # stringify
    $check ||= 0;
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $string = $enc->decode( $octets, $check );

    # A coderef CHECK is not a bitmask: masking it would test bits of the
    # reference's address, so only numeric CHECK values are examined here.
    $_[1] = $octets
      if $check
      and !ref $check
      and !( $check & LEAVE_SRC() );
    return $string;
}
*bytes2str = \&decode;
4411f3b6 166
# Convert octets from one encoding to another, in place.
#
#   [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
#
# The first argument is overwritten with the converted data.  CHECK is
# passed to the encoding step only.  Croaks when either encoding is unknown.
# Returns undef when the input is undefined, when CHECK is true and the
# encoder left part of the intermediate string unconsumed, or when the
# result is undefined; otherwise returns the length of the result.
sub from_to($$$;$) {
    my ( $string, $from, $to, $check ) = @_;
    return undef unless defined $string;
    $check ||= 0;

    # Resolve the source first and the target second, failing on the first
    # one that is unknown.
    my @codec;
    for my $label ( $from, $to ) {
        my $found = find_encoding($label);
        unless ( defined $found ) {
            require Carp;
            Carp::croak("Unknown encoding '$label'");
        }
        push @codec, $found;
    }
    my ( $source, $target ) = @codec;

    my $chars = $source->decode($string);
    $_[0] = $string = $target->encode( $chars, $check );
    return undef if $check && length($chars);
    return defined( $_[0] ) ? length($string) : undef;
}
186
# Return a copy of the argument converted to UTF-8 octets with
# utf8::encode().  The caller's variable is left untouched.
sub encode_utf8($) {
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}
192
# Decode UTF-8 octets into a Perl string.
#
#   $string = decode_utf8($octets [, CHECK])
#
# A string that already has the utf8 flag set is returned unchanged.
# Anything else is handed to decode("utf8", ...).  The work is done on a
# copy, so the caller's variable is never modified, whatever CHECK says.
sub decode_utf8($;$) {
    my ( $str, $check ) = @_;
    return $str if is_utf8($str);

    # decode() itself turns a missing or false CHECK into 0, so a single
    # call covers both the checked and the unchecked case.
    return decode( "utf8", $str, $check );
}
204
predefine_encodings(1);

#
# This is to restore %Encoding if really needed;
#

# Install the built-in encodings into %Encode::Encoding.
#
#   predefine_encodings($use_xs)
#
# Registers "Unicode" (backed by Encode::UTF_EBCDIC on EBCDIC platforms and
# by Encode::Internal everywhere else), "utf8" and "utf-8-strict".  When
# $use_xs is true, Encode::utf8 uses the XS decode_xs/encode_xs routines;
# otherwise pure-Perl fallbacks are installed.
sub predefine_encodings {
    use Encode::Encoding;
    no warnings 'redefine';
    my $use_xs = shift;
    if ($ON_EBCDIC) {

        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
        *decode = sub {
            my ( $obj, $str, $chk ) = @_;
            my $res = '';
            for ( my $i = 0 ; $i < length($str) ; $i++ ) {
                $res .=
                  chr(
                    utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
                  );
            }
            $_[1] = '' if $chk;
            return $res;
        };
        *encode = sub {
            my ( $obj, $str, $chk ) = @_;
            my $res = '';
            for ( my $i = 0 ; $i < length($str) ; $i++ ) {
                $res .=
                  chr(
                    utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
                  );
            }
            $_[1] = '' if $chk;
            return $res;
        };
        $Encode::Encoding{Unicode} =
          bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    }
    else {

        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';

        # Encoding and decoding are the same operation here: upgrade the
        # string and, when a check value is given, consume the source.
        *decode = sub {
            my ( $obj, $str, $chk ) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;
            return $str;
        };
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
          bless { Name => "Internal" } => "Encode::Internal";
    }

    {

        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';

        #
        if ($use_xs) {
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }
        else {
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            *decode = sub {
                my ( $obj, $octets, $chk ) = @_;
                return undef unless defined $octets;
                my $str = $octets;

                # Decode directly.  Going through Encode::decode_utf8()
                # would dispatch back to this very method via
                # decode("utf8", ...) and never return.  Strings that are
                # already flagged as utf8 pass through unchanged, which is
                # what decode_utf8() does with them.
                unless ( Encode::is_utf8($str) ) {
                    utf8::decode($str) or return undef;
                }
                $_[1] = '' if $chk;
                return $str;
            };
            *encode = sub {
                my ( $obj, $string, $chk ) = @_;
                my $octets = Encode::encode_utf8($string);
                $_[1] = '' if $chk;
                return $octets;
            };
        }
        *cat_decode = sub {    # ($obj, $dst, $src, $pos, $trm, $chk)
                               # currently ignores $chk
            my ( $obj, undef, undef, $pos, $trm ) = @_;

            # References to the caller's $dst, $src and $pos, so that they
            # can be updated in place.
            my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
            use bytes;
            if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {

                # Terminator found: append everything up to and including
                # it, and move the position just past it.
                $$rdst .=
                  substr( $$rsrc, $pos, $npos - $pos + length($trm) );
                $$rpos = $npos + length($trm);
                return 1;
            }

            # No terminator: append the remainder and report "not found".
            $$rdst .= substr( $$rsrc, $pos );
            $$rpos = length($$rsrc);
            return '';
        };
        $Encode::Encoding{utf8} =
          bless { Name => "utf8" } => "Encode::utf8";
        $Encode::Encoding{"utf-8-strict"} =
          bless { Name => "utf-8-strict", strict_utf8 => 1 } =>
          "Encode::utf8";
    }
}

1;
316
2a936312 317__END__
318
4411f3b6 319=head1 NAME
320
321Encode - character encodings
322
323=head1 SYNOPSIS
324
325 use Encode;
326
67d7b5ef 327=head2 Table of Contents
328
0ab8f81e 329Encode consists of a collection of modules whose details are too big
67d7b5ef 330to fit in one document. This POD itself explains the top-level APIs
6d1c0808 331and general topics at a glance. For other topics and more details,
0ab8f81e 332see the PODs below:
67d7b5ef 333
334 Name Description
335 --------------------------------------------------------
6d1c0808 336 Encode::Alias Alias definitions to encodings
67d7b5ef 337 Encode::Encoding Encode Implementation Base Class
338 Encode::Supported List of Supported Encodings
339 Encode::CN Simplified Chinese Encodings
340 Encode::JP Japanese Encodings
341 Encode::KR Korean Encodings
342 Encode::TW Traditional Chinese Encodings
343 --------------------------------------------------------
344
4411f3b6 345=head1 DESCRIPTION
346
47bfe92f 347The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 348and the rest of the system. Perl strings are sequences of
349B<characters>.
350
351The repertoire of characters that Perl can represent is at least that
352defined by the Unicode Consortium. On most platforms the ordinal
353values of the characters (as returned by C<ord(ch)>) is the "Unicode
354codepoint" for the character (the exceptions are those platforms where
355the legacy encoding is some variant of EBCDIC rather than a super-set
356of ASCII - see L<perlebcdic>).
357
0ab8f81e 358Traditionally, computer data has been moved around in 8-bit chunks
67d7b5ef 359often called "bytes". These chunks are also known as "octets" in
360networking standards. Perl is widely used to manipulate data of many
361types - not only strings of characters representing human or computer
0ab8f81e 362languages but also "binary" data being the machine's representation of
67d7b5ef 363numbers, pixels in an image - or just about anything.
364
0ab8f81e 365When Perl is processing "binary data", the programmer wants Perl to
67d7b5ef 366process "sequences of bytes". This is not a problem for Perl - as a
0ab8f81e 367byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef 368"logical character".
369
370=head2 TERMINOLOGY
4411f3b6 371
7e19fb92 372=over 2
21938dfa 373
67d7b5ef 374=item *
375
376I<character>: a character in the range 0..(2**32-1) (or more).
377(What Perl's strings are made of.)
378
379=item *
380
381I<byte>: a character in the range 0..255
382(A special case of a Perl character.)
383
384=item *
385
386I<octet>: 8 bits of data, with ordinal values 0..255
0ab8f81e 387(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
67d7b5ef 388
389=back
4411f3b6 390
67d7b5ef 391=head1 PERL ENCODING API
4411f3b6 392
7e19fb92 393=over 2
4411f3b6 394
b7a5c9de 395=item $octets = encode(ENCODING, $string [, CHECK])
4411f3b6 396
0ab8f81e 397Encodes a string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 398a sequence of octets. ENCODING can be either a canonical name or
0ab8f81e 399an alias. For encoding names and aliases, see L</"Defining Aliases">.
400For CHECK, see L</"Handling Malformed Data">.
4411f3b6 401
b7a5c9de 402For example, to convert a string from Perl's internal format to
6d1c0808 403iso-8859-1 (also known as Latin1),
681a7c68 404
b7a5c9de 405 $octets = encode("iso-8859-1", $string);
7e19fb92 406
b7a5c9de 407B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
408B<may not be equal to> $string. Though they both contain the same data, the utf8 flag
7e19fb92 409for $octets is B<always> off. When you encode anything, utf8 flag of
410the result is always off, even when it contains completely valid utf8
411string. See L</"The UTF-8 flag"> below.
681a7c68 412
7f0d54d7 413If the $string is C<undef> then C<undef> is returned.
4089adc4 414
b7a5c9de 415=item $string = decode(ENCODING, $octets [, CHECK])
4411f3b6 416
0ab8f81e 417Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
418internal form and returns the resulting string. As in encode(),
419ENCODING can be either a canonical name or an alias. For encoding names
420and aliases, see L</"Defining Aliases">. For CHECK, see
47bfe92f 421L</"Handling Malformed Data">.
422
b7a5c9de 423For example, to convert ISO-8859-1 data to a string in Perl's internal format:
681a7c68 424
b7a5c9de 425 $string = decode("iso-8859-1", $octets);
681a7c68 426
b7a5c9de 427B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
428B<may not be equal to> $octets. Though they both contain the same data,
429the utf8 flag for $string is on unless $octets entirely consists of
7e19fb92 430ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
431below.
47bfe92f 432
7f0d54d7 433If the $octets is C<undef> then C<undef> is returned.
4089adc4 434
b7a5c9de 435=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
7e19fb92 436
b7a5c9de 437Converts B<in-place> data between two encodings. The data in $octets
438must be encoded as octets and not as characters in Perl's internal
f9d05ba3 439format. For example, to convert ISO-8859-1 data to Microsoft's CP1250
440encoding:
2b106fbe 441
b7a5c9de 442 from_to($octets, "iso-8859-1", "cp1250");
2b106fbe 443
444and to convert it back:
445
b7a5c9de 446 from_to($octets, "cp1250", "iso-8859-1");
4411f3b6 447
ab97ca19 448Note that because the conversion happens in place, the data to be
0ab8f81e 449converted cannot be a string constant; it must be a scalar variable.
ab97ca19 450
f9d05ba3 451from_to() returns the length of the converted string in octets on
452success, I<undef> on error.
3ef515df 453
b7a5c9de 454B<CAVEAT>: The following operations look the same but are not quite so;
7e19fb92 455
b7a5c9de 456 from_to($data, "iso-8859-1", "utf8"); #1
7e19fb92 457 $data = decode("iso-8859-1", $data); #2
4411f3b6 458
b7a5c9de 459Both #1 and #2 make $data consist of a completely valid UTF-8 string
7e19fb92 460but only #2 turns utf8 flag on. #1 is equivalent to
f2a2953c 461
7e19fb92 462 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 463
7e19fb92 464See L</"The UTF-8 flag"> below.
f2a2953c 465
466=item $octets = encode_utf8($string);
467
7e19fb92 468Equivalent to C<$octets = encode("utf8", $string);> The characters
b7a5c9de 469that comprise $string are encoded in Perl's internal format and the
470result is returned as a sequence of octets. All possible
7e19fb92 471characters have a UTF-8 representation so this function cannot fail.
472
f2a2953c 473
474=item $string = decode_utf8($octets [, CHECK]);
475
7e19fb92 476Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
b7a5c9de 477The sequence of octets represented by
7e19fb92 478$octets is decoded from UTF-8 into a sequence of logical
479characters. Not all sequences of octets form valid UTF-8 encodings, so
480it is possible for this call to fail. For CHECK, see
481L</"Handling Malformed Data">.
f2a2953c 482
483=back
484
51ef4e11 485=head2 Listing available encodings
486
5129552c 487 use Encode;
488 @list = Encode->encodings();
489
490Returns a list of the canonical names of the available encodings that
491are loaded. To get a list of all available encodings including the
492ones that are not loaded yet, say
493
494 @all_encodings = Encode->encodings(":all");
495
0ab8f81e 496Or you can give the name of a specific module.
5129552c 497
c731e18e 498 @with_jp = Encode->encodings("Encode::JP");
499
500When "::" is not in the name, "Encode::" is assumed.
51ef4e11 501
c731e18e 502 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 503
0ab8f81e 504To find out in detail which encodings are supported by this package,
5d030b67 505see L<Encode::Supported>.
51ef4e11 506
507=head2 Defining Aliases
508
0ab8f81e 509To add a new alias to a given encoding, use:
67d7b5ef 510
5129552c 511 use Encode;
512 use Encode::Alias;
a63c962f 513 define_alias(newName => ENCODING);
51ef4e11 514
3ef515df 515After that, newName can be used as an alias for ENCODING.
f2a2953c 516ENCODING may be either the name of an encoding or an
517I<encoding object>
51ef4e11 518
fcb875d4 519But before you do so, make sure the alias is nonexistent with
520C<resolve_alias()>, which returns the canonical name thereof.
521i.e.
522
523 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
524 Encode::resolve_alias("iso-8859-12") # false; nonexistent
525 Encode::resolve_alias($name) eq $name # true if $name is canonical
526
0ab8f81e 527resolve_alias() does not need C<use Encode::Alias>; it can be
528exported via C<use Encode qw(resolve_alias)>.
fcb875d4 529
0ab8f81e 530See L<Encode::Alias> for details.
51ef4e11 531
85982a32 532=head1 Encoding via PerlIO
4411f3b6 533
b7a5c9de 534If your perl supports I<PerlIO> (which is the default), you can use a PerlIO layer to decode
0ab8f81e 535and encode directly via a filehandle. The following two examples
536are totally identical in their functionality.
4411f3b6 537
85982a32 538 # via PerlIO
539 open my $in, "<:encoding(shiftjis)", $infile or die;
540 open my $out, ">:encoding(euc-jp)", $outfile or die;
b7a5c9de 541 while(<$in>){ print $out $_; }
8e86646e 542
85982a32 543 # via from_to
0ab8f81e 544 open my $in, "<", $infile or die;
545 open my $out, ">", $outfile or die;
b7a5c9de 546 while(<$in>){
0ab8f81e 547 from_to($_, "shiftjis", "euc-jp", 1);
b7a5c9de 548 print $out $_;
85982a32 549 }
4411f3b6 550
b7a5c9de 551Unfortunately, it may be that not all encodings are PerlIO-savvy. You can check
0ab8f81e 552if your encoding is supported by PerlIO by calling the C<perlio_ok>
553method.
554
555 Encode::perlio_ok("hz"); # False
556 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
557
558 use Encode qw(perlio_ok); # exported upon request
559 perlio_ok("euc-jp")
4411f3b6 560
0ab8f81e 561Fortunately, all encodings that come with Encode core are PerlIO-savvy
f9d05ba3 562except for hz and ISO-2022-kr. For gory details, see
563L<Encode::Encoding> and L<Encode::PerlIO>.
4411f3b6 564
85982a32 565=head1 Handling Malformed Data
4411f3b6 566
8e180e82 567The optional I<CHECK> argument tells Encode what to do when it
568encounters malformed data. Without CHECK, Encode::FB_DEFAULT ( == 0 )
569is assumed.
570
571As of version 2.12 Encode supports coderef values for CHECK. See below.
f9d05ba3 572
573=over 2
574
3c4b39be 575=item B<NOTE:> Not all encodings support this feature
f9d05ba3 576
577Some encodings ignore I<CHECK> argument. For example,
578L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
579
580=back
581
582Now here is the list of I<CHECK> values available
47bfe92f 583
151b5d36 584=over 2
585
85982a32 586=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 587
f9d05ba3 588If I<CHECK> is 0, (en|de)code will put a I<substitution character> in
78589665 589place of a malformed character. When you encode, E<lt>subcharE<gt>
590will be used. When you decode the code point C<0xFFFD> is used. If
591the data is supposed to be UTF-8, an optional lexical warning
592(category utf8) is given.
e9692b5b 593
7e19fb92 594=item I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 595
b7a5c9de 596If I<CHECK> is 1, methods will die on error immediately with an error
0ab8f81e 597message. Therefore, when I<CHECK> is set to 1, you should trap the
f9d05ba3 598error with eval{} unless you really want to let it die.
47bfe92f 599
85982a32 600=item I<CHECK> = Encode::FB_QUIET
47bfe92f 601
85982a32 602If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
f9d05ba3 603return the portion of the data that has been processed so far when an
604error occurs. The data argument will be overwritten with everything
605after that point (that is, the unprocessed part of data). This is
606handy when you have to call decode repeatedly in the case where your
607source data may contain partial multi-byte character sequences,
608(i.e. you are reading with a fixed-width buffer). Here is a sample
609code that does exactly this:
4411f3b6 610
78589665 611 my $buffer = ''; my $string = '';
612 while(read $fh, $buffer, 256, length($buffer)){
613 $string .= decode($encoding, $buffer, Encode::FB_QUIET);
614 # $buffer now contains the unprocessed partial character
85982a32 615 }
1768d7eb 616
85982a32 617=item I<CHECK> = Encode::FB_WARN
67d7b5ef 618
0ab8f81e 619This is the same as above, except that it warns on error. Handy when
620you are debugging the mode above.
85982a32 621
622=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
623
af1f55d9 624=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
625
626=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
627
85982a32 628For encodings that are implemented by Encode::XS, CHECK ==
629Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
630
b7a5c9de 631When you decode, C<\xI<HH>> will be inserted for a malformed character,
632where I<HH> is the hex representation of the octet that could not be
633decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted,
634where I<HHHH> is the Unicode ID of the character that cannot be found
0ab8f81e 635in the character repertoire of the encoding.
85982a32 636
af1f55d9 637HTML/XML character reference modes are about the same, in place of
78589665 638C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and
639XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
af1f55d9 640
7f0d54d7 641In Encode 2.10 or later, C<LEAVE_SRC> is also implied.
642
85982a32 643=item The bitmask
644
0ab8f81e 645These modes are actually set via a bitmask. Here is how the FB_XX
646constants are laid out. You can import the FB_XX constants via
647C<use Encode qw(:fallbacks)>; you can import the generic bitmask
648constants via C<use Encode qw(:fallback_all)>.
85982a32 649
b0b300a3 650 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
651 DIE_ON_ERR 0x0001 X
4089adc4 652 WARN_ON_ERR 0x0002 X
b0b300a3 653 RETURN_ON_ERR 0x0004 X X
7f0d54d7 654 LEAVE_SRC 0x0008 X
b0b300a3 655 PERLQQ 0x0100 X
b7a5c9de 656 HTMLCREF 0x0200
657 XMLCREF 0x0400
67d7b5ef 658
151b5d36 659=back
660
8e180e82 661=head2 coderef for CHECK
662
663As of Encode 2.12 CHECK can also be a code reference which takes the
664ord value of the unmapped character as an argument and returns a string
665that represents the fallback character. For instance,
67d7b5ef 666
8e180e82 667 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
67d7b5ef 668
8e180e82 669Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of
670\x{I<XXXX>}.
982a4085 671
67d7b5ef 672=head1 Defining Encodings
673
674To define a new encoding, use:
675
b7a5c9de 676 use Encode qw(define_encoding);
67d7b5ef 677 define_encoding($object, 'canonicalName' [, alias...]);
678
679I<canonicalName> will be associated with I<$object>. The object
0ab8f81e 680should provide the interface described in L<Encode::Encoding>.
67d7b5ef 681If more than two arguments are provided then additional
b7a5c9de 682arguments are taken as aliases for I<$object>.
67d7b5ef 683
f2a2953c 684See L<Encode::Encoding> for more details.
685
7e19fb92 686=head1 The UTF-8 flag
687
688Before the introduction of utf8 support in perl, the C<eq> operator
b7a5c9de 689just compared the strings represented by two scalars. Beginning with
690perl 5.8, C<eq> compares two strings with simultaneous consideration
691of I<the utf8 flag>. To explain why we made it so, I will quote page
692402 of C<Programming Perl, 3rd ed.>
7e19fb92 693
694=over 2
695
696=item Goal #1:
697
698Old byte-oriented programs should not spontaneously break on the old
699byte-oriented data they used to work on.
700
701=item Goal #2:
702
703Old byte-oriented programs should magically start working on the new
704character-oriented data when appropriate.
705
706=item Goal #3:
707
708Programs should run just as fast in the new character-oriented mode
709as in the old byte-oriented mode.
710
711=item Goal #4:
712
713Perl should remain one language, rather than forking into a
714byte-oriented Perl and a character-oriented Perl.
715
716=back
717
718Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
719was born and many features documented in the book remained
b7a5c9de 720unimplemented for a long time. Perl 5.8 corrected this and the introduction
721of the UTF-8 flag is one of them. You can think of this perl notion as of a
722byte-oriented mode (utf8 flag off) and a character-oriented mode (utf8
7e19fb92 723flag on).
724
725Here is how Encode takes care of the utf8 flag.
726
4bdf5738 727=over 2
7e19fb92 728
729=item *
730
731When you encode, the resulting utf8 flag is always off.
732
151b5d36 733=item *
7e19fb92 734
b7a5c9de 735When you decode, the resulting utf8 flag is on unless you can
7e19fb92 736unambiguously represent data. Here is the definition of
737dis-ambiguity.
738
b7a5c9de 739After C<$utf8 = decode('foo', $octet);>,
7e19fb92 740
741 When $octet is... The utf8 flag in $utf8 is
742 ---------------------------------------------
743 In ASCII only (or EBCDIC only) OFF
744 In ISO-8859-1 ON
745 In any other Encoding ON
746 ---------------------------------------------
747
3c4b39be 748As you see, there is one exception, In ASCII. That way you can assume
7e19fb92 749Goal #1. And with Encode Goal #2 is assumed but you still have to be
750careful in such cases mentioned in B<CAVEAT> paragraphs.
751
752This utf8 flag is not visible in perl scripts, exactly for the same
753reason you cannot (or you I<don't have to>) see if a scalar contains a
754string, integer, or floating point number. But you can still peek
755and poke these if you will. See the section below.
756
757=back
758
759=head2 Messing with Perl's Internals
4411f3b6 760
47bfe92f 761The following API uses parts of Perl's internals in the current
0ab8f81e 762implementation. As such, they are efficient but may change.
4411f3b6 763
7e19fb92 764=over 2
4411f3b6 765
a63c962f 766=item is_utf8(STRING [, CHECK])
4411f3b6 767
0ab8f81e 768[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
47bfe92f 769If CHECK is true, also checks the data in STRING for being well-formed
770UTF-8. Returns true if successful, false otherwise.
4411f3b6 771
2c246b25 772As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
b5ab1f6f 773
a63c962f 774=item _utf8_on(STRING)
4411f3b6 775
0ab8f81e 776[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
4411f3b6 777B<not> checked for being well-formed UTF-8. Do not use unless you
778B<know> that the STRING is well-formed UTF-8. Returns the previous
0ab8f81e 779state of the UTF-8 flag (so please don't treat the return value as
780indicating success or failure), or C<undef> if STRING is not a string.
4411f3b6 781
a63c962f 782=item _utf8_off(STRING)
4411f3b6 783
0ab8f81e 784[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
785Returns the previous state of the UTF-8 flag (so please don't treat the
786return value as indicating success or failure), or C<undef> if STRING is
4411f3b6 787not a string.
788
789=back
790
7f0d54d7 791=head1 UTF-8 vs. utf8
792
793 ....We now view strings not as sequences of bytes, but as sequences
794 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
795 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
796
797That has been the perl's notion of UTF-8 but official UTF-8 is more
798strict; its range is much narrower (0 .. 10FFFF), and some sequences are
799not allowed (e.g. those used in surrogate pairs, 0xFFFE, et al).
800
801Now that is overruled by Larry Wall himself.
802
803 From: Larry Wall <larry@wall.org>
804 Date: December 04, 2004 11:51:58 JST
805 To: perl-unicode@perl.org
806 Subject: Re: Make Encode.pm support the real UTF-8
807 Message-Id: <20041204025158.GA28754@wall.org>
808
809 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
810 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
811 : but "UTF-8" is the name of the standard and should give the
812 : corresponding behaviour.
813
814 For what it's worth, that's how I've always kept them straight in my
815 head.
8e180e82 816
7f0d54d7 817 Also for what it's worth, Perl 6 will mostly default to strict but
818 make it easy to switch back to lax.
819
820 Larry
821
822Do you copy? As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8
823while B<utf8> means liberal, lax, version thereof. And Encode version
8242.10 or later thus groks the difference between C<UTF-8> and C<utf8>.
825
826 encode("utf8", "\x{FFFF_FFFF}", 1); # okay
827 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
828
829C<UTF-8> in Encode is actually an alias for the canonical name C<utf-8-strict>.
830Yes, the hyphen between "UTF" and "8" is important. Without it Encode
831goes "liberal"
832
833 find_encoding("UTF-8")->name # is 'utf-8-strict'
834 find_encoding("utf-8")->name # ditto. names are case insensitive
50c1ac04 835 find_encoding("utf_8")->name # ditto. "_" are treated as "-"
7f0d54d7 836 find_encoding("UTF8")->name # is 'utf8'.
837
838
4411f3b6 839=head1 SEE ALSO
840
5d030b67 841L<Encode::Encoding>,
842L<Encode::Supported>,
6d1c0808 843L<Encode::PerlIO>,
5d030b67 844L<encoding>,
6d1c0808 845L<perlebcdic>,
846L<perlfunc/open>,
847L<perlunicode>,
848L<utf8>,
5d030b67 849the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 850
85982a32 851=head1 MAINTAINER
aae85ceb 852
853This project was originated by Nick Ing-Simmons and later maintained
7e19fb92 854by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
855list of people involved. For any questions, use
b7a5c9de 856E<lt>perl-unicode@perl.orgE<gt> so we can all share.
aae85ceb 857
d1256cb1 858While Dan Kogai retains the copyright as a maintainer, the credit
859should go to all those involved. See AUTHORS for those who submitted
860code.
861
862=head1 COPYRIGHT
863
864Copyright 2002-2006 Dan Kogai E<lt>dankogai@dan.co.jpE<gt>
865
866This library is free software; you can redistribute it and/or modify
867it under the same terms as Perl itself.
868
4411f3b6 869=cut