[p5sagit/p5-mst-13.2.git] / ext / Encode / Unicode / Unicode.pm

package Encode::Unicode;

use strict;
use warnings;

our $VERSION = do { my @r = (q$Revision: 1.34 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

use XSLoader;
XSLoader::load(__PACKAGE__,$VERSION);

#
# Object generator: create all 8 transcoders at once!
#

require Encode;
# Build and register one transcoder object per supported encoding name.
# Each object records:
#   Name   - the encoding name, also used as the key in %Encode::Encoding
#   size   - bytes per code unit (2 for UCS-2 and UTF-16, 4 for UTF-32)
#   endian - pack/unpack template letter: 'n'/'v' for 16-bit BE/LE,
#            'N'/'V' for 32-bit BE/LE, or '' when the name carries no
#            BE/LE suffix
#   ucs2   - true only for the UCS-2 names
for my $name (qw(UTF-16 UTF-16BE UTF-16LE
                 UTF-32 UTF-32BE UTF-32LE
                        UCS-2BE  UCS-2LE))
{
    # Split e.g. "UTF-16BE" into family "UTF", width "16", byte order "BE".
    # Capture into lexicals and fail loudly rather than reading stale
    # $1..$3 if a malformed name is ever added to the list above.
    my ($family, $bits, $order) = $name =~ /^(\w+)-(\d+)(\w*)$/
        or die __PACKAGE__, ": malformed encoding name '$name'";

    my $ucs2 = ($family eq 'UCS');
    # The number in a UCS name counts octets, in a UTF name it counts bits.
    my $size = $ucs2 ? 2 : $bits / 8;

    my $endian = ($order eq 'BE') ? 'n'
               : ($order eq 'LE') ? 'v'
               :                    '';
    # 32-bit code units use the upper-case pack letters.
    $endian = uc($endian) if $size == 4;

    $Encode::Encoding{$name} =
        bless {
               Name   =>   $name,
               size   =>   $size,
               endian => $endian,
               ucs2   =>   $ucs2,
              } => __PACKAGE__;

}

# Accessor: returns the value stored under this object's 'Name' key.
sub name {
    my ($self) = @_;
    return $self->{'Name'};
}
# Returns the object to use for a new sequence: the object itself when
# its endian field is set, otherwise a shallow copy blessed into the
# same class.
# NOTE(review): the copy is presumably there so that per-stream state of
# a BOM-driven encoding is not shared between streams -- confirm.
sub new_sequence
{
    my ($self) = @_;
    return $self->{endian}
        ? $self
        : bless({ %$self }, ref($self));
}

# Always returns 0 (false).
sub needs_lines { return 0 }

# Returns 1 when PerlIO/encoding.pm has an entry in %INC (i.e. it has
# been loaded), 0 otherwise.
sub perlio_ok {
    return exists $INC{"PerlIO/encoding.pm"} ? 1 : 0;
}


#
# Three implementations of (en|de)code exist.  The XS version is the
# fastest.  *_modern uses an array and *_classic sticks with substr.
# *_classic is much slower but more memory conservative.
# *_xs is the default.

# Select which implementation backs decode() and encode().
#
#   $type - one of "xs", "modern" or "classic"
#
# Points the package's decode/encode subs at the matching decode_*/
# encode_* pair.  Croaks with a usage message for any other $type.
sub set_transcoder{
    no warnings qw(redefine);
    my ($type) = @_;

    # Reject unknown types up front so the selection below only has to
    # distinguish the three valid ones.
    unless ($type eq "xs" or $type eq "modern" or $type eq "classic"){
        require Carp;
        Carp::croak(__PACKAGE__, "::set_transcoder(modern|classic|xs)");
    }

    my ($decoder, $encoder) =
        $type eq "xs"     ? (\&decode_xs,      \&encode_xs)
      : $type eq "modern" ? (\&decode_modern,  \&encode_modern)
      :                     (\&decode_classic, \&encode_classic);

    *decode = $decoder;
    *encode = $encoder;
}

# The XS implementation is the default.
set_transcoder("xs");

#
# Aux. subs & constants
#

# Code point substituted for input that cannot be represented.
sub FBCHAR  () { 0xFFFD }
# Byte order mark as it reads in big-endian order.
sub BOM_BE  () { 0xFEFF }
# What a little-endian 16-bit BOM looks like when read as big-endian.
sub BOM16LE () { 0xFFFE }
# What a little-endian 32-bit BOM looks like when read as big-endian.
sub BOM32LE () { 0xFFFE0000 }

# True when the argument lies in 0..0xFFFF but outside the surrogate
# range 0xD800..0xDFFF, i.e. it fits a single 16-bit code unit as is.
sub valid_ucs2($){
    my ($cp) = @_;
    return 0 <= $cp
        && $cp <= 0xFFFF
        && !(0xD800 <= $cp && $cp <= 0xDFFF);
}

# True for any surrogate code unit (0xD800..0xDFFF).
sub issurrogate($){
    my ($unit) = @_;
    return $unit >= 0xD800 && $unit <= 0xDFFF;
}

# True for a high (leading) surrogate (0xD800..0xDBFF).
sub isHiSurrogate($){
    my ($unit) = @_;
    return $unit >= 0xD800 && $unit < 0xDC00;
}

# True for a low (trailing) surrogate (0xDC00..0xDFFF).
sub isLoSurrogate($){
    my ($unit) = @_;
    return $unit >= 0xDC00 && $unit <= 0xDFFF;
}

# Split a code point above 0xFFFF into its (high, low) surrogate pair.
# Returns the two 16-bit code units as a list.
sub ensurrogate($){
    use integer; # the division below must truncate
    my ($cp) = @_;
    my $offset = $cp - 0x10000;
    return (
        $offset / 0x400 + 0xD800,
        $offset % 0x400 + 0xDC00,
    );
}

# Combine a (high, low) surrogate pair back into one code point.
sub desurrogate($$){
    my $upper = ($_[0] - 0xD800) * 0x400;
    my $lower =  $_[1] - 0xDC00;
    return 0x10000 + $upper + $lower;
}

# Returns a fresh hash ref mapping code-unit size in bytes to the bit
# mask covering one code unit of that size.
sub Mask {
    return {
        2 => 0xffff,
        4 => 0xffffffff,
    };
}

#
# *_modern are much faster but guzzle more memory
#

# Decode UCS-2/UTF-16/UTF-32 octets into a perl character string by
# unpacking every code unit into an array first.
#
#   $obj - transcoder object; its size, endian and ucs2 fields are read
#   $str - octets to decode
#   $chk - when true, a surrogate met in UCS-2 input is fatal instead of
#          being replaced with FBCHAR
#
# Returns the decoded string after utf8::upgrade().  Croaks (through
# poisoned2death) when no byte order can be established or when
# non-UCS-2 16-bit input holds a malformed surrogate pair.
sub decode_modern($$;$)
{
    my ($obj, $str, $chk) = @_;
    my $size   = $obj->{size};
    my $endian = $obj->{endian};
    my $ucs2   = $obj->{ucs2};

    # No byte order on the object: strip the first code unit off the
    # input and let BOMB() interpret it as a byte order mark.
    unless ($endian){
        $endian = BOMB($size, substr($str, 0, $size, ''));
    }
    $endian or poisoned2death($obj, "Where's the BOM?");

    my $mask  = Mask()->{$size};
    my @units = unpack("$endian*", $str);
    undef $str; # to conserve memory

    my $utf8 = '';
    while (@units){
        my $cp = shift @units;
        # 32-bit units are taken as they are; only 16-bit units are
        # masked and checked for surrogates.
        if ($size != 4){
            $cp &= $mask;
            if (!valid_ucs2($cp)){
                if ($ucs2){
                    poisoned2death($obj, "no surrogates allowed", $cp)
                        if $chk;
                    shift @units; # skip the next one as well
                    $cp = FBCHAR;
                }
                else{
                    isHiSurrogate($cp)
                        or poisoned2death($obj, "Malformed HI surrogate", $cp);
                    my $lo = shift @units;
                    $lo &= $mask;
                    isLoSurrogate($lo)
                        or poisoned2death($obj, "Malformed LO surrogate", $cp, $lo);
                    $cp = desurrogate($cp, $lo);
                }
            }
        }
        $utf8 .= chr($cp);
    }
    utf8::upgrade($utf8);
    return $utf8;
}

# Encode a perl character string as UCS-2/UTF-16/UTF-32 octets by
# collecting every output code unit in an array and packing it once.
#
#   $obj  - transcoder object; its size, endian and ucs2 fields are read
#   $utf8 - character string to encode
#   $chk  - when true, a code point above 0xFFFF is fatal for UCS-2
#           instead of being replaced with FBCHAR
#
# When the object has no byte order, big-endian is used and a BOM is
# emitted first.  Returns the packed octets.
sub encode_modern($$;$)
{
    my ($obj, $utf8, $chk) = @_;
    my $size   = $obj->{size};
    my $endian = $obj->{endian};
    my $ucs2   = $obj->{ucs2};

    my @units;
    if (!$endian){
        $endian = ($size == 4) ? 'N' : 'n';
        push @units, BOM_BE;
    }

    my @cps = unpack("U*", $utf8);
    undef $utf8; # to conserve memory

    for my $cp (@cps){
        # Fits a code unit unchanged.
        if ($size == 4 or valid_ucs2($cp)){
            push @units, $cp;
            next;
        }
        # A surrogate code point in the input -- not supposed to happen.
        if (issurrogate($cp)){
            push @units, FBCHAR;
            next;
        }
        # Above 0xFFFF: UCS-2 cannot hold it, the others use a pair.
        if ($ucs2){
            poisoned2death($obj, "code point too high", $cp) if $chk;
            push @units, FBCHAR;
            next;
        }
        push @units, ensurrogate($cp);
    }
    return pack("$endian*", @units);
}

#
# *_classic are slower but more memory conservative
#

# Decode UCS-2/UTF-16/UTF-32 octets into a perl character string,
# consuming the input one code unit at a time with substr() so that no
# array of code units is ever built.
#
#   $obj - transcoder object; its size, endian and ucs2 fields are read
#   $str - octets to decode
#   $chk - when true, a surrogate met in UCS-2 input is fatal instead of
#          being replaced with FBCHAR
#
# Returns the decoded string after utf8::upgrade().  Croaks (through
# poisoned2death) when no byte order can be established or when
# non-UCS-2 16-bit input holds a malformed surrogate pair.
sub decode_classic($$;$)
{
    my ($obj, $str, $chk ) = @_;
    my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};

    # No byte order on the object: strip the first code unit off the
    # input and let BOMB() interpret it as a byte order mark.
    $endian ||= BOMB($size, substr($str, 0, $size, ''))
        or poisoned2death($obj, "Where's the BOM?");
    my $mask = Mask->{$size};
    my $utf8 = '';
    # Deliberately no unpack("$endian*", $str) here: materialising every
    # code unit in an array is exactly what this variant exists to avoid.
    while (length($str)){
        my $ord = unpack($endian, substr($str, 0, $size, ''));
        # 32-bit units are taken as they are; only 16-bit units are
        # masked and checked for surrogates.
        unless ($size == 4 or valid_ucs2($ord &= $mask)){
            if ($ucs2){
                $chk and
                    poisoned2death($obj, "no surrogates allowed", $ord);
                substr($str,0,$size,''); # skip the next one as well
                $ord = FBCHAR;
            }else{
                unless (isHiSurrogate($ord)){
                    poisoned2death($obj, "Malformed HI surrogate", $ord);
                }
                my $lo = unpack($endian ,substr($str,0,$size,''));
                unless (isLoSurrogate($lo &= $mask)){
                    poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
                }
                $ord = desurrogate($ord, $lo);
            }
        }
        $utf8 .= chr($ord);
    }
    utf8::upgrade($utf8);
    return $utf8;
}

# Encode a perl character string as UCS-2/UTF-16/UTF-32 octets,
# consuming the input one character at a time and appending the packed
# code units to the result as it goes.
#
#   $obj  - transcoder object; its size, endian and ucs2 fields are read
#   $utf8 - character string to encode
#   $chk  - when true, a code point above 0xFFFF is fatal for UCS-2
#           instead of being replaced with FBCHAR
#
# When the object has no byte order, big-endian is used and a BOM is
# emitted first.  Returns the encoded octets.
sub encode_classic($$;$)
{
    my ($obj, $utf8, $chk) = @_;
    my $size   = $obj->{size};
    my $endian = $obj->{endian};
    my $ucs2   = $obj->{ucs2};

    my $out = '';
    if (!$endian){
        $endian = ($size == 4) ? 'N' : 'n';
        $out .= pack($endian, BOM_BE);
    }

    while (length($utf8)){
        my $cp = ord(substr($utf8, 0, 1, ''));
        # Fits a code unit unchanged.
        if ($size == 4 or valid_ucs2($cp)){
            $out .= pack($endian, $cp);
            next;
        }
        # A surrogate code point in the input -- not supposed to happen.
        if (issurrogate($cp)){
            $out .= pack($endian, FBCHAR);
            next;
        }
        # Above 0xFFFF: UCS-2 cannot hold it, the others use a pair.
        if ($ucs2){
            poisoned2death($obj, "code point too high", $cp) if $chk;
            $out .= pack($endian, FBCHAR);
            next;
        }
        $out .= pack("${endian}2", ensurrogate($cp));
    }
    return $out;
}

# Work out the byte order announced by a byte order mark.
#
#   $size - code unit size in bytes (2 or 4)
#   $bom  - the first $size octets of the input
#
# Returns the pack/unpack template letter for the detected byte order
# ('n'/'v' for 16-bit BE/LE, 'N'/'V' for 32-bit BE/LE), or undef when
# $bom is not a byte order mark for this code unit size.
sub BOMB {
    my ($size, $bom) = @_;
    my $N = $size == 2 ? 'n' : 'N';

    # Input too short to hold a BOM: report "no BOM" instead of letting
    # unpack hand back undef and comparing against it.
    return undef unless defined $bom && length($bom) >= $size;

    # Read the mark as big-endian; a little-endian mark then shows up
    # byte-swapped.
    my $ord = unpack($N, $bom);
    return $N if $ord == BOM_BE;

    # Only accept the swapped mark that belongs to this unit size.  In
    # particular the octets 00 00 FF FE at the start of a 32-bit stream
    # must not be taken for a 16-bit little-endian BOM.
    if ($size == 2){
        return 'v' if $ord == BOM16LE;
    }
    else{
        return 'V' if $ord == BOM32LE;
    }
    return undef; # explicit undef keeps the original's return shape
}

# Croak with a uniform message about undecodable or unencodable input.
#
#   $obj  - transcoder object; its name() goes first in the message
#   $msg  - description of the problem
#   @ords - offending code units; shown in hex between <...>, with
#           "(missing)" standing in for an undefined one
#
# The message also names the file and line that called this sub.
# Never returns.
sub poisoned2death{
    my ($obj, $msg, @ords) = @_;
    my $pair = join(", ",
        map { defined $_ ? sprintf("\\x%x", $_) : "(missing)" } @ords);
    # Take file and line separately: a bare list-context caller spliced
    # into the message runs package, file and line together.
    my (undef, $file, $line) = caller;
    require Carp;
    # Parenthesised so this parses as a plain function call even when
    # Carp has not been loaded at compile time.
    Carp::croak($obj->name, ":", $msg, "<$pair>.",
                " (called from $file line $line)");
}

1;
__END__

=head1 NAME

Encode::Unicode -- Various Unicode Transformation Formats

=cut

=head1 SYNOPSIS

    use Encode qw/encode decode/; 
    $ucs2 = encode("UCS-2BE", $utf8);
    $utf8 = decode("UCS-2BE", $ucs2);

=head1 ABSTRACT

This module implements all Character Encoding Schemes of Unicode that
are officially documented by the Unicode Consortium (except, of course,
for UTF-8, which is a native format in perl).

=over 4

=item L<http://www.unicode.org/glossary/> says:

I<Character Encoding Scheme> A character encoding form plus byte
serialization. There are seven character encoding schemes in Unicode:
UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.

=item Quick Reference

                Decodes from ord(N)           Encodes chr(N) to...
       octet/char BOM S.P d800-dfff  ord > 0xffff     \x{1abcd} ==
  ---------------+-----------------+------------------------------
  UCS-2BE       2   N   N  is bogus                  Not Available
  UCS-2LE       2   N   N     bogus                  Not Available
  UTF-16      2/4   Y   Y  is   S.P           S.P            BE/LE
  UTF-16BE    2/4   N   Y       S.P           S.P    0xd82a,0xdfcd
  UTF-16LE    2/4   N   Y       S.P           S.P    0x2ad8,0xcddf
  UTF-32        4   Y   -  is bogus         As is            BE/LE
  UTF-32BE      4   N   -     bogus         As is       0x0001abcd
  UTF-32LE      4   N   -     bogus         As is       0xcdab0100
  UTF-8       1-4   -   -     bogus   >= 4 octets \xf0\x9a\xaf\x8d
  ---------------+-----------------+------------------------------

=back

=head1 Size, Endianness, and BOM

You can categorize these CES by 3 criteria:  size of each character,
endianness, and Byte Order Mark.

=head2 by size

UCS-2 is a fixed-length encoding with each character taking 16 bits.
It B<does not> support I<surrogate pairs>.  When a surrogate pair
is encountered during decode(), its place is filled with \x{FFFD}
if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1.  When a
character whose ord value is larger than 0xFFFF is encountered,
its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
croaks if I<CHECK> is 1.

UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
form a character.  Bogus surrogates result in death.  When \x{10000}
or above is encountered during encode(), it C<ensurrogate>s them and
pushes the surrogate pair to the output stream.

UTF-32 is a fixed-length encoding with each character taking 32 bits.
Since it is 32-bit, there is no need for I<surrogate pairs>.

=head2 by endianness

The first (and now failed) goal of Unicode was to map all character
repertoires into a fixed-length integer so that programmers are happy.
Since each character is either a I<short> or I<long> in C, you have to
pay attention to the endianness of each platform when you pass data
to one another.

Anything marked as BE is Big Endian (or network byte order) and LE is
Little Endian (aka VAX byte order).  For anything not marked either
BE or LE, a character called Byte Order Mark (BOM) indicating the
endianness is prepended to the string.

=over 4

=item BOM as integer when fetched in network byte order

              16         32 bits/char
  -------------------------
  BE      0xFeFF 0x0000FeFF
  LE      0xFFFe 0xFFFe0000
  -------------------------

=back
 
This module handles the BOM as follows.

=over 4

=item *

When BE or LE is explicitly stated as the name of encoding, BOM is
simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).

=item *

When BE or LE is omitted during decode(), it checks if BOM is at the
beginning of the string; if one is found, the endianness is set to
what the BOM says.  If no BOM is found, the routine dies.

=item *

When BE or LE is omitted during encode(), it returns a BE-encoded
string with BOM prepended.  So when you want to encode a whole text
file, make sure you encode() the whole text at once, not line by line;
otherwise each line, not just the file, will have a BOM prepended.

=item *

C<UCS-2> is an exception.  Unlike others, this is an alias of UCS-2BE.
UCS-2 is already registered by IANA and others that way.

=back

=head1 Surrogate Pairs

To say the least, surrogate pairs were the biggest mistake of the
Unicode Consortium.  But according to the late Douglas Adams in I<The
Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
Universe was created. This has made a lot of people very angry and
been widely regarded as a bad move>.  Their mistake was not of this
magnitude so let's forgive them.

(I don't dare make any comparison with Unicode Consortium and the
Vogons here ;)  Or, comparing Encode to Babel Fish is completely
appropriate -- if you can only stick this into your ear :)

Surrogate pairs were born when the Unicode Consortium finally
admitted that 16 bits were not big enough to hold all the world's
character repertoires.  But they already made UCS-2 16-bit.  What
do we do?

Back then, the range 0xD800-0xDFFF was not allocated.  Let's split
that range in half and use the first half to represent the C<upper
half of a character> and the second half to represent the C<lower
half of a character>.  That way, you can represent 1024 * 1024 =
1048576 more characters.  Now we can store character ranges up to
\x{10ffff} even with 16-bit encodings.  This pair of half-character is
now called a I<surrogate pair> and UTF-16 is the name of the encoding
that embraces them.

Here is a formula to ensurrogate a Unicode character \x{10000} and
above;

  $hi = ($uni - 0x10000) / 0x400 + 0xD800;
  $lo = ($uni - 0x10000) % 0x400 + 0xDC00;

And to desurrogate;

 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);

Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
perl does not prohibit the use of characters within this range.  To perl, 
every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.

  (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
  integer support!

=head1 SEE ALSO

L<Encode>, L<http://www.unicode.org/glossary/>,

RFC 2781 L<http://rfc.net/rfc2781.html>,

L<http://www.unicode.org/unicode/faq/utf_bom.html>

Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
by Larry Wall, Tom Christiansen, Jon Orwant; 
O'Reilly & Associates; ISBN 0-596-00027-8

=cut
Commit	Line	Data
f2a2953c	1	package Encode::Unicode;
f2a2953c	2
df1df145	3	use strict;
f2a2953c	4	use warnings;
f2a2953c	5
0ab8f81e	6	our $VERSION = do { my @r = (q$Revision: 1.34 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
f2a2953c	7
85982a32	8	use XSLoader;
85982a32	9	XSLoader::load(__PACKAGE__,$VERSION);
df1df145	10
f2a2953c	11	#
	12	# Object Generator 8 transcoders all at once!
	13	#
df1df145	14
f2a2953c	15	require Encode;
	16	for my $name (qw(UTF-16 UTF-16BE UTF-16LE
	17	UTF-32 UTF-32BE UTF-32LE
	18	UCS-2BE UCS-2LE))
df1df145	19	{
f2a2953c	20	my ($size, $endian, $ucs2, $mask);
	21	$name =~ /^(\w+)-(\d+)(\w*)$/o;
	22	if ($ucs2 = ($1 eq 'UCS')){
	23	$size = 2;
	24	}else{
	25	$size = $2/8;
df1df145	26	}
f2a2953c	27	$endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
	28	$size == 4 and $endian = uc($endian);
	29
	30	$Encode::Encoding{$name} =
	31	bless {
	32	Name => $name,
	33	size => $size,
	34	endian => $endian,
	35	ucs2 => $ucs2,
c731e18e	36	} => __PACKAGE__;
f2a2953c	37
df1df145	38	}
df1df145	39
f2a2953c	40	sub name { shift->{'Name'} }
aae85ceb	41	sub new_sequence
	42	{
	43	my $self = shift;
	44	# Return the original if endian known
	45	return $self if ($self->{endian});
	46	# Return a clone
	47	return bless {%$self},ref($self);
	48	}
	49
0ab8f81e	50	sub needs_lines { 0 };
	51
	52	sub perlio_ok {
	53	exists $INC{"PerlIO/encoding.pm"} or return 0;
	54	return 1;
	55	}
	56
f2a2953c	57
f2a2953c	58	#
0ab8f81e	59	# three implementations of (en\|de)code exist. The XS version is the
	60	# fastest. _modern uses an array and _classic sticks with substr.
	61	# *_classic is much slower but more memory conservative.
	62	# *_xs is the default.
f2a2953c	63
	64	sub set_transcoder{
	65	no warnings qw(redefine);
	66	my $type = shift;
aae85ceb	67	if ($type eq "xs"){
	68	*decode = \&decode_xs;
	69	*encode = \&encode_xs;
	70	}elsif($type eq "modern"){
f2a2953c	71	*decode = \&decode_modern;
	72	*encode = \&encode_modern;
	73	}elsif($type eq "classic"){
	74	*decode = \&decode_classic;
	75	*encode = \&encode_classic;
	76	}else{
fcb875d4	77	require Carp;
aae85ceb	78	Carp::croak __PACKAGE__, "::set_transcoder(modern\|classic\|xs)";
f2a2953c	79	}
	80	}
	81
aae85ceb	82	set_transcoder("xs");
f2a2953c	83
f2a2953c	84	#
85982a32	85	# Aux. subs & constants
	86	#
	87
	88	sub FBCHAR(){ 0xFFFd }
	89	sub BOM_BE(){ 0xFeFF }
	90	sub BOM16LE(){ 0xFFFe }
	91	sub BOM32LE(){ 0xFFFe0000 }
	92
	93	sub valid_ucs2($){
	94	return
	95	(0 <= $_[0] && $_[0] < 0xD800)
	96	\|\| ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
	97	}
	98
	99	sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
	100	sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
	101	sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
	102
	103	sub ensurrogate($){
	104	use integer; # we have divisions
	105	my $uni = shift;
	106	my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
	107	my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
	108	return ($hi, $lo);
	109	}
	110
	111	sub desurrogate($$){
	112	my ($hi, $lo) = @_;
	113	return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
	114	}
	115
	116	sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
	117
	118	#
f2a2953c	119	# *_modern are much faster but guzzle more memory
	120	#
	121
aae85ceb	122	sub decode_modern($$;$)
df1df145	123	{
f2a2953c	124	my ($obj, $str, $chk ) = @_;
	125	my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
	126
	127	# warn "$size, $endian, $ucs2";
	128	$endian \|\|= BOMB($size, substr($str, 0, $size, ''))
	129	or poisoned2death($obj, "Where's the BOM?");
	130	my $mask = Mask->{$size};
	131	my $utf8 = '';
	132	my @ord = unpack("$endian*", $str);
	133	undef $str; # to conserve memory
	134	while (@ord){
	135	my $ord = shift @ord;
	136	unless ($size == 4 or valid_ucs2($ord &= $mask)){
	137	if ($ucs2){
fcb875d4	138	$chk and
f2a2953c	139	poisoned2death($obj, "no surrogates allowed", $ord);
	140	shift @ord; # skip the next one as well
	141	$ord = FBCHAR;
	142	}else{
	143	unless (isHiSurrogate($ord)){
	144	poisoned2death($obj, "Malformed HI surrogate", $ord);
	145	}
	146	my $lo = shift @ord;
	147	unless (isLoSurrogate($lo &= $mask)){
	148	poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
	149	}
	150	$ord = desurrogate($ord, $lo);
	151	}
	152	}
	153	$utf8 .= chr($ord);
df1df145	154	}
f2a2953c	155	utf8::upgrade($utf8);
	156	return $utf8;
	157	}
	158
aae85ceb	159	sub encode_modern($$;$)
f2a2953c	160	{
	161	my ($obj, $utf8, $chk) = @_;
	162	my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
	163	my @str = ();
	164	unless ($endian){
	165	$endian = ($size == 4) ? 'N' : 'n';
	166	push @str, BOM_BE;
	167	}
	168	my @ord = unpack("U*", $utf8);
	169	undef $utf8; # to conserve memory
	170	for my $ord (@ord){
	171	unless ($size == 4 or valid_ucs2($ord)) {
	172	unless(issurrogate($ord)){
	173	if ($ucs2){
fcb875d4	174	$chk and
f2a2953c	175	poisoned2death($obj, "code point too high", $ord);
	176
	177	push @str, FBCHAR;
	178	}else{
fcb875d4	179
f2a2953c	180	push @str, ensurrogate($ord);
	181	}
	182	}else{ # not supposed to happen
	183	push @str, FBCHAR;
	184	}
	185	}else{
	186	push @str, $ord;
	187	}
	188	}
	189	return pack("$endian*", @str);
	190	}
	191
	192	#
	193	# *_classic are slower but more memory conservative
	194	#
	195
aae85ceb	196	sub decode_classic($$;$)
f2a2953c	197	{
	198	my ($obj, $str, $chk ) = @_;
	199	my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
	200
	201	# warn "$size, $endian, $ucs2";
	202	$endian \|\|= BOMB($size, substr($str, 0, $size, ''))
	203	or poisoned2death($obj, "Where's the BOM?");
	204	my $mask = Mask->{$size};
	205	my $utf8 = '';
	206	my @ord = unpack("$endian*", $str);
	207	while (length($str)){
	208	my $ord = unpack($endian, substr($str, 0, $size, ''));
	209	unless ($size == 4 or valid_ucs2($ord &= $mask)){
	210	if ($ucs2){
fcb875d4	211	$chk and
f2a2953c	212	poisoned2death($obj, "no surrogates allowed", $ord);
	213	substr($str,0,$size,''); # skip the next one as well
	214	$ord = FBCHAR;
	215	}else{
	216	unless (isHiSurrogate($ord)){
	217	poisoned2death($obj, "Malformed HI surrogate", $ord);
	218	}
	219	my $lo = unpack($endian ,substr($str,0,$size,''));
	220	unless (isLoSurrogate($lo &= $mask)){
	221	poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
	222	}
	223	$ord = desurrogate($ord, $lo);
	224	}
	225	}
	226	$utf8 .= chr($ord);
	227	}
	228	utf8::upgrade($utf8);
	229	return $utf8;
	230	}
	231
aae85ceb	232	sub encode_classic($$;$)
f2a2953c	233	{
	234	my ($obj, $utf8, $chk) = @_;
	235	my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
	236	# warn join ", ", $size, $ucs2, $endian, $mask;
	237	my $str = '';
	238	unless ($endian){
	239	$endian = ($size == 4) ? 'N' : 'n';
	240	$str .= pack($endian, BOM_BE);
	241	}
	242	while (length($utf8)){
	243	my $ord = ord(substr($utf8,0,1,''));
	244	unless ($size == 4 or valid_ucs2($ord)) {
	245	unless(issurrogate($ord)){
	246	if ($ucs2){
fcb875d4	247	$chk and
f2a2953c	248	poisoned2death($obj, "code point too high", $ord);
	249	$str .= pack($endian, FBCHAR);
	250	}else{
	251	$str .= pack($endian.2, ensurrogate($ord));
	252	}
	253	}else{ # not supposed to happen
	254	$str .= pack($endian, FBCHAR);
	255	}
	256	}else{
	257	$str .= pack($endian, $ord);
	258	}
	259	}
	260	return $str;
	261	}
	262
	263	sub BOMB {
	264	my ($size, $bom) = @_;
	265	my $N = $size == 2 ? 'n' : 'N';
	266	my $ord = unpack($N, $bom);
fcb875d4	267	return ($ord eq BOM_BE) ? $N :
f2a2953c	268	($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
	269	}
	270
	271	sub poisoned2death{
	272	my $obj = shift;
	273	my $msg = shift;
	274	my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
	275	require Carp;
	276	Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
df1df145	277	}
	278
	279	1;
	280	__END__
67d7b5ef	281
	282	=head1 NAME
	283
0ab8f81e	284	Encode::Unicode -- Various Unicode Transformation Formats
67d7b5ef	285
67d7b5ef	286	=cut
f2a2953c	287
	288	=head1 SYNOPSIS
	289
fcb875d4	290	use Encode qw/encode decode/;
f2a2953c	291	$ucs2 = encode("UCS-2BE", $utf8);
	292	$utf8 = decode("UCS-2BE", $ucs2);
	293
	294	=head1 ABSTRACT
	295
	296	This module implements all Character Encoding Schemes of Unicode that
	297	are officially documented by Unicode Consortium (except, of course,
	298	for UTF-8, which is a native format in perl).
	299
	300	=over 4
	301
	302	=item L<http://www.unicode.org/glossary/> says:
	303
	304	I<Character Encoding Scheme> A character encoding form plus byte
	305	serialization. There are seven character encoding schemes in Unicode:
	306	UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
	307
	308	=item Quick Reference
	309
	310	Decodes from ord(N) Encodes chr(N) to...
	311	octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
	312	---------------+-----------------+------------------------------
	313	UCS-2BE 2 N N is bogus Not Available
	314	UCS-2LE 2 N N bogus Not Available
	315	UTF-16 2/4 Y Y is S.P S.P BE/LE
	316	UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
	317	UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
	318	UTF-32 4 Y - is bogus As is BE/LE
0ab8f81e	319	UTF-32BE 4 N - bogus As is 0x0001abcd
0ab8f81e	320	UTF-32LE 4 N - bogus As is 0xcdab0100
f2a2953c	321	UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
	322	---------------+-----------------+------------------------------
	323
	324	=back
	325
	326	=head1 Size, Endianness, and BOM
	327
0ab8f81e	328	You can categorize these CES by 3 criteria: size of each character,
0ab8f81e	329	endianness, and Byte Order Mark.
f2a2953c	330
0ab8f81e	331	=head2 by size
f2a2953c	332
f2a2953c	333	UCS-2 is a fixed-length encoding with each character taking 16 bits.
0ab8f81e	334	It B<does not> support I<surrogate pairs>. When a surrogate pair
	335	is encountered during decode(), its place is filled with \x{FFFD}
	336	if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a
	337	character whose ord value is larger than 0xFFFF is encountered,
	338	its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
	339	croaks if I<CHECK> is 1.
	340
	341	UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
f2a2953c	342	When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
0ab8f81e	343	following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
	344	form a character. Bogus surrogates result in death. When \x{10000}
	345	or above is encountered during encode(), it C<ensurrogate>s them and
	346	pushes the surrogate pair to the output stream.
f2a2953c	347
f2a2953c	348	UTF-32 is a fixed-length encoding with each character taking 32 bits.
0ab8f81e	349	Since it is 32-bit, there is no need for I<surrogate pairs>.
f2a2953c	350
0ab8f81e	351	=head2 by endianness
f2a2953c	352
0ab8f81e	353	The first (and now failed) goal of Unicode was to map all character
	354	repertoires into a fixed-length integer so that programmers are happy.
	355	Since each character is either a I<short> or I<long> in C, you have to
	356	pay attention to the endianness of each platform when you pass data
	357	to one another.
f2a2953c	358
f2a2953c	359	Anything marked as BE is Big Endian (or network byte order) and LE is
0ab8f81e	360	Little Endian (aka VAX byte order). For anything not marked either
	361	BE or LE, a character called Byte Order Mark (BOM) indicating the
	362	endianness is prepended to the string.
f2a2953c	363
	364	=over 4
	365
fcb875d4	366	=item BOM as integer when fetched in network byte order
f2a2953c	367
fcb875d4	368	16 32 bits/char
	369	-------------------------
	370	BE 0xFeFF 0x0000FeFF
	371	LE 0xFFeF 0xFFFe0000
	372	-------------------------
f2a2953c	373
f2a2953c	374	=back
fcb875d4	375
0ab8f81e	376	This modules handles the BOM as follows.
f2a2953c	377
	378	=over 4
	379
	380	=item *
	381
	382	When BE or LE is explicitly stated as the name of encoding, BOM is
0ab8f81e	383	simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).
f2a2953c	384
	385	=item *
	386
0ab8f81e	387	When BE or LE is omitted during decode(), it checks if BOM is at the
	388	beginning of the string; if one is found, the endianness is set to
	389	what the BOM says. If no BOM is found, the routine dies.
f2a2953c	390
	391	=item *
	392
	393	When BE or LE is omitted during encode(), it returns a BE-encoded
	394	string with BOM prepended. So when you want to encode a whole text
0ab8f81e	395	file, make sure you encode() the whole text at once, not line by line
0ab8f81e	396	or each line, not file, will have a BOM prepended.
f2a2953c	397
	398	=item *
	399
0ab8f81e	400	C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE.
f2a2953c	401	UCS-2 is already registered by IANA and others that way.
f2a2953c	402
fdd579e2	403	=back
f2a2953c	404
fcb875d4	405	=head1 Surrogate Pairs
f2a2953c	406
fcb875d4	407	To say the least, surrogate pairs were the biggest mistake of the
	408	Unicode Consortium. But according to the late Douglas Adams in I<The
	409	Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
	410	Universe was created. This has made a lot of people very angry and
	411	been widely regarded as a bad move>. Their mistake was not of this
	412	magnitude so let's forgive them.
f2a2953c	413
f2a2953c	414	(I don't dare make any comparison with Unicode Consortium and the
c731e18e	415	Vogons here ;) Or, comparing Encode to Babel Fish is completely
c731e18e	416	appropriate -- if you can only stick this into your ear :)
f2a2953c	417
0ab8f81e	418	Surrogate pairs were born when the Unicode Consortium finally
fcb875d4	419	admitted that 16 bits were not big enough to hold all the world's
0ab8f81e	420	character repertoires. But they already made UCS-2 16-bit. What
f2a2953c	421	do we do?
f2a2953c	422
0ab8f81e	423	Back then, the range 0xD800-0xDFFF was not allocated. Let's split
	424	that range in half and use the first half to represent the C<upper
	425	half of a character> and the second half to represent the C<lower
	426	half of a character>. That way, you can represent 1024 * 1024 =
	427	1048576 more characters. Now we can store character ranges up to
	428	\x{10ffff} even with 16-bit encodings. This pair of half-character is
	429	now called a I<surrogate pair> and UTF-16 is the name of the encoding
	430	that embraces them.
f2a2953c	431
448e90bb	432	Here is a formula to ensurrogate a Unicode character \x{10000} and
f2a2953c	433	above;
	434
	435	$hi = ($uni - 0x10000) / 0x400 + 0xD800;
	436	$lo = ($uni - 0x10000) % 0x400 + 0xDC00;
	437
	438	And to desurrogate;
	439
	440	$uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
	441
fcb875d4	442	Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
	443	perl does not prohibit the use of characters within this range. To perl,
	444	every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
	445
	446	(*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
0ab8f81e	447	integer support!
f2a2953c	448
	449	=head1 SEE ALSO
	450
fdd579e2	451	L<Encode>, L<http://www.unicode.org/glossary/>,
f2a2953c	452
fdd579e2	453	RFC 2781 L<http://rfc.net/rfc2781.html>,
	454
	455	L<http://www.unicode.org/unicode/faq/utf_bom.html>
	456
fcb875d4	457	Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
	458	by Larry Wall, Tom Christiansen, Jon Orwant;
	459	O'Reilly & Associates; ISBN 0-596-00027-8
	460
fdd579e2	461	=cut