X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2Flib%2FEncode%2FUnicode.pm;h=1bbd9db0099fc5805104da9e32bdbef60496cd4c;hb=f2a2953c25503948c9a5e44b5ee7fe84a7da6b46;hp=f4818e330fd6c1278cc332d9114831fad1295729;hpb=735b7a62d039909fa334af8e05d4788f54c2c65a;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/lib/Encode/Unicode.pm b/ext/Encode/lib/Encode/Unicode.pm index f4818e3..1bbd9db 100644 --- a/ext/Encode/lib/Encode/Unicode.pm +++ b/ext/Encode/lib/Encode/Unicode.pm @@ -1,33 +1,259 @@ -package Encoding::Unicode; +package Encode::Unicode; + use strict; -our $VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +use warnings; + +our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; + +# +# Aux. subs & constants +# + +sub FBCHAR(){ 0xFFFd } +sub BOM_BE(){ 0xFeFF } +sub BOM16LE(){ 0xFFFe } +sub BOM32LE(){ 0xFeFF0000 } + +sub valid_ucs2($){ + if ($_[0] < 0xD800){ + return $_[0] > 0; + }else{ + return ($_[0] > 0xDFFFF && $_[0] <= 0xFFFF); + } +} + +sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF } +sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 } +sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF } + +sub ensurrogate($){ + use integer; # we have divisions + my $uni = shift; + my $hi = ($uni - 0x10000) / 0x400 + 0xD800; + my $lo = ($uni - 0x10000) % 0x400 + 0xDC00; + return ($hi, $lo); +} + +sub desurrogate($$){ + my ($hi, $lo) = @_; + return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00); +} -use base 'Encode::Encoding'; +sub Mask { {2 => 0xffff, 4 => 0xffffffff} } -__PACKAGE__->Define('Unicode') unless ord('A') == 65; +# +# Object Generator 8 transcoders all at once! +# -sub decode +require Encode; +for my $name (qw(UTF-16 UTF-16BE UTF-16LE + UTF-32 UTF-32BE UTF-32LE + UCS-2BE UCS-2LE)) { - my ($obj,$str,$chk) = @_; - my $res = ''; - for (my $i = 0; $i < length($str); $i++) - { - $res .= chr(utf8::unicode_to_native(ord(substr($str,$i,1)))); + my ($size, $endian, $ucs2, $mask); + $name =~ /^(\w+)-(\d+)(\w*)$/o; + if ($ucs2 = ($1 eq 'UCS')){ + $size = 2; + }else{ + $size = $2/8; } - $_[1] = '' if $chk; - return $res; + $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ; + $size == 4 and $endian = uc($endian); + + $Encode::Encoding{$name} = + bless { + Name => $name, + size => $size, + endian => $endian, + ucs2 => $ucs2, + }, __PACKAGE__; + } -sub encode +sub name { shift->{'Name'} } +sub new_sequence { $_[0] }; + +# +# the two implementation of (en|de)code exist. *_modern use +# array and *_classic stick with substr. *_classic is much +# slower but more memory conservative. *_moder is default. + +sub set_transcoder{ + no warnings qw(redefine); + my $type = shift; + if ($type eq "modern"){ + *decode = \&decode_modern; + *encode = \&encode_modern; + }elsif($type eq "classic"){ + *decode = \&decode_classic; + *encode = \&encode_classic; + }else{ + require Carp; + Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)"; + } +} + +set_transcoder("modern"); + +# +# *_modern are much faster but guzzle more memory +# + +sub decode_modern { - my ($obj,$str,$chk) = @_; - my $res = ''; - for (my $i = 0; $i < length($str); $i++) - { - $res .= chr(utf8::native_to_unicode(ord(substr($str,$i,1)))); + my ($obj, $str, $chk ) = @_; + my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; + + # warn "$size, $endian, $ucs2"; + $endian ||= BOMB($size, substr($str, 0, $size, '')) + or poisoned2death($obj, "Where's the BOM?"); + my $mask = Mask->{$size}; + my $utf8 = ''; + my @ord = unpack("$endian*", $str); + undef $str; # to conserve memory + while (@ord){ + my $ord = shift @ord; + unless ($size == 4 or valid_ucs2($ord &= $mask)){ + if ($ucs2){ + $chk and + poisoned2death($obj, "no surrogates allowed", $ord); + shift @ord; # skip the next one as well + $ord = FBCHAR; + }else{ + unless (isHiSurrogate($ord)){ + poisoned2death($obj, "Malformed HI surrogate", $ord); + } + my $lo = shift @ord; + unless (isLoSurrogate($lo &= $mask)){ + poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); + } + $ord = desurrogate($ord, $lo); + } + } + $utf8 .= chr($ord); } - $_[1] = '' if $chk; - return $res; + utf8::upgrade($utf8); + return $utf8; +} + +sub encode_modern +{ + my ($obj, $utf8, $chk) = @_; + my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; + my @str = (); + unless ($endian){ + $endian = ($size == 4) ? 'N' : 'n'; + push @str, BOM_BE; + } + my @ord = unpack("U*", $utf8); + undef $utf8; # to conserve memory + for my $ord (@ord){ + unless ($size == 4 or valid_ucs2($ord)) { + unless(issurrogate($ord)){ + if ($ucs2){ + $chk and + poisoned2death($obj, "code point too high", $ord); + + push @str, FBCHAR; + }else{ + + push @str, ensurrogate($ord); + } + }else{ # not supposed to happen + push @str, FBCHAR; + } + }else{ + push @str, $ord; + } + } + return pack("$endian*", @str); +} + +# +# *_classic are slower but more memory conservative +# + +sub decode_classic +{ + my ($obj, $str, $chk ) = @_; + my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; + + # warn "$size, $endian, $ucs2"; + $endian ||= BOMB($size, substr($str, 0, $size, '')) + or poisoned2death($obj, "Where's the BOM?"); + my $mask = Mask->{$size}; + my $utf8 = ''; + my @ord = unpack("$endian*", $str); + while (length($str)){ + my $ord = unpack($endian, substr($str, 0, $size, '')); + unless ($size == 4 or valid_ucs2($ord &= $mask)){ + if ($ucs2){ + $chk and + poisoned2death($obj, "no surrogates allowed", $ord); + substr($str,0,$size,''); # skip the next one as well + $ord = FBCHAR; + }else{ + unless (isHiSurrogate($ord)){ + poisoned2death($obj, "Malformed HI surrogate", $ord); + } + my $lo = unpack($endian ,substr($str,0,$size,'')); + unless (isLoSurrogate($lo &= $mask)){ + poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); + } + $ord = desurrogate($ord, $lo); + } + } + $utf8 .= chr($ord); + } + utf8::upgrade($utf8); + return $utf8; +} + +sub encode_classic +{ + my ($obj, $utf8, $chk) = @_; + my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; + # warn join ", ", $size, $ucs2, $endian, $mask; + my $str = ''; + unless ($endian){ + $endian = ($size == 4) ? 'N' : 'n'; + $str .= pack($endian, BOM_BE); + } + while (length($utf8)){ + my $ord = ord(substr($utf8,0,1,'')); + unless ($size == 4 or valid_ucs2($ord)) { + unless(issurrogate($ord)){ + if ($ucs2){ + $chk and + poisoned2death($obj, "code point too high", $ord); + $str .= pack($endian, FBCHAR); + }else{ + $str .= pack($endian.2, ensurrogate($ord)); + } + }else{ # not supposed to happen + $str .= pack($endian, FBCHAR); + } + }else{ + $str .= pack($endian, $ord); + } + } + return $str; +} + +sub BOMB { + my ($size, $bom) = @_; + my $N = $size == 2 ? 'n' : 'N'; + my $ord = unpack($N, $bom); + return ($ord eq BOM_BE) ? $N : + ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef; +} + +sub poisoned2death{ + my $obj = shift; + my $msg = shift; + my $pair = join(", ", map {sprintf "\\x%x", $_} @_); + require Carp; + Carp::croak $obj->name, ":", $msg, "<$pair>.", caller; } 1; @@ -35,6 +261,162 @@ __END__ =head1 NAME -Encode::Unicode -- for internal use only +Encode::Unicode -- Various Unicode Transform Format =cut + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $ucs2 = encode("UCS-2BE", $utf8); + $utf8 = decode("UCS-2BE", $ucs2); + +=head1 ABSTRACT + +This module implements all Character Encoding Schemes of Unicode that +are officially documented by Unicode Consortium (except, of course, +for UTF-8, which is a native format in perl). + +=over 4 + +=item L says: + +I A character encoding form plus byte +serialization. There are seven character encoding schemes in Unicode: +UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE. + +=item Quick Reference + + Decodes from ord(N) Encodes chr(N) to... + octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} == + ---------------+-----------------+------------------------------ + UCS-2BE 2 N N is bogus Not Available + UCS-2LE 2 N N bogus Not Available + UTF-16 2/4 Y Y is S.P S.P BE/LE + UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd + UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf + UTF-32 4 Y - is bogus As is BE/LE + UTF-32BE 4 N - bogus As is 0x0010abcd + UTF-32LE 4 N - bogus As is 0xcdab1000 + UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d + ---------------+-----------------+------------------------------ + +=back + +=head1 Size, Endianness, and BOM + +You can categorize these CES by 3 criteria; Size of each character, +Endianness, and Byte Order Mark. + +=head2 by Size + +UCS-2 is a fixed-length encoding with each character taking 16 bits. +It B support I. When surrogate pair is +encountered during decode(), it fills its place with \xFFFD without +I or croaks if I. When a character which ord value is +larger than 0xFFFF, it uses 0xFFFD without I or croaks if +. + +UTF-16 is almost the same as UCS-2 but it supports I. +When it encounters a high surrogate (0xD800-0xDBFF), it fetches the +following low surrogate (0xDC00-0xDFFF), C them to form a +character. Bogus surrogates result in death. When \x{10000} or above +is encountered during encode(), it Cs them and push the +surrogate pair to the output stream. + +UTF-32 is a fixed-length encoding with each character taking 32 bits. +Since it is 32-bit there is no need for I. + +=head2 by Endianness + +First (and now failed) goal of Unicode was to map all character +repartories into a fixed-length integer so programmers are happy. +Since each character is either I or I in C, you have to +put endianness of each platform when you pass data to one another. + +Anything marked as BE is Big Endian (or network byte order) and LE is +Little Endian (aka VAX byte order). For anything without, a character +called Byte Order Mark (BOM) is prepended to the head of string. + +=over 4 + +=item BOM as integer + + 16 32 bits/char +------------------------- +BE 0xFeFF 0x0000FeFF +LE 0xFFeF 0xFeFF0000 +------------------------- + +=back + +This modules handles BOM as follows. + +=over 4 + +=item * + +When BE or LE is explicitly stated as the name of encoding, BOM is +simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE). + +=item * + +When BE or LE is omitted during decode(), it checks if BOM is in the +beginning of the string and if found endianness is set to what BOM +says. if not found, dies. + +=item * + +When BE or LE is omitted during encode(), it returns a BE-encoded +string with BOM prepended. So when you want to encode a whole text +file, make sure you encode() by whole text, not line by line or each +line, not file, is prepended with BOMs. + +=item * + +C is an exception. Unlike others this is an alias of UCS-2BE. +UCS-2 is already registered by IANA and others that way. + + +=head1 The Surrogate Pair + +To say the least, surrogate pair was the biggest mistake by Unicode +Consortium. I don't give a darn if they admit it or not. But +according to late Douglas Adams in I Triology, C. Their mistake was not this magnitude so let's forgive them. + +(I don't dare make any comparison with Unicode Consortium and the +Vogols here :) + +A surrogate pair was born when Unicode Consortium had finally +admitted that 16 bit was not big enough to hold all the world's +character repartorie. But they have already made UCS-2 16-bit. What +do we do? + +Back then 0xD800-0xDFFF was not allocated. Let's split them half and +use the first half to represent C and the +latter C. That way you can represent 1024 +* 1024 = 1048576 more characters. Now we can store character ranges +up to \x{10ffff} even with 16-bit encodings. This pair of +half-character is now called a I and UTF-16 is the +name of encoding that embraces them. + +Here is a fomula to ensurrogate a Unicode character \x{10000} and +above; + + $hi = ($uni - 0x10000) / 0x400 + 0xD800; + $lo = ($uni - 0x10000) % 0x400 + 0xDC00; + +And to desurrogate; + + $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); + +Note this move has made \x{D800}-\x{DFFF} forbidden zone but perl +does not prohibit them for uses. + +=head1 SEE ALSO + +L, L + +=back