From: Jarkko Hietaniemi Date: Sun, 16 Dec 2001 02:45:06 +0000 (+0000) Subject: Make creating UTF-8 surrogates a punishable act. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9466bab696bbe701541ead3a883c2387f5110da2;p=p5sagit%2Fp5-mst-13.2.git Make creating UTF-8 surrogates a punishable act. p4raw-id: //depot/perl@13707 --- diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 34be258..c10d56c 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -3998,6 +3998,14 @@ C<< @foo->[23] >> or C<< @$ref->[99] >>. Versions of perl <= 5.6.1 used to allow this syntax, but shouldn't have. It is now deprecated, and will be removed in a future version. +=item UTF-16 surrogate %s + +(F) You tried to generate half of an UTF-16 surrogate by requesting +a Unicode character between the code points 0xD800 and 0xDFFF (inclusive). +That range is reserved exclusively for the use of UTF-16 encoding +(by having two 16-bit UCS-2 characters); but Perl encodes its characters +in UTF-8, so what you got is a very illegal character. + =item Value of %s can be "0"; test with defined() (W misc) In a conditional expression, you used , <*> (glob), diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 4102fc4..103b33b 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -740,6 +740,12 @@ and the decoding is $uni = 0x10000 + ($hi - 0xD8000) * 0x400 + ($lo - 0xDC00); +If you try to generate surrogates (for example by using chr()), you +will get an error because firstly a surrogate on its own is +meaningless, and secondly because Perl encodes its Unicode characters +in UTF-8 (not 16-bit numbers), which makes the encoded character doubly +illegal. + Because of the 16-bitness, UTF-16 is byteorder dependent. UTF-16 itself can be used for in-memory computations, but if storage or transfer is required, either UTF-16BE (Big Endian) or UTF-16LE diff --git a/t/op/ord.t b/t/op/ord.t index f664078..f746055 100755 --- a/t/op/ord.t +++ b/t/op/ord.t @@ -1,34 +1,42 @@ #!./perl +BEGIN { + chdir 't' if -d 't'; + @INC = '.'; + require "test.pl"; +} + print "1..8\n"; # compile time evaluation # 'A' 65 ASCII # 'A' 193 EBCDIC -if (ord('A') == 65 || ord('A') == 193) {print "ok 1\n";} else {print "not ok 1\n";} -print "not " unless ord(chr(500)) == 500; -print "ok 2\n"; +ok(ord('A') == 65 || ord('A') == 193, "ord('A') is ".ord('A')); + +is(ord(chr(500)), 500, "compile time chr 500"); # run time evaluation $x = 'ABC'; -if (ord($x) == 65 || ord($x) == 193) {print "ok 3\n";} else {print "not ok 3\n";} -if (chr 65 eq 'A' || chr 193 eq 'A') {print "ok 4\n";} else {print "not ok 4\n";} +ok(ord($x) == 65 || ord($x) == 193, "ord('$x') is ".ord($x)); -print "not " unless ord(chr(500)) == 500; -print "ok 5\n"; +ok(chr 65 eq 'A' || chr 193 eq 'A', "chr can produce 'A'"); $x = 500; -print "not " unless ord(chr($x)) == $x; -print "ok 6\n"; +is(ord(chr($x)), $x, "runtime chr $x"); -print "not " unless ord("\x{1234}") == 0x1234; -print "ok 7\n"; +is(ord("\x{1234}"), 0x1234, 'compile time ord \x{....}'); $x = "\x{1234}"; -print "not " unless ord($x) == 0x1234; -print "ok 8\n"; +is(ord($x), 0x1234, 'runtime ord \x{....}'); + +{ + eval 'my $surrogate = chr(0xD800)'; + + like($@, qr/^UTF-16 surrogate 0xd800 /, "surrogates bad"); +} + diff --git a/utf8.c b/utf8.c index 75226ca..f21b13c 100644 --- a/utf8.c +++ b/utf8.c @@ -68,6 +68,8 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) return d; } if (uv < 0x10000) { + if (UNICODE_IS_SURROGATE(uv)) + Perl_croak(aTHX_ "UTF-16 surrogate 0x%04"UVxf, uv); *d++ = (( uv >> 12) | 0xe0); *d++ = (((uv >> 6) & 0x3f) | 0x80); *d++ = (( uv & 0x3f) | 0x80);