X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=4f4c3eaffe5ee36cbf52c3271ccee7633d01cbd3;hb=3127de7dcb470aaa3997c4164384a5bfb3162ffe;hp=08fc5416730e9eeedb2dfe1d05591e4283b41594;hpb=87cea99e29dc843a5ce7742434c86a627eb3f6f5;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 08fc541..4f4c3ea 100644 --- a/utf8.c +++ b/utf8.c @@ -9,16 +9,23 @@ */ /* - * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever - * heard of that we don't want to see any closer; and that's the one place - * we're trying to get to! And that's just where we can't get, nohow.' + * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever + * heard of that we don't want to see any closer; and that's the one place + * we're trying to get to! And that's just where we can't get, nohow.' + * + * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"] * * 'Well do I understand your speech,' he answered in the same language; * 'yet few strangers do so. Why then do you not speak in the Common Tongue, - * as is the custom in the West, if you wish to be answered?' + * as is the custom in the West, if you wish to be answered?' + * --Gandalf, addressing Théoden's door wardens + * + * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] * * ...the travellers perceived that the floor was paved with stones of many * hues; branching runes and strange devices intertwined beneath their feet. + * + * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] */ #include "EXTERN.h" @@ -240,9 +247,9 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len) =for apidoc is_utf8_char Tests if some arbitrary number of bytes begins in a valid UTF-8 -character. Note that an INVARIANT (i.e. ASCII) character is a valid -UTF-8 character. The actual number of bytes in the UTF-8 character -will be returned if it is valid, otherwise 0. +character. Note that an INVARIANT (i.e. 
ASCII on non-EBCDIC machines) +character is a valid UTF-8 character. The actual number of bytes in the UTF-8 +character will be returned if it is valid, otherwise 0. =cut */ STRLEN @@ -641,7 +648,7 @@ Returns the Unicode code point of the first character in the string C<s> which is assumed to be in UTF-8 encoding; C<retlen> will be set to the length, in bytes, of that character. -This function should only be used when returned UV is considered +This function should only be used when the returned UV is considered an index into the Unicode semantic tables (e.g. swashes). If C<s> does not point to a well-formed UTF-8 character, zero is @@ -765,7 +772,7 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off) /* =for apidoc utf8_to_bytes -Converts a string C<s> of length C<len> from UTF-8 into byte encoding. +Converts a string C<s> of length C<len> from UTF-8 into native byte encoding. Unlike C<bytes_to_utf8>, this over-writes the original string, and updates len to contain the new length. Returns zero on failure, setting C<len> to -1. @@ -810,12 +817,13 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) /* =for apidoc bytes_from_utf8 -Converts a string C<s> of length C<len> from UTF-8 into byte encoding. +Converts a string C<s> of length C<len> from UTF-8 into native byte encoding. Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to the newly-created string, and updates C<len> to contain the new length. Returns the original string if no conversion occurs, C<len> is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to -0 if C<s> is converted or contains all 7bit characters. +0 if C<s> is converted or consisted entirely of characters that are invariant +in utf8 (i.e., US-ASCII on non-EBCDIC machines). =cut */ @@ -867,11 +875,14 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) /* =for apidoc bytes_to_utf8 -Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding. +Converts a string C<s> of length C<len> from the native encoding into UTF-8. Returns a pointer to the newly-created string, and sets C<len> to reflect the new length. 
-If you want to convert to UTF-8 from other encodings than ASCII, +A NUL character will be written after the end of the string. + +If you want to convert to UTF-8 from encodings other than +the native (Latin1 or EBCDIC), see sv_recode_to_utf8(). =cut @@ -1487,7 +1498,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, if (special && (uv1 == 0xDF || uv1 > 0xFF)) { /* It might be "special" (sometimes, but not always, * a multicharacter mapping) */ - HV * const hv = get_hv(special, FALSE); + HV * const hv = get_hv(special, 0); SV **svp; if (hv && @@ -2254,7 +2265,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f PERL_ARGS_ASSERT_PV_UNI_DISPLAY; - sv_setpvn(dsv, "", 0); + sv_setpvs(dsv, ""); SvUTF8_off(dsv); for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { UV u; @@ -2288,7 +2299,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f } if (ok) { const char string = ok; - sv_catpvn(dsv, "\\", 1); + sv_catpvs(dsv, "\\"); sv_catpvn(dsv, &string, 1); } }