X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=a36cc7420b21b71c3fa8799685629639a31948a2;hb=16ba52cf04376a6afd806dc9ac35d10e783fbbd1;hp=156e63f7174990cf0255cb6b6cf5008eceeffc12;hpb=847a5fae45dac396d0f9e1bb61d5b4ff9d94cdcd;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 156e63f..a36cc74 100644 --- a/utf8.c +++ b/utf8.c @@ -583,6 +583,59 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) } /* +=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8 + +Converts a string C of length C from UTF8 into byte encoding. +Unlike but like C, returns a pointer to +the newly-created string, and updates C to contain the new +length. Returns the original string if no conversion occurs, C +is unchanged. Do nothing if C points to 0. Sets C to +0 if C is converted or contains all 7bit characters. + +=cut */ + +U8 * +Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8) +{ + U8 *send; + U8 *d; + U8 *start = s; + I32 count = 0; + + if (!*is_utf8) + return start; + + /* ensure valid UTF8 and chars < 256 before converting string */ + for (send = s + *len; s < send;) { + U8 c = *s++; + if (!UTF8_IS_ASCII(c)) { + if (UTF8_IS_CONTINUATION(c) || s >= send || + !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c)) + return start; + s++, count++; + } + } + + *is_utf8 = 0; + + if (!count) + return start; + + Newz(801, d, (*len) - count + 1, U8); + s = start; start = d; + while (s < send) { + U8 c = *s++; + if (UTF8_IS_ASCII(c)) + *d++ = c; + else + *d++ = UTF8_ACCUMULATE(c&3, *s++); + } + *d = '\0'; + *len = d - start; + return start; +} + +/* =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len Converts a string C of length C from ASCII into UTF8 encoding.