X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=e86c49fdcf63ab3aac316e0f63656274f4ddbf6b;hb=a65e9df7d11c78917193138b8249299b47efd8b4;hp=9bb89a4ca6631221f43b15254133b072076099c4;hpb=1e72252ad7b8e23d1a1142285b8aa82986bd2491;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 9bb89a4..e86c49f 100644 --- a/utf8.c +++ b/utf8.c @@ -134,6 +134,30 @@ Perl_is_utf8_char(pTHX_ U8 *s) return len; } +/* +=for apidoc Am|bool_utf8_string|U8 *s|STRLEN len + +Returns true if first C bytes of the given string form valid a UTF8 +string, false otherwise. + +=cut +*/ + +bool +Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) +{ + U8* x=s; + U8* send=s+len; + int c; + while (x < send) { + c = is_utf8_char(x); + x += c; + if (!c || x > send) + return 0; + } + return 1; +} + UV Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen) { @@ -296,19 +320,26 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len) return dst; } -/* XXX NOTHING CALLS THE FOLLOWING TWO ROUTINES YET!!! */ /* - * Convert native or reversed UTF-16 to UTF-8. + * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. * * Destination must be pre-extended to 3/2 source. Do not use in-place. * We optimize for native, for obvious reasons. */ U8* -Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen) +Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) { - U16* pend = p + bytelen / 2; + U8* pend; + U8* dstart = d; + + if (bytelen & 1) + Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen"); + + pend = p + bytelen; + while (p < pend) { - UV uv = *p++; + UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ + p += 2; if (uv < 0x80) { *d++ = uv; continue; @@ -320,13 +351,9 @@ Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen) } if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */ dTHR; - int low = *p++; - if (low < 0xdc00 || low >= 0xdfff) { - if (ckWARN_d(WARN_UTF8)) - Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-16 surrogate"); - p--; - uv = 0xfffd; - } + UV low = *p++; + if (low < 0xdc00 || low >= 0xdfff) + Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; } if (uv < 0x10000) { @@ -343,13 +370,14 @@ Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen) continue; } } + *newlen = d - dstart; return d; } /* Note: this one is slightly destructive of the source. */ U8* -Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8* d, I32 bytelen) +Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) { U8* s = (U8*)p; U8* send = s + bytelen; @@ -359,7 +387,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8* d, I32 bytelen) s[1] = tmp; s += 2; } - return utf16_to_utf8(p, d, bytelen); + return utf16_to_utf8(p, d, bytelen, newlen); } /* for now these are all defined (inefficiently) in terms of the utf8 versions */