X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=3b5d5d4a9c5185ad6da7fb207759aaeb8a35f61c;hb=bc9c751170bc3ae5aa8c6bbdbe4c1e970b6ad2a6;hp=d9533301ff984a7a00ee2845376681ddef82d54e;hpb=a0714e2c8319bd04d1f7d262de652b6b5ec054f7;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index d953330..3b5d5d4 100644 --- a/utf8.c +++ b/utf8.c @@ -1,6 +1,6 @@ /* utf8.c * - * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, + * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 * by Larry Wall and others * * You may distribute under the terms of either the GNU General Public @@ -25,6 +25,13 @@ #define PERL_IN_UTF8_C #include "perl.h" +#ifndef EBCDIC +/* Separate prototypes needed because in ASCII systems these + * usually macros but they still are compiled as code, too. */ +PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags); +PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); +#endif + static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; @@ -64,6 +71,8 @@ is the recommended Unicode-aware way of saying U8 * Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) { + PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS; + if (ckWARN(WARN_UTF8)) { if (UNICODE_IS_SURROGATE(uv) && !(flags & UNICODE_ALLOW_SURROGATE)) @@ -186,12 +195,14 @@ five bytes or more. =cut */ STATIC STRLEN -S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len) +S_is_utf8_char_slow(const U8 *s, const STRLEN len) { U8 u = *s; STRLEN slen; UV uv, ouv; + PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW; + if (UTF8_IS_INVARIANT(u)) return 1; @@ -238,6 +249,9 @@ STRLEN Perl_is_utf8_char(pTHX_ const U8 *s) { const STRLEN len = UTF8SKIP(s); + + PERL_ARGS_ASSERT_IS_UTF8_CHAR; + PERL_UNUSED_CONTEXT; #ifdef IS_UTF8_CHAR if (IS_UTF8_CHAR_FAST(len)) return IS_UTF8_CHAR(s, len) ? len : 0; @@ -261,12 +275,11 @@ See also is_utf8_string_loclen() and is_utf8_string_loc(). bool Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) { + const U8* const send = s + (len ? len : strlen((const char *)s)); const U8* x = s; - const U8* send; - if (!len) - len = strlen((const char *)s); - send = s + len; + PERL_ARGS_ASSERT_IS_UTF8_STRING; + PERL_UNUSED_CONTEXT; while (x < send) { STRLEN c; @@ -281,9 +294,10 @@ Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) c = UTF8SKIP(x); if (IS_UTF8_CHAR_FAST(c)) { if (!IS_UTF8_CHAR(x, c)) - goto out; - } else if (!is_utf8_char_slow(x, c)) - goto out; + c = 0; + } + else + c = is_utf8_char_slow(x, c); #else c = is_utf8_char(x); #endif /* #ifdef IS_UTF8_CHAR */ @@ -326,15 +340,13 @@ See also is_utf8_string_loc() and is_utf8_string(). bool Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { + const U8* const send = s + (len ? len : strlen((const char *)s)); const U8* x = s; - const U8* send; STRLEN c; + STRLEN outlen = 0; - if (!len) - len = strlen((const char *)s); - send = s + len; - if (el) - *el = 0; + PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; + PERL_UNUSED_CONTEXT; while (x < send) { /* Inline the easy bits of is_utf8_char() here for speed... */ @@ -358,17 +370,16 @@ Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN goto out; } x += c; - if (el) - (*el)++; + outlen++; } out: + if (el) + *el = outlen; + if (ep) *ep = x; - if (x != send) - return FALSE; - - return TRUE; + return (x == send); } /* @@ -376,7 +387,7 @@ Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags Bottom level UTF-8 decode routine. -Returns the unicode code point value of the first character in the string C +Returns the Unicode code point value of the first character in the string C which is assumed to be in UTF-8 encoding and no longer than C; C will be set to the length, in bytes, of that character. @@ -400,7 +411,7 @@ UV Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) { dVAR; - const U8 *s0 = s; + const U8 * const s0 = s; UV uv = *s, ouv = 0; STRLEN len = 1; const bool dowarn = ckWARN_d(WARN_UTF8); @@ -408,6 +419,8 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) STRLEN expectlen = 0; U32 warning = 0; + PERL_ARGS_ASSERT_UTF8N_TO_UVUNI; + /* This list is a superset of the UTF8_ALLOW_XXX. */ #define UTF8_WARN_EMPTY 1 @@ -529,12 +542,12 @@ malformed: if (flags & UTF8_CHECK_ONLY) { if (retlen) - *retlen = -1; + *retlen = ((STRLEN) -1); return 0; } if (dowarn) { - SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character ")); + SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP); switch (warning) { case 0: /* Intentionally empty. */ break; @@ -615,6 +628,8 @@ returned and retlen is set, if possible, to -1. UV Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) { + PERL_ARGS_ASSERT_UTF8_TO_UVCHR; + return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } @@ -638,6 +653,8 @@ returned and retlen is set, if possible, to -1. UV Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) { + PERL_ARGS_ASSERT_UTF8_TO_UVUNI; + /* Call the low level routine asking for checks */ return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); @@ -658,6 +675,9 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) { dVAR; STRLEN len = 0; + U8 t = 0; + + PERL_ARGS_ASSERT_UTF8_LENGTH; /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. * the bitops (especially ~) can create illegal UTF-8. @@ -666,7 +686,7 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) if (e < s) goto warn_and_return; while (s < e) { - const U8 t = UTF8SKIP(s); + t = UTF8SKIP(s); if (e - s < t) { warn_and_return: if (ckWARN_d(WARN_UTF8)) { @@ -700,43 +720,9 @@ same UTF-8 buffer. IV Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) { - dVAR; - IV off = 0; - - /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. - * the bitops (especially ~) can create illegal UTF-8. - * In other words: in Perl UTF-8 is not just for Unicode. */ - - if (a < b) { - while (a < b) { - const U8 c = UTF8SKIP(a); - if (b - a < c) - goto warn_and_return; - a += c; - off--; - } - } - else { - while (b < a) { - const U8 c = UTF8SKIP(b); - - if (a - b < c) { - warn_and_return: - if (ckWARN_d(WARN_UTF8)) { - if (PL_op) - Perl_warner(aTHX_ packWARN(WARN_UTF8), - "%s in %s", unees, OP_DESC(PL_op)); - else - Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); - } - return off; - } - b += c; - off++; - } - } + PERL_ARGS_ASSERT_UTF8_DISTANCE; - return off; + return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); } /* @@ -755,6 +741,9 @@ on the first byte of character or just after the last byte of a character. U8 * Perl_utf8_hop(pTHX_ const U8 *s, I32 off) { + PERL_ARGS_ASSERT_UTF8_HOP; + + PERL_UNUSED_CONTEXT; /* Note: cannot use UTF8_IS_...() too eagerly here since e.g * the bitops (especially ~) can create illegal UTF-8. * In other words: in Perl UTF-8 is not just for Unicode. */ @@ -781,24 +770,28 @@ Unlike C, this over-writes the original string, and updates len to contain the new length. Returns zero on failure, setting C to -1. +If you need a copy of the string, see C. + =cut */ U8 * Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) { - U8 *send; + U8 * const save = s; + U8 * const send = s + *len; U8 *d; - U8 *save = s; + + PERL_ARGS_ASSERT_UTF8_TO_BYTES; /* ensure valid UTF-8 and chars < 256 before updating string */ - for (send = s + *len; s < send; ) { + while (s < send) { U8 c = *s++; if (!UTF8_IS_INVARIANT(c) && (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send) || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) { - *len = -1; + *len = ((STRLEN) -1); return 0; } } @@ -835,6 +828,9 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) const U8 *send; I32 count = 0; + PERL_ARGS_ASSERT_BYTES_FROM_UTF8; + + PERL_UNUSED_CONTEXT; if (!*is_utf8) return (U8 *)start; @@ -850,9 +846,9 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) } } - *is_utf8 = 0; + *is_utf8 = FALSE; - Newxz(d, (*len) - count + 1, U8); + Newx(d, (*len) - count + 1, U8); s = start; start = d; while (s < send) { U8 c = *s++; @@ -888,7 +884,10 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len) U8 *d; U8 *dst; - Newxz(d, (*len) * 2 + 1, U8); + PERL_ARGS_ASSERT_BYTES_TO_UTF8; + PERL_UNUSED_CONTEXT; + + Newx(d, (*len) * 2 + 1, U8); dst = d; while (s < send) { @@ -917,6 +916,8 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) U8* pend; U8* dstart = d; + PERL_ARGS_ASSERT_UTF16_TO_UTF8; + if (bytelen == 1 && p[0] == 0) { /* Be understanding. */ d[0] = 0; *newlen = 1; @@ -924,7 +925,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) } if (bytelen & 1) - Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen); + Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen); pend = p + bytelen; @@ -932,7 +933,11 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ p += 2; if (uv < 0x80) { +#ifdef EBCDIC + *d++ = UNI_TO_NATIVE(uv); +#else *d++ = (U8)uv; +#endif continue; } if (uv < 0x800) { @@ -971,9 +976,12 @@ U8* Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) { U8* s = (U8*)p; - U8* send = s + bytelen; + U8* const send = s + bytelen; + + PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; + while (s < send) { - U8 tmp = s[0]; + const U8 tmp = s[0]; s[0] = s[1]; s[1] = tmp; s += 2; @@ -1098,6 +1106,8 @@ Perl_is_uni_xdigit(pTHX_ UV c) UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) { + PERL_ARGS_ASSERT_TO_UNI_UPPER; + uvchr_to_utf8(p, c); return to_utf8_upper(p, p, lenp); } @@ -1105,6 +1115,8 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) UV Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) { + PERL_ARGS_ASSERT_TO_UNI_TITLE; + uvchr_to_utf8(p, c); return to_utf8_title(p, p, lenp); } @@ -1112,6 +1124,8 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) UV Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) { + PERL_ARGS_ASSERT_TO_UNI_LOWER; + uvchr_to_utf8(p, c); return to_utf8_lower(p, p, lenp); } @@ -1119,6 +1133,8 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) UV Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp) { + PERL_ARGS_ASSERT_TO_UNI_FOLD; + uvchr_to_utf8(p, c); return to_utf8_fold(p, p, lenp); } @@ -1244,6 +1260,9 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, const char *const swashname) { dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_COMMON; + if (!is_utf8_char(p)) return FALSE; if (!*swash) @@ -1255,120 +1274,168 @@ bool Perl_is_utf8_alnum(pTHX_ const U8 *p) { dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_ALNUM; + /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true * descendant of isalnum(3), in other words, it doesn't * contain the '_'. --jhi */ - return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord"); + return is_utf8_common(p, &PL_utf8_alnum, "IsWord"); } bool Perl_is_utf8_alnumc(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC"); + + PERL_ARGS_ASSERT_IS_UTF8_ALNUMC; + + return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC"); } bool Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */ { dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_IDFIRST; + if (*p == '_') return TRUE; /* is_utf8_idstart would be more logical. */ - return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart"); + return is_utf8_common(p, &PL_utf8_idstart, "IdStart"); } bool Perl_is_utf8_idcont(pTHX_ const U8 *p) { dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_IDCONT; + if (*p == '_') return TRUE; - return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue"); + return is_utf8_common(p, &PL_utf8_idcont, "IdContinue"); } bool Perl_is_utf8_alpha(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha"); + + PERL_ARGS_ASSERT_IS_UTF8_ALPHA; + + return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha"); } bool Perl_is_utf8_ascii(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii"); + + PERL_ARGS_ASSERT_IS_UTF8_ASCII; + + return is_utf8_common(p, &PL_utf8_ascii, "IsAscii"); } bool Perl_is_utf8_space(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl"); + + PERL_ARGS_ASSERT_IS_UTF8_SPACE; + + return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl"); } bool Perl_is_utf8_digit(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit"); + + PERL_ARGS_ASSERT_IS_UTF8_DIGIT; + + return is_utf8_common(p, &PL_utf8_digit, "IsDigit"); } bool Perl_is_utf8_upper(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase"); + + PERL_ARGS_ASSERT_IS_UTF8_UPPER; + + return is_utf8_common(p, &PL_utf8_upper, "IsUppercase"); } bool Perl_is_utf8_lower(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase"); + + PERL_ARGS_ASSERT_IS_UTF8_LOWER; + + return is_utf8_common(p, &PL_utf8_lower, "IsLowercase"); } bool Perl_is_utf8_cntrl(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl"); + + PERL_ARGS_ASSERT_IS_UTF8_CNTRL; + + return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl"); } bool Perl_is_utf8_graph(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph"); + + PERL_ARGS_ASSERT_IS_UTF8_GRAPH; + + return is_utf8_common(p, &PL_utf8_graph, "IsGraph"); } bool Perl_is_utf8_print(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint"); + + PERL_ARGS_ASSERT_IS_UTF8_PRINT; + + return is_utf8_common(p, &PL_utf8_print, "IsPrint"); } bool Perl_is_utf8_punct(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct"); + + PERL_ARGS_ASSERT_IS_UTF8_PUNCT; + + return is_utf8_common(p, &PL_utf8_punct, "IsPunct"); } bool Perl_is_utf8_xdigit(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit"); + + PERL_ARGS_ASSERT_IS_UTF8_XDIGIT; + + return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit"); } bool Perl_is_utf8_mark(pTHX_ const U8 *p) { dVAR; - return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM"); + + PERL_ARGS_ASSERT_IS_UTF8_MARK; + + return is_utf8_common(p, &PL_utf8_mark, "IsM"); } /* @@ -1403,12 +1470,14 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, dVAR; U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; STRLEN len = 0; - const UV uv0 = utf8_to_uvchr(p, NULL); /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings * are necessary in EBCDIC, they are redundant no-ops * in ASCII-ish platforms, and hopefully optimized away. */ const UV uv1 = NATIVE_TO_UNI(uv0); + + PERL_ARGS_ASSERT_TO_UTF8_CASE; + uvuni_to_utf8(tmpbuf, uv1); if (!*swashp) /* load on-demand */ @@ -1418,10 +1487,10 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, if (special && (uv1 == 0xDF || uv1 > 0xFF)) { /* It might be "special" (sometimes, but not always, * a multicharacter mapping) */ - HV *hv; + HV * const hv = get_hv(special, FALSE); SV **svp; - if ((hv = get_hv(special, FALSE)) && + if (hv && (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) && (*svp)) { const char *s; @@ -1441,7 +1510,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, STRLEN tlen = 0; while (t < tend) { - UV c = utf8_to_uvchr(t, &tlen); + const UV c = utf8_to_uvchr(t, &tlen); if (tlen > 0) { d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); t += tlen; @@ -1466,12 +1535,11 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, } if (!len && *swashp) { - UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE); - + const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE); + if (uv2) { /* It was "normal" (a single character mapping). */ - UV uv3 = UNI_TO_NATIVE(uv2); - + const UV uv3 = UNI_TO_NATIVE(uv2); len = uvchr_to_utf8(ustrp, uv3) - ustrp; } } @@ -1502,6 +1570,9 @@ UV Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) { dVAR; + + PERL_ARGS_ASSERT_TO_UTF8_UPPER; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper"); } @@ -1523,6 +1594,9 @@ UV Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) { dVAR; + + PERL_ARGS_ASSERT_TO_UTF8_TITLE; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle"); } @@ -1544,6 +1618,9 @@ UV Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) { dVAR; + + PERL_ARGS_ASSERT_TO_UTF8_LOWER; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower"); } @@ -1566,6 +1643,9 @@ UV Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) { dVAR; + + PERL_ARGS_ASSERT_TO_UTF8_FOLD; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold"); } @@ -1581,13 +1661,14 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits { dVAR; SV* retval; - SV* const tokenbufsv = sv_newmortal(); dSP; const size_t pkg_len = strlen(pkg); const size_t name_len = strlen(name); - HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE); + HV * const stash = gv_stashpvn(pkg, pkg_len, 0); SV* errsv_save; + PERL_ARGS_ASSERT_SWASH_INIT; + PUSHSTACKi(PERLSI_MAGIC); ENTER; SAVEI32(PL_hints); @@ -1596,6 +1677,13 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */ ENTER; errsv_save = newSVsv(ERRSV); + /* It is assumed that callers of this routine are not passing in any + user derived data. */ + /* Need to do this after save_re_context() as it will set PL_tainted to + 1 while saving $1 etc (see the code after getrx: in Perl_magic_get). + Even line to create errsv_save can turn on PL_tainted. */ + SAVEBOOL(PL_tainted); + PL_tainted = 0; Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len), NULL); if (!SvTRUE(ERRSV)) @@ -1606,18 +1694,12 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits SPAGAIN; PUSHMARK(SP); EXTEND(SP,5); - PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len))); - PUSHs(sv_2mortal(newSVpvn(name, name_len))); + mPUSHp(pkg, pkg_len); + mPUSHp(name, name_len); PUSHs(listsv); - PUSHs(sv_2mortal(newSViv(minbits))); - PUSHs(sv_2mortal(newSViv(none))); + mPUSHi(minbits); + mPUSHi(none); PUTBACK; - if (IN_PERL_COMPILETIME) { - /* XXX ought to be handled by lex_start */ - SAVEI32(PL_in_my); - PL_in_my = 0; - sv_setpv(tokenbufsv, PL_tokenbuf); - } errsv_save = newSVsv(ERRSV); if (call_method("SWASHNEW", G_SCALAR)) retval = newSVsv(*PL_stack_sp--); @@ -1629,16 +1711,12 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits LEAVE; POPSTACK; if (IN_PERL_COMPILETIME) { - STRLEN len; - const char* const pv = SvPV_const(tokenbufsv, len); - - Copy(pv, PL_tokenbuf, len+1, char); - PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK); + CopHINTS_set(PL_curcop, PL_hints); } if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) { if (SvPOK(retval)) Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"", - retval); + SVfARG(retval)); Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref"); } return retval; @@ -1671,7 +1749,9 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) U32 bit; SV *swatch; U8 tmputf8[2]; - UV c = NATIVE_TO_ASCII(*ptr); + const UV c = NATIVE_TO_ASCII(*ptr); + + PERL_ARGS_ASSERT_SWASH_FETCH; if (!do_utf8 && !UNI_IS_INVARIANT(c)) { tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c); @@ -1734,7 +1814,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) needents); if (IN_PERL_COMPILETIME) - PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK); + CopHINTS_set(PL_curcop, PL_hints); svp = hv_store(hv, (const char *)ptr, klen, swatch, 0); @@ -1744,7 +1824,8 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) } PL_last_swash_hv = hv; - PL_last_swash_klen = klen; + assert(klen <= sizeof(PL_last_swash_key)); + PL_last_swash_klen = (U8)klen; /* FIXME change interpvar.h? */ PL_last_swash_tmps = (U8 *) tmps; PL_last_swash_slen = slen; @@ -1767,7 +1848,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ; } Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width"); - return 0; + NORETURN_FUNCTION_END; } /* Note: @@ -1782,7 +1863,6 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) SV *swatch; U8 *l, *lend, *x, *xend, *s; STRLEN lcur, xcur, scur; - HV* const hv = (HV*)SvRV(swash); SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); @@ -1796,15 +1876,17 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) const UV none = SvUV(*nonesvp); const UV end = start + span; + PERL_ARGS_ASSERT_SWASH_GET; + if (bits != 1 && bits != 8 && bits != 16 && bits != 32) { Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf, (UV)bits); } /* create and initialize $swatch */ - swatch = newSVpvs(""); scur = octets ? (span * octets) : (span + 7) / 8; - SvGROW(swatch, scur + 1); + swatch = newSV(scur); + SvPOK_on(swatch); s = (U8*)SvPVX(swatch); if (octets && none) { const U8* const e = s + scur; @@ -1834,7 +1916,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) l = (U8*)SvPV(*listsvp, lcur); lend = l + lcur; while (l < lend) { - UV min, max, val, key; + UV min, max, val; STRLEN numlen; I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX; @@ -1907,6 +1989,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) continue; if (octets) { + UV key; if (min < start) { if (!none || val < none) { val += start - min; @@ -1937,6 +2020,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) } } else { /* bits == 1, then val should be ignored */ + UV key; if (min < start) min = start; for (key = min; key <= max; key++) { @@ -1962,7 +2046,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) U8 *s, *o, *nl; STRLEN slen, olen; - U8 opc = *x++; + const U8 opc = *x++; if (opc == '\n') continue; @@ -2032,7 +2116,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) else { STRLEN otheroctets = otherbits >> 3; STRLEN offset = 0; - U8* send = s + slen; + U8* const send = s + slen; while (s < send) { UV otherval = 0; @@ -2051,7 +2135,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) } if (opc == '+' && otherval) - ; /* replace with otherval */ + NOOP; /* replace with otherval */ else if (opc == '!' && !otherval) otherval = 1; else if (opc == '-' && otherval) @@ -2105,12 +2189,16 @@ is the recommended wide native character-aware way of saying U8 * Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) { + PERL_ARGS_ASSERT_UVCHR_TO_UTF8; + return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0); } U8 * Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) { + PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS; + return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags); } @@ -2135,6 +2223,9 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) { const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); + + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; + return UNI_TO_NATIVE(uv); } @@ -2161,7 +2252,10 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f int truncated = 0; const char *s, *e; + PERL_ARGS_ASSERT_PV_UNI_DISPLAY; + sv_setpvn(dsv, "", 0); + SvUTF8_off(dsv); for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { UV u; /* This serves double duty as a flag and a character to print after @@ -2193,12 +2287,14 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f default: break; } if (ok) { - Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok); + const char string = ok; + sv_catpvn(dsv, &string, 1); } } /* isPRINT() is the locale-blind version. */ if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { - Perl_sv_catpvf(aTHX_ dsv, "%c", c); + const char string = c; + sv_catpvn(dsv, &string, 1); ok = 1; } } @@ -2222,10 +2318,13 @@ The flags argument is as in pv_uni_display(). The pointer to the PV of the dsv is returned. -=cut */ +=cut +*/ char * Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) { + PERL_ARGS_ASSERT_SV_UNI_DISPLAY; + return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv), SvCUR(ssv), pvlim, flags); } @@ -2273,16 +2372,23 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const U8 natbuf[1+1]; STRLEN foldlen1, foldlen2; bool match; + + PERL_ARGS_ASSERT_IBCMP_UTF8; if (pe1) e1 = *(U8**)pe1; + /* assert(e1 || l1); */ if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1))) f1 = (const U8*)s1 + l1; if (pe2) e2 = *(U8**)pe2; + /* assert(e2 || l2); */ if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2))) f2 = (const U8*)s2 + l2; + /* This shouldn't happen. However, putting an assert() there makes some + * tests fail. */ + /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */ if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)) return 1; /* mismatch; possible infinite loop or false positive */