X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=1b13809dcd993c68c610cb04b73108ad5639783d;hb=1aa6dd61aa6395f566dba3dd09a3a1a4396547e3;hp=8f04b8931c30765f1b13b75c9812e92e554c6802;hpb=e9101d720b80106fbcfaf9d6e6c2a2d224de977f;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 8f04b89..1b13809 100644 --- a/utf8.c +++ b/utf8.c @@ -24,6 +24,8 @@ #define PERL_IN_UTF8_C #include "perl.h" +static char unees[] = "Malformed UTF-8 character (unexpected end of string)"; + /* =head1 Unicode Support @@ -57,22 +59,19 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) if (ckWARN(WARN_UTF8)) { if (UNICODE_IS_SURROGATE(uv) && !(flags & UNICODE_ALLOW_SURROGATE)) - Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv); + Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv); else if ( ((uv >= 0xFDD0 && uv <= 0xFDEF && !(flags & UNICODE_ALLOW_FDD0)) || - ((uv & 0xFFFF) == 0xFFFE && - !(flags & UNICODE_ALLOW_FFFE)) - || - ((uv & 0xFFFF) == 0xFFFF && + ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */ !(flags & UNICODE_ALLOW_FFFF))) && /* UNICODE_ALLOW_SUPER includes * FFFEs and FFFFs beyond 0x10FFFF. */ ((uv <= PERL_UNICODE_MAX) || !(flags & UNICODE_ALLOW_SUPER)) ) - Perl_warner(aTHX_ WARN_UTF8, + Perl_warner(aTHX_ packWARN(WARN_UTF8), "Unicode character 0x%04"UVxf" is illegal", uv); } if (UNI_IS_INVARIANT(uv)) { @@ -294,9 +293,8 @@ Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) #define UTF8_WARN_SHORT 5 #define UTF8_WARN_OVERFLOW 6 #define UTF8_WARN_SURROGATE 7 -#define UTF8_WARN_BOM 8 -#define UTF8_WARN_LONG 9 -#define UTF8_WARN_FFFF 10 +#define UTF8_WARN_LONG 8 +#define UTF8_WARN_FFFF 9 /* Also FFFE. */ if (curlen == 0 && !(flags & UTF8_ALLOW_EMPTY)) { @@ -391,10 +389,6 @@ Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) !(flags & UTF8_ALLOW_SURROGATE)) { warning = UTF8_WARN_SURROGATE; goto malformed; - } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) && - !(flags & UTF8_ALLOW_BOM)) { - warning = UTF8_WARN_BOM; - goto malformed; } else if ((expectlen > UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) { warning = UTF8_WARN_LONG; @@ -450,9 +444,6 @@ malformed: case UTF8_WARN_SURROGATE: Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv); break; - case UTF8_WARN_BOM: - Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv); - break; case UTF8_WARN_LONG: Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte); @@ -469,10 +460,10 @@ malformed: char *s = SvPVX(sv); if (PL_op) - Perl_warner(aTHX_ WARN_UTF8, + Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s in %s", s, OP_DESC(PL_op)); else - Perl_warner(aTHX_ WARN_UTF8, "%s", s); + Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s); } } @@ -498,7 +489,8 @@ returned and retlen is set, if possible, to -1. UV Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen) { - return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0); + return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } /* @@ -521,7 +513,8 @@ UV Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen) { /* Call the low level routine asking for checks */ - return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0); + return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } /* @@ -543,13 +536,29 @@ Perl_utf8_length(pTHX_ U8 *s, U8 *e) * the bitops (especially ~) can create illegal UTF-8. * In other words: in Perl UTF-8 is not just for Unicode. */ - if (e < s) - Perl_croak(aTHX_ "panic: utf8_length: unexpected end"); + if (e < s) { + if (ckWARN_d(WARN_UTF8)) { + if (PL_op) + Perl_warner(aTHX_ packWARN(WARN_UTF8), + "%s in %s", unees, OP_DESC(PL_op)); + else + Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); + } + return 0; + } while (s < e) { U8 t = UTF8SKIP(s); - if (e - s < t) - Perl_croak(aTHX_ "panic: utf8_length: unaligned end"); + if (e - s < t) { + if (ckWARN_d(WARN_UTF8)) { + if (PL_op) + Perl_warner(aTHX_ packWARN(WARN_UTF8), + unees, OP_DESC(PL_op)); + else + Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); + } + return len; + } s += t; len++; } @@ -582,8 +591,16 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) while (a < b) { U8 c = UTF8SKIP(a); - if (b - a < c) - Perl_croak(aTHX_ "panic: utf8_distance: unaligned end"); + if (b - a < c) { + if (ckWARN_d(WARN_UTF8)) { + if (PL_op) + Perl_warner(aTHX_ packWARN(WARN_UTF8), + "%s in %s", unees, OP_DESC(PL_op)); + else + Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); + } + return off; + } a += c; off--; } @@ -592,8 +609,16 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) while (b < a) { U8 c = UTF8SKIP(b); - if (a - b < c) - Perl_croak(aTHX_ "panic: utf8_distance: unaligned end"); + if (a - b < c) { + if (ckWARN_d(WARN_UTF8)) { + if (PL_op) + Perl_warner(aTHX_ packWARN(WARN_UTF8), + "%s in %s", unees, OP_DESC(PL_op)); + else + Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); + } + return off; + } b += c; off++; } @@ -952,33 +977,29 @@ Perl_is_uni_xdigit(pTHX_ UV c) UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) { - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; - uvchr_to_utf8(tmpbuf, c); - return to_utf8_upper(tmpbuf, p, lenp); + uvchr_to_utf8(p, c); + return to_utf8_upper(p, p, lenp); } UV Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) { - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; - uvchr_to_utf8(tmpbuf, c); - return to_utf8_title(tmpbuf, p, lenp); + uvchr_to_utf8(p, c); + return to_utf8_title(p, p, lenp); } UV Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) { - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; - uvchr_to_utf8(tmpbuf, c); - return to_utf8_lower(tmpbuf, p, lenp); + uvchr_to_utf8(p, c); + return to_utf8_lower(p, p, lenp); } UV Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp) { - U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; - uvchr_to_utf8(tmpbuf, c); - return to_utf8_fold(tmpbuf, p, lenp); + uvchr_to_utf8(p, c); + return to_utf8_fold(p, p, lenp); } /* for now these all assume no locale info available for Unicode > 255 */ @@ -1341,15 +1362,17 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma } } else { - while (t < tend) - d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t++)); + while (t < tend) { + d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t)); + t++; + } } len = d - tmpbuf; Copy(tmpbuf, ustrp, len, U8); #else Copy(s, ustrp, len, U8); - } #endif + } } } @@ -1511,8 +1534,12 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none) Copy(pv, PL_tokenbuf, len+1, char); PL_curcop->op_private = PL_hints; } - if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) + if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) { + if (SvPOK(retval)) + Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"", + SvPV_nolen(retval)); Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref"); + } return retval; } @@ -1590,7 +1617,9 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8) /* We use utf8n_to_uvuni() as we want an index into Unicode tables, not a native character number. */ - UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0); + UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); SV *errsv_save; ENTER; SAVETMPS; @@ -1753,7 +1782,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags) case '\a': Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break; case '\\': - Perl_sv_catpvf(aTHX_ dsv, "\\" ); ok = TRUE; break; + Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break; default: break; } }