From: Jarkko Hietaniemi Date: Thu, 14 Sep 2000 14:40:40 +0000 (+0000) Subject: Batch of UTF-8 patches from Simon Cozens. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=67e989fb549091286d76fd8d29f1ec03b9da175d;p=p5sagit%2Fp5-mst-13.2.git Batch of UTF-8 patches from Simon Cozens. p4raw-id: //depot/perl@7075 --- diff --git a/doop.c b/doop.c index 77c7324..80cc0f6 100644 --- a/doop.c +++ b/doop.c @@ -77,7 +77,7 @@ S_do_trans_simple(pTHX_ SV *sv) ulen = 1; /* Need to check this, otherwise 128..255 won't match */ - c = utf8_to_uv(s, &ulen); + c = utf8_to_uv(s, &ulen, 0); if (c < 0x100 && (ch = tbl[(short)c]) >= 0) { matches++; if (ch < 0x80) @@ -125,7 +125,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ I32 ulen; ulen = 1; if (hasutf) - c = utf8_to_uv(s,&ulen); + c = utf8_to_uv(s,&ulen, 0); else c = *s; if (c < 0x100 && tbl[c] >= 0) @@ -364,7 +364,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } else if (uv == none) { /* "none" is unmapped character */ I32 ulen; - *d++ = (U8)utf8_to_uv(s, &ulen); + *d++ = (U8)utf8_to_uv(s, &ulen, 0); s += ulen; puv = 0xfeedface; continue; @@ -405,7 +405,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } else if (uv == none) { /* "none" is unmapped character */ I32 ulen; - *d++ = (U8)utf8_to_uv(s, &ulen); + *d++ = (U8)utf8_to_uv(s, &ulen, 0); s += ulen; continue; } @@ -969,10 +969,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) switch (optype) { case OP_BIT_AND: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, &ulen); + luc = utf8_to_uv((U8*)lc, &ulen, 0); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, &ulen); + ruc = utf8_to_uv((U8*)rc, &ulen, 0); rc += ulen; rulen -= ulen; duc = luc & ruc; @@ -984,10 +984,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) break; case OP_BIT_XOR: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, &ulen); + luc = utf8_to_uv((U8*)lc, &ulen, 0); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, &ulen); + ruc = utf8_to_uv((U8*)rc, &ulen, 0); rc += ulen; rulen -= ulen; duc = luc ^ ruc; @@ -996,10 +996,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) goto mop_up_utf; case OP_BIT_OR: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, &ulen); + luc = utf8_to_uv((U8*)lc, &ulen, 0); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, &ulen); + ruc = utf8_to_uv((U8*)rc, &ulen, 0); rc += ulen; rulen -= ulen; duc = luc | ruc; diff --git a/embed.h b/embed.h index 9bd72ad..7e030a9 100644 --- a/embed.h +++ b/embed.h @@ -2186,7 +2186,7 @@ #define utf8_hop(a,b) Perl_utf8_hop(aTHX_ a,b) #define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b) #define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) -#define utf8_to_uv(a,b) Perl_utf8_to_uv(aTHX_ a,b) +#define utf8_to_uv(a,b,c) Perl_utf8_to_uv(aTHX_ a,b,c) #define uv_to_utf8(a,b) Perl_uv_to_utf8(aTHX_ a,b) #define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a) #define vivify_ref(a,b) Perl_vivify_ref(aTHX_ a,b) diff --git a/embed.pl b/embed.pl index c8e83f8..23214a3 100755 --- a/embed.pl +++ b/embed.pl @@ -2074,7 +2074,7 @@ Ap |I32 |utf8_distance |U8 *a|U8 *b Ap |U8* |utf8_hop |U8 *s|I32 off ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len -Ap |UV |utf8_to_uv |U8 *s|I32* retlen +Ap |UV |utf8_to_uv |U8 *s|I32* retlen|bool checking Ap |U8* |uv_to_utf8 |U8 *d|UV uv p |void |vivify_defelem |SV* sv p |void |vivify_ref |SV* sv|U32 to_what diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index cc0a86a..5f4a77e 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -2,29 +2,104 @@ #include "perl.h" #include "XSUB.h" -MODULE = Encode PACKAGE = Encode +#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \ + Perl_croak("panic_unimplemented"); \ + } +UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) +UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) + +void call_failure (SV *routine, U8* done, U8* dest, U8* orig); + +MODULE = Encode PACKAGE = Encode PROTOTYPES: ENABLE -SV * +I32 _bytes_to_utf8(sv, ...) - SV * sv + SV * sv CODE: - { - SV * encoding = 2 ? ST(1) : Nullsv; - RETVAL = &PL_sv_undef; - } + { + SV * encoding = items == 2 ? ST(1) : Nullsv; + + if (encoding) + RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); + else { + STRLEN len; + U8* s = SvPV(sv, len); + U8* converted; + + converted = bytes_to_utf8(s, &len); /* This allocs */ + sv_setpvn(sv, converted, len); + SvUTF8_on(sv); /* XXX Should we? */ + Safefree(converted); /* ... so free it */ + RETVAL = len; + } + } OUTPUT: - RETVAL + RETVAL -SV * +I32 _utf8_to_bytes(sv, ...) - SV * sv + SV * sv CODE: - { - SV * to = items > 1 ? ST(1) : Nullsv; - SV * check = items > 2 ? ST(2) : Nullsv; - RETVAL = &PL_sv_undef; + { + SV * to = items > 1 ? ST(1) : Nullsv; + SV * check = items > 2 ? ST(2) : Nullsv; + + if (to) + RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); + else { + U8 *s; + STRLEN len; + s = SvPV(sv, len); + + if (SvTRUE(check)) { + /* Must do things the slow way */ + U8 *dest; + U8 *src = savepv(s); /* We need a copy to pass to check() */ + U8 *send = s + len; + + New(83, dest, len, U8); /* I think */ + + while (s < send) { + if (*s < 0x80) + *dest++ = *s++; + else { + I32 ulen; + I32 byte; + I32 uv = *s++; + + /* Have to do it all ourselves because of error routine, + aargh. */ + if (!(uv & 0x40)) + goto failure; + if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } + else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } + else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } + else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } + else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } + else if (!(uv & 0x01)) { ulen = 7; uv = 0; } + else { ulen = 13; uv = 0; } + + /* Note change to utf8.c variable naming, for variety */ + while (ulen--) { + if ((*s & 0xc0) != 0x80) + goto failure; + + else + uv = (uv << 6) | (*s++ & 0x3f); + } + if (uv > 256) { + failure: + call_failure(check, s, dest, src); + /* Now what happens? */ + } + *dest++ = (U8)uv; + } + } + } else + RETVAL = (utf8_to_bytes(s, &len) ? len : 0); + } } OUTPUT: RETVAL diff --git a/handy.h b/handy.h index d82b1c6..c240c42 100644 --- a/handy.h +++ b/handy.h @@ -448,21 +448,21 @@ Converts the specified character to lowercase. #define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f') #define isBLANK_utf8(c) isBLANK(c) /* could be wrong */ -#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv(p, 0)) -#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv(p, 0)) -#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv(p, 0)) -#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv(p, 0)) -#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv(p, 0)) -#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv(p, 0)) -#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv(p, 0)) -#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv(p, 0)) -#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv(p, 0)) -#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv(p, 0)) -#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv(p, 0)) -#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv(p, 0)) -#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv(p, 0)) -#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv(p, 0)) -#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv(p, 0)) +#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv(p, 0, 0)) +#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv(p, 0, 0)) +#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv(p, 0, 0)) +#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv(p, 0, 0)) +#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv(p, 0, 0)) +#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv(p, 0, 0)) +#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv(p, 0, 0)) +#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv(p, 0, 0)) +#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv(p, 0, 0)) +#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv(p, 0, 0)) +#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv(p, 0, 0)) +#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv(p, 0, 0)) +#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv(p, 0, 0)) +#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv(p, 0, 0)) +#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv(p, 0, 0)) #define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f') #define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */ diff --git a/op.c b/op.c index 74d67e3..d24396a 100644 --- a/op.c +++ b/op.c @@ -2656,7 +2656,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) qsort(cp, i, sizeof(U8*), utf8compare); for (j = 0; j < i; j++) { U8 *s = cp[j]; - UV val = utf8_to_uv(s, &ulen); + UV val = utf8_to_uv(s, &ulen, 0); s += ulen; diff = val - nextmin; if (diff > 0) { @@ -2669,7 +2669,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) } } if (*s == 0xff) - val = utf8_to_uv(s+1, &ulen); + val = utf8_to_uv(s+1, &ulen, 0); if (val >= nextmin) nextmin = val + 1; } @@ -2696,10 +2696,10 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) while (t < tend || tfirst <= tlast) { /* see if we need more "t" chars */ if (tfirst > tlast) { - tfirst = (I32)utf8_to_uv(t, &ulen); + tfirst = (I32)utf8_to_uv(t, &ulen, 0); t += ulen; if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */ - tlast = (I32)utf8_to_uv(++t, &ulen); + tlast = (I32)utf8_to_uv(++t, &ulen, 0); t += ulen; } else @@ -2709,10 +2709,10 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) /* now see if we need more "r" chars */ if (rfirst > rlast) { if (r < rend) { - rfirst = (I32)utf8_to_uv(r, &ulen); + rfirst = (I32)utf8_to_uv(r, &ulen, 0); r += ulen; if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */ - rlast = (I32)utf8_to_uv(++r, &ulen); + rlast = (I32)utf8_to_uv(++r, &ulen, 0); r += ulen; } else diff --git a/pod/perlapi.pod b/pod/perlapi.pod index b1feed3..ca2ba7c 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -3182,10 +3182,18 @@ Found in file handy.h =item U8 *s -Returns true if first C bytes of the given string form valid a UTF8 -string, false otherwise. +Returns the character value of the first character in the string C +which is assumed to be in UTF8 encoding; C will be set to the +length, in bytes, of that character, and the pointer C will be +advanced to the end of the character. - is_utf8_string U8 *s(STRLEN len) +If C does not point to a well-formed UTF8 character, the behaviour +is dependent on the value of C: if this is true, it is +assumed that the caller will raise a warning, and this function will +set C to C<-1> and return. If C is not true, an optional UTF8 +warning is produced. + + utf8_to_uv U8 *s(I32 *retlen, I32 checking) =for hackers Found in file utf8.c @@ -3195,7 +3203,7 @@ Found in file utf8.c Converts a string C of length C from UTF8 into byte encoding. Unlike C, this over-writes the original string, and updates len to contain the new length. -Returns zero on failure leaving the string and len unchanged +Returns zero on failure, setting C to -1. U8 * utf8_to_bytes(U8 *s, STRLEN *len) diff --git a/pp.c b/pp.c index d5d5dd8..1c5a963 100644 --- a/pp.c +++ b/pp.c @@ -2195,7 +2195,7 @@ PP(pp_ord) I32 retlen; if ((*tmps & 0x80) && DO_UTF8(tmpsv)) - value = utf8_to_uv(tmps, &retlen); + value = utf8_to_uv(tmps, &retlen, 0); else value = (UV)(*tmps & 255); XPUSHu(value); @@ -2262,7 +2262,7 @@ PP(pp_ucfirst) I32 ulen; U8 tmpbuf[UTF8_MAXLEN]; U8 *tend; - UV uv = utf8_to_uv(s, &ulen); + UV uv = utf8_to_uv(s, &ulen, 0); if (PL_op->op_private & OPpLOCALE) { TAINT; @@ -2321,7 +2321,7 @@ PP(pp_lcfirst) I32 ulen; U8 tmpbuf[UTF8_MAXLEN]; U8 *tend; - UV uv = utf8_to_uv(s, &ulen); + UV uv = utf8_to_uv(s, &ulen, 0); if (PL_op->op_private & OPpLOCALE) { TAINT; @@ -2398,7 +2398,7 @@ PP(pp_uc) TAINT; SvTAINTED_on(TARG); while (s < send) { - d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, &ulen))); + d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, &ulen, 0))); s += ulen; } } @@ -2472,7 +2472,7 @@ PP(pp_lc) TAINT; SvTAINTED_on(TARG); while (s < send) { - d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, &ulen))); + d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, &ulen, 0))); s += ulen; } } @@ -3614,7 +3614,7 @@ PP(pp_unpack) len = strend - s; if (checksum) { while (len-- > 0 && s < strend) { - auint = utf8_to_uv((U8*)s, &along); + auint = utf8_to_uv((U8*)s, &along, 0); s += along; if (checksum > 32) cdouble += (NV)auint; @@ -3626,7 +3626,7 @@ PP(pp_unpack) EXTEND(SP, len); EXTEND_MORTAL(len); while (len-- > 0 && s < strend) { - auint = utf8_to_uv((U8*)s, &along); + auint = utf8_to_uv((U8*)s, &along, 0); s += along; sv = NEWSV(37, 0); sv_setuv(sv, (UV)auint); diff --git a/pp_ctl.c b/pp_ctl.c index 8981bb8..3cc74e5 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -2959,13 +2959,13 @@ PP(pp_require) U8 *s = (U8*)SvPVX(sv); U8 *end = (U8*)SvPVX(sv) + SvCUR(sv); if (s < end) { - rev = utf8_to_uv(s, &len); + rev = utf8_to_uv(s, &len, 0); s += len; if (s < end) { - ver = utf8_to_uv(s, &len); + ver = utf8_to_uv(s, &len, 0); s += len; if (s < end) - sver = utf8_to_uv(s, &len); + sver = utf8_to_uv(s, &len, 0); } } if (PERL_REVISION < rev diff --git a/proto.h b/proto.h index ed08b20..6a0229a 100644 --- a/proto.h +++ b/proto.h @@ -817,7 +817,7 @@ PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b); PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off); PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len); PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len); -PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen); +PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen, bool checking); PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv); PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv); PERL_CALLCONV void Perl_vivify_ref(pTHX_ SV* sv, U32 to_what); diff --git a/regcomp.c b/regcomp.c index 723cbbe..c60ab84 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2881,7 +2881,7 @@ tryagain: default: normal_default: if ((*p & 0xc0) == 0xc0 && UTF) { - ender = utf8_to_uv((U8*)p, &numlen); + ender = utf8_to_uv((U8*)p, &numlen, 0); p += numlen; } else @@ -3635,12 +3635,12 @@ S_regclassutf8(pTHX) namedclass = OOB_NAMEDCLASS; if (!range) rangebegin = PL_regcomp_parse; - value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen); + value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen, 0); PL_regcomp_parse += numlen; if (value == '[') namedclass = regpposixcc(value); else if (value == '\\') { - value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, &numlen); + value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, &numlen, 0); PL_regcomp_parse += numlen; /* Some compilers cannot handle switching on 64-bit integer * values, therefore value cannot be an UV. Yes, this will diff --git a/regexec.c b/regexec.c index 6401710..990791b 100644 --- a/regexec.c +++ b/regexec.c @@ -914,7 +914,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUNDUTF8: - tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n'; + tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0, 0) : '\n'; tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); while (s < strend) { if (tmp == !(OP(c) == BOUNDUTF8 ? @@ -950,7 +950,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case NBOUNDUTF8: - tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n'; + tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0, 0) : '\n'; tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); while (s < strend) { if (tmp == !(OP(c) == NBOUNDUTF8 ? @@ -1995,7 +1995,7 @@ S_regmatch(pTHX_ regnode *prog) while (s < e) { if (l >= PL_regeol) sayNO; - if (utf8_to_uv((U8*)s, 0) != (c1 ? + if (utf8_to_uv((U8*)s, 0, 0) != (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l))) { @@ -2133,7 +2133,7 @@ S_regmatch(pTHX_ regnode *prog) case NBOUNDUTF8: /* was last char in word? */ ln = (locinput != PL_regbol) - ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev; + ? utf8_to_uv(reghop((U8*)locinput, -1), 0, 0) : PL_regprev; if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) { ln = isALNUM_uni(ln); n = swash_fetch(PL_utf8_alnum, (U8*)locinput); diff --git a/sv.c b/sv.c index 4da49cc..2c45cae 100644 --- a/sv.c +++ b/sv.c @@ -2398,6 +2398,7 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) { int hicount; char *c; + char *s; if (!sv || !SvPOK(sv) || SvUTF8(sv)) return; @@ -2406,30 +2407,16 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) * to signal if there are any hibit chars in the string */ hicount = 0; - for (c = SvPVX(sv); c < SvEND(sv); c++) { + for (c = s = SvPVX(sv); c < SvEND(sv); c++) { if (*c & 0x80) hicount++; } if (hicount) { - char *src, *dst; - SvGROW(sv, SvCUR(sv) + hicount + 1); - - src = SvEND(sv) - 1; - SvCUR_set(sv, SvCUR(sv) + hicount); - dst = SvEND(sv) - 1; - - while (src < dst) { - if (*src & 0x80) { - dst--; - uv_to_utf8((U8*)dst, (U8)*src--); - dst--; - } - else { - *dst-- = *src--; - } - } - + STRLEN len = SvCUR(sv) + 1; /* Plus the \0 */ + SvPVX(sv) = bytes_to_utf8(s, &len); + SvCUR(sv) = len - 1; + Safefree(s); /* No longer using what was there before */ SvUTF8_on(sv); } } @@ -2450,46 +2437,14 @@ Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok) { if (SvPOK(sv) && SvUTF8(sv)) { char *c = SvPVX(sv); - char *first_hi = 0; - /* need to figure out if this is possible at all first */ - while (c < SvEND(sv)) { - if (*c & 0x80) { - I32 len; - UV uv = utf8_to_uv((U8*)c, &len); - if (uv >= 256) { - if (fail_ok) - return FALSE; - else { - /* XXX might want to make a callback here instead */ - Perl_croak(aTHX_ "Big byte"); - } - } - if (!first_hi) - first_hi = c; - c += len; - } - else { - c++; - } - } - - if (first_hi) { - char *src = first_hi; - char *dst = first_hi; - while (src < SvEND(sv)) { - if (*src & 0x80) { - I32 len; - U8 u = (U8)utf8_to_uv((U8*)src, &len); - *dst++ = u; - src += len; - } - else { - *dst++ = *src++; - } - } - SvCUR_set(sv, dst - SvPVX(sv)); - } - SvUTF8_off(sv); + STRLEN len = SvCUR(sv); + if (!utf8_to_bytes(c, &len)) { + if (fail_ok) + return FALSE; + else + Perl_croak("big byte"); + } + SvCUR(sv) = len - 1; } return TRUE; } @@ -2523,24 +2478,15 @@ Perl_sv_utf8_decode(pTHX_ register SV *sv) * we want to make sure everything inside is valid utf8 first. */ c = SvPVX(sv); + if (!is_utf8_string(c,SvCUR(c)+1)) + return FALSE; + while (c < SvEND(sv)) { - if (*c & 0x80) { - I32 len; - (void)utf8_to_uv((U8*)c, &len); - if (len == 1) { - /* bad utf8 */ - return FALSE; - } - c += len; - has_utf = TRUE; - } - else { - c++; - } + if (*c++ & 0x80) { + SvUTF8_on(sv); + break; + } } - - if (has_utf) - SvUTF8_on(sv); } return TRUE; } @@ -6373,7 +6319,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV break; } if (utf) - iv = (IV)utf8_to_uv(vecstr, &ulen); + iv = (IV)utf8_to_uv(vecstr, &ulen, 0); else { iv = *vecstr; ulen = 1; @@ -6455,7 +6401,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV break; } if (utf) - uv = utf8_to_uv(vecstr, &ulen); + uv = utf8_to_uv(vecstr, &ulen, 0); else { uv = *vecstr; ulen = 1; diff --git a/toke.c b/toke.c index e5f737b..31f5f0a 100644 --- a/toke.c +++ b/toke.c @@ -812,7 +812,7 @@ Perl_str_to_version(pTHX_ SV *sv) I32 skip; UV n; if (utf) - n = utf8_to_uv((U8*)start, &skip); + n = utf8_to_uv((U8*)start, &skip, 0); else { n = *(U8*)start; skip = 1; @@ -1323,7 +1323,7 @@ S_scan_const(pTHX_ char *start) /* (now in tr/// code again) */ if (*s & 0x80 && thisutf) { - (void)utf8_to_uv((U8*)s, &len); + (void)utf8_to_uv((U8*)s, &len, 0); if (len == 1) { /* illegal UTF8, make it valid */ char *old_pvx = SvPVX(sv); diff --git a/utf8.c b/utf8.c index a9600e9..d97a8b0 100644 --- a/utf8.c +++ b/utf8.c @@ -158,8 +158,25 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) return 1; } +/* +=for apidoc Am|utf8_to_uv|U8 *s|I32 *retlen|I32 checking + +Returns the character value of the first character in the string C +which is assumed to be in UTF8 encoding; C will be set to the +length, in bytes, of that character, and the pointer C will be +advanced to the end of the character. + +If C does not point to a well-formed UTF8 character, the behaviour +is dependent on the value of C: if this is true, it is +assumed that the caller will raise a warning, and this function will +set C to C<-1> and return. If C is not true, an optional UTF8 +warning is produced. + +=cut +*/ + UV -Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen) +Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen, bool checking) { UV uv = *s; int len; @@ -170,6 +187,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen) } if (!(uv & 0x40)) { dTHR; + if (checking && retlen) { + *retlen = -1; + return 0; + } + if (ckWARN_d(WARN_UTF8)) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character"); if (retlen) @@ -192,6 +214,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen) while (len--) { if ((*s & 0xc0) != 0x80) { dTHR; + if (checking && retlen) { + *retlen = -1; + return 0; + } + if (ckWARN_d(WARN_UTF8)) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character"); if (retlen) @@ -253,7 +280,7 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off) Converts a string C of length C from UTF8 into byte encoding. Unlike C, this over-writes the original string, and updates len to contain the new length. -Returns zero on failure leaving the string and len unchanged +Returns zero on failure, setting C to -1. =cut */ @@ -273,8 +300,10 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) while (s < send) { U8 c = *s++; if (c >= 0x80 && - ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) + ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) { + *len = -1; return 0; + } } s = save; while (s < send) { @@ -282,7 +311,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) *d++ = *s++; else { I32 ulen; - *d++ = (U8)utf8_to_uv(s, &ulen); + *d++ = (U8)utf8_to_uv(s, &ulen, 0); s += ulen; } } @@ -810,7 +839,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p) if (!PL_utf8_toupper) PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_toupper, p); - return uv ? uv : utf8_to_uv(p,0); + return uv ? uv : utf8_to_uv(p,0,0); } UV @@ -821,7 +850,7 @@ Perl_to_utf8_title(pTHX_ U8 *p) if (!PL_utf8_totitle) PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_totitle, p); - return uv ? uv : utf8_to_uv(p,0); + return uv ? uv : utf8_to_uv(p,0,0); } UV @@ -832,7 +861,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p) if (!PL_utf8_tolower) PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_tolower, p); - return uv ? uv : utf8_to_uv(p,0); + return uv ? uv : utf8_to_uv(p,0,0); } /* a "swash" is a swatch hash */ @@ -922,7 +951,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) PUSHMARK(SP); EXTEND(SP,3); PUSHs((SV*)sv); - PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, 0) & ~(needents - 1)))); + PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, 0, 0) & ~(needents - 1)))); PUSHs(sv_2mortal(newSViv(needents))); PUTBACK; if (call_method("SWASHGET", G_SCALAR))