From: Jarkko Hietaniemi Date: Fri, 5 Jan 2001 06:44:27 +0000 (+0000) Subject: UTF-8 cleanup. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=fd400ab9029cac48f6054d57b59a99ec1c5d5e1e;p=p5sagit%2Fp5-mst-13.2.git UTF-8 cleanup. p4raw-id: //depot/perl@8328 --- diff --git a/doop.c b/doop.c index f6dbe67..212d496 100644 --- a/doop.c +++ b/doop.c @@ -66,7 +66,7 @@ S_do_trans_simple(pTHX_ SV *sv) c = utf8_to_uv(s, send - s, &ulen, 0); if (c < 0x100 && (ch = tbl[(short)c]) >= 0) { matches++; - if (ch < 0x80) + if (UTF8_IS_ASCII(ch)) *d++ = ch; else d = uv_to_utf8(d,ch); @@ -254,7 +254,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ if (!isutf8) { U8 *t = s, *e = s + len; while (t < e) - if ((hibit = *t++ & 0x80)) + if ((hibit = UTF8_IS_CONTINUED(*t++))) break; if (hibit) s = bytes_to_utf8(s, &len); @@ -330,7 +330,7 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ if (!SvUTF8(sv)) { U8 *t = s, *e = s + len; while (t < e) - if ((hibit = *t++ & 0x80)) + if ((hibit = !UTF8_IS_ASCII(*t++))) break; if (hibit) start = s = bytes_to_utf8(s, &len); @@ -374,7 +374,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ if (!isutf8) { U8 *t = s, *e = s + len; while (t < e) - if ((hibit = *t++ & 0x80)) + if ((hibit = !UTF8_IS_ASCII(*t++))) break; if (hibit) s = bytes_to_utf8(s, &len); diff --git a/op.c b/op.c index 00bf204..3cbfd6d 100644 --- a/op.c +++ b/op.c @@ -112,7 +112,7 @@ Perl_pad_allocmy(pTHX_ char *name) if (!(PL_in_my == KEY_our || isALPHA(name[1]) || - (PL_hints & HINT_UTF8 && (name[1] & 0xc0) == 0xc0) || + (PL_hints & HINT_UTF8 && UTF8_IS_START(name[1])) || (name[1] == '_' && (int)strlen(name) > 2))) { if (!isPRINT(name[1]) || strchr("\t\n\r\f", name[1])) { @@ -2170,7 +2170,7 @@ Perl_localize(pTHX_ OP *o, I32 lex) else { if (ckWARN(WARN_PARENTHESIS) && PL_bufptr > PL_oldbufptr && PL_bufptr[-1] == ',') { char *s; - for (s = PL_bufptr; *s && (isALNUM(*s) || (*s & 0x80) || strchr("@$%, ",*s)); s++) ; + for (s = PL_bufptr; *s && (isALNUM(*s) || UTF8_IS_CONTINUED(*s) || strchr("@$%, ",*s)); s++) ; if (*s == ';' || *s == '=') Perl_warner(aTHX_ WARN_PARENTHESIS, "Parentheses missing around \"%s\" list", diff --git a/pp.c b/pp.c index 1ea2a07..6bb1618 100644 --- a/pp.c +++ b/pp.c @@ -2095,7 +2095,7 @@ PP(pp_negate) sv_setsv(TARG, sv); *SvPV_force(TARG, len) = *s == '-' ? '+' : '-'; } - else if (DO_UTF8(sv) && *(U8*)s >= 0xc0 && isIDFIRST_utf8((U8*)s)) { + else if (DO_UTF8(sv) && UTF8_IS_START(*s) && isIDFIRST_utf8((U8*)s)) { sv_setpvn(TARG, "-", 1); sv_catsv(TARG, sv); } @@ -2958,7 +2958,8 @@ PP(pp_chr) (void)SvUPGRADE(TARG,SVt_PV); - if ((value > 255 && !IN_BYTE) || (value & 0x80 && PL_hints & HINT_UTF8) ) { + if ((value > 255 && !IN_BYTE) || + (UTF8_IS_CONTINUED(value) && (PL_hints & HINT_UTF8)) ) { SvGROW(TARG, UTF8_MAXLEN+1); tmps = SvPVX(TARG); tmps = (char*)uv_to_utf8((U8*)tmps, (UV)value); @@ -3009,7 +3010,7 @@ PP(pp_ucfirst) register U8 *s; STRLEN slen; - if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) { + if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && UTF8_IS_START(*s)) { STRLEN ulen; U8 tmpbuf[UTF8_MAXLEN+1]; U8 *tend; @@ -3068,7 +3069,7 @@ PP(pp_lcfirst) register U8 *s; STRLEN slen; - if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) { + if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && UTF8_IS_START(*s)) { STRLEN ulen; U8 tmpbuf[UTF8_MAXLEN+1]; U8 *tend; @@ -3284,7 +3285,7 @@ PP(pp_quotemeta) d = SvPVX(TARG); if (DO_UTF8(sv)) { while (len) { - if (*s & 0x80) { + if (UTF8_IS_CONTINUED(*s)) { STRLEN ulen = UTF8SKIP(s); if (ulen > len) ulen = len; @@ -4797,7 +4798,7 @@ PP(pp_unpack) while ((len > 0) && (s < strend)) { auv = (auv << 7) | (*s & 0x7f); - if (!(*s++ & 0x80)) { + if (UTF8_IS_ASCII(*s++)) { bytes = 0; sv = NEWSV(40, 0); sv_setuv(sv, auv); diff --git a/pp_ctl.c b/pp_ctl.c index 2302ad8..70c3ea3 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -545,7 +545,7 @@ PP(pp_formline) s = item; if (item_is_utf) { while (arg--) { - if (*s & 0x80) { + if (UTF8_IS_CONTINUED(*s)) { switch (UTF8SKIP(s)) { case 7: *t++ = *s++; case 6: *t++ = *s++; diff --git a/pp_hot.c b/pp_hot.c index b36aeb8..30cc61d 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -2913,7 +2913,7 @@ S_method_common(pTHX_ SV* meth, U32* hashp) !(ob=(SV*)GvIO(iogv))) { if (!packname || - ((*(U8*)packname >= 0xc0 && DO_UTF8(sv)) + ((UTF8_IS_START(*packname) && DO_UTF8(sv)) ? !isIDFIRST_utf8((U8*)packname) : !isIDFIRST(*packname) )) diff --git a/pp_sys.c b/pp_sys.c index 8105c68..ca4d1bd 100644 --- a/pp_sys.c +++ b/pp_sys.c @@ -3278,12 +3278,12 @@ PP(pp_fttext) continue; #endif /* utf8 characters don't count as odd */ - if (*s & 0x40) { + if (UTF8_IS_START(*s)) { int ulen = UTF8SKIP(s); if (ulen < len - i) { int j; for (j = 1; j < ulen; j++) { - if ((s[j] & 0xc0) != 0x80) + if (!UTF8_IS_CONTINUATION(s[j])) goto not_utf8; } --ulen; /* loop does extra increment */ diff --git a/regcomp.c b/regcomp.c index c39405e..19d8e8e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2925,7 +2925,7 @@ tryagain: break; default: normal_default: - if ((*p & 0xc0) == 0xc0 && UTF) { + if (UTF8_IS_START(*p) && UTF) { ender = utf8_to_uv((U8*)p, RExC_end - p, &numlen, 0); p += numlen; @@ -2945,6 +2945,8 @@ tryagain: if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; + /* ender is a Unicode value so it can be > 0xff -- + * in other words, do not use UTF8_IS_CONTINUED(). */ else if (ender >= 0x80 && UTF) { reguni(pRExC_state, ender, s, &numlen); s += numlen; @@ -2956,6 +2958,8 @@ tryagain: } break; } + /* ender is a Unicode value so it can be > 0xff -- + * in other words, do not use UTF8_IS_CONTINUED(). */ if (ender >= 0x80 && UTF) { reguni(pRExC_state, ender, s, &numlen); s += numlen; diff --git a/regexec.c b/regexec.c index 0c29b78..0b65d11 100644 --- a/regexec.c +++ b/regexec.c @@ -2273,7 +2273,7 @@ S_regmatch(pTHX_ regnode *prog) if (!nextchr) sayNO; if (do_utf8) { - if (nextchr & 0x80) { + if (UTF8_IS_CONTINUED(nextchr)) { if (!(OP(scan) == SPACE ? swash_fetch(PL_utf8_space, (U8*)locinput) : isSPACE_LC_utf8((U8*)locinput))) diff --git a/sv.c b/sv.c index 139d98a..c4935d8 100644 --- a/sv.c +++ b/sv.c @@ -2945,7 +2945,7 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) e = SvEND(sv); t = s; while (t < e) { - if ((hibit = *t++ & 0x80)) + if ((hibit = UTF8_IS_CONTINUED(*t++))) break; } @@ -3037,7 +3037,7 @@ Perl_sv_utf8_decode(pTHX_ register SV *sv) return FALSE; e = SvEND(sv); while (c < e) { - if (*c++ & 0x80) { + if (UTF8_IS_CONTINUED(*c++)) { SvUTF8_on(sv); break; } diff --git a/toke.c b/toke.c index 018e235..850d913 100644 --- a/toke.c +++ b/toke.c @@ -1408,7 +1408,7 @@ S_scan_const(pTHX_ char *start) int hicount = 0; char *c; for (c = SvPVX(sv); c < d; c++) { - if (*c & 0x80) + if (UTF8_IS_CONTINUED(*c)) hicount++; } if (hicount) { @@ -1421,7 +1421,7 @@ S_scan_const(pTHX_ char *start) dst = d - 1; while (src < dst) { - if (*src & 0x80) { + if (UTF8_IS_CONTINUED(*src)) { dst--; uv_to_utf8((U8*)dst, (U8)*src--); dst--; @@ -1547,7 +1547,7 @@ S_scan_const(pTHX_ char *start) /* (now in tr/// code again) */ - if (*s & 0x80 && (this_utf8 || has_utf8)) { + if (UTF8_IS_CONTINUED(*s) && (this_utf8 || has_utf8)) { STRLEN len = (STRLEN) -1; UV uv; if (this_utf8) { @@ -3647,7 +3647,7 @@ Perl_yylex(pTHX) missingterm((char*)0); yylval.ival = OP_CONST; for (d = SvPV(PL_lex_stuff, len); len; len--, d++) { - if (*d == '$' || *d == '@' || *d == '\\' || *d & 0x80) { + if (*d == '$' || *d == '@' || *d == '\\' || UTF8_IS_CONTINUED(*d)) { yylval.ival = OP_STRINGIFY; break; } @@ -5918,9 +5918,9 @@ S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_packag *d++ = *s++; *d++ = *s++; } - else if (UTF && *(U8*)s >= 0xc0 && isALNUM_utf8((U8*)s)) { + else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { char *t = s + UTF8SKIP(s); - while (*t & 0x80 && is_utf8_mark((U8*)t)) + while (UTF8_IS_CONTINUED(*t) && is_utf8_mark((U8*)t)) t += UTF8SKIP(t); if (d + (t - s) > e) Perl_croak(aTHX_ ident_too_long); @@ -5970,9 +5970,9 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des *d++ = *s++; *d++ = *s++; } - else if (UTF && *(U8*)s >= 0xc0 && isALNUM_utf8((U8*)s)) { + else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { char *t = s + UTF8SKIP(s); - while (*t & 0x80 && is_utf8_mark((U8*)t)) + while (UTF8_IS_CONTINUED(*t) && is_utf8_mark((U8*)t)) t += UTF8SKIP(t); if (d + (t - s) > e) Perl_croak(aTHX_ ident_too_long); @@ -6025,7 +6025,7 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des e = s; while ((e < send && isALNUM_lazy_if(e,UTF)) || *e == ':') { e += UTF8SKIP(e); - while (e < send && *e & 0x80 && is_utf8_mark((U8*)e)) + while (e < send && UTF8_IS_CONTINUED(*e) && is_utf8_mark((U8*)e)) e += UTF8SKIP(e); } Copy(s, d, e - s, char); @@ -6642,7 +6642,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* after skipping whitespace, the next character is the terminator */ term = *s; - if ((term & 0x80) && UTF) + if (UTF8_IS_CONTINUED(term) && UTF) has_utf8 = TRUE; /* mark where we are */ @@ -6689,7 +6689,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) have found the terminator */ else if (*s == term) break; - else if (!has_utf8 && (*s & 0x80) && UTF) + else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF) has_utf8 = TRUE; *to = *s; } @@ -6718,7 +6718,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) break; else if (*s == PL_multi_open) brackets++; - else if (!has_utf8 && (*s & 0x80) && UTF) + else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF) has_utf8 = TRUE; *to = *s; }