From: Jarkko Hietaniemi Date: Thu, 31 May 2001 00:16:44 +0000 (+0000) Subject: Salvage bits and pieces from the experimental 'utf8 everywhere' X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=0064a8a9866779dceb087452b9bfaa733c51adce;p=p5sagit%2Fp5-mst-13.2.git Salvage bits and pieces from the experimental 'utf8 everywhere' patch: rename HINT_BYTE and IN_BYTE to HINT_BYTES and IN_BYTES to match the pragma name; various robustness cleanups. p4raw-id: //depot/perl@10339 --- diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl index 2327d3d..a843737 100644 --- a/lib/utf8_heavy.pl +++ b/lib/utf8_heavy.pl @@ -36,7 +36,9 @@ sub SWASHNEW { } { - $list ||= ($caller ne 'main' && eval { $caller->$type(); }) + $list ||= + ( exists &{"${caller}::${type}"} && + eval { $caller->$type() } ) || do "$file.pl" || do "$encoding/$file.pl" || do "$encoding/Is/${type}.pl" diff --git a/mg.c b/mg.c index 4b6d25d..b9a027e 100644 --- a/mg.c +++ b/mg.c @@ -375,9 +375,11 @@ Perl_magic_regdatum_get(pTHX_ SV *sv, MAGIC *mg) if (i > 0 && DO_UTF8(PL_reg_sv)) { char *b = rx->subbeg; - i = Perl_utf8_length(aTHX_ (U8*)b, (U8*)(b+i)); + if (b) + i = Perl_utf8_length(aTHX_ (U8*)b, (U8*)(b+i)); } - sv_setiv(sv,i); + + sv_setiv(sv, i); } } return 0; diff --git a/perl.h b/perl.h index 9283e8e..7286921 100644 --- a/perl.h +++ b/perl.h @@ -2891,7 +2891,7 @@ enum { /* pass one of these to get_vtbl */ #define HINT_INTEGER 0x00000001 #define HINT_STRICT_REFS 0x00000002 /* #define HINT_notused4 0x00000004 */ -#define HINT_BYTE 0x00000008 +#define HINT_BYTES 0x00000008 /* #define HINT_notused10 0x00000010 */ /* Note: 20,40,80 used for NATIVE_HINTS */ diff --git a/pp.c b/pp.c index 4bb8d3e..6328a6c 100644 --- a/pp.c +++ b/pp.c @@ -3001,7 +3001,7 @@ PP(pp_chr) (void)SvUPGRADE(TARG,SVt_PV); - if (value > 255 && !IN_BYTE) { + if (value > 255 && !IN_BYTES) { SvGROW(TARG, UNISKIP(value)+1); tmps = (char*)uvchr_to_utf8((U8*)SvPVX(TARG), value); SvCUR_set(TARG, tmps - SvPVX(TARG)); diff --git a/pp_ctl.c b/pp_ctl.c index 736f84d..032be2e 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -323,7 +323,7 @@ PP(pp_formline) NV value; bool gotsome = FALSE; STRLEN len; - STRLEN fudge = SvCUR(tmpForm) * (IN_BYTE ? 1 : 3) + 1; + STRLEN fudge = SvCUR(tmpForm) * (IN_BYTES ? 1 : 3) + 1; bool item_is_utf = FALSE; if (!SvMAGICAL(tmpForm) || !SvCOMPILED(tmpForm)) { diff --git a/pp_sys.c b/pp_sys.c index 062cee0..e64fbef 100644 --- a/pp_sys.c +++ b/pp_sys.c @@ -1535,7 +1535,7 @@ PP(pp_sysread) io = GvIO(gv); if (!io || !IoIFP(io)) goto say_undef; - if ((fp_utf8 = PerlIO_isutf8(IoIFP(io))) && !IN_BYTE) { + if ((fp_utf8 = PerlIO_isutf8(IoIFP(io))) && !IN_BYTES) { buffer = SvPVutf8_force(bufsv, blen); /* UTF8 may not have been set if they are all low bytes */ SvUTF8_on(bufsv); @@ -1663,7 +1663,7 @@ PP(pp_sysread) SvCUR_set(bufsv, count+(buffer - SvPVX(bufsv))); *SvEND(bufsv) = '\0'; (void)SvPOK_only(bufsv); - if (fp_utf8 && !IN_BYTE) { + if (fp_utf8 && !IN_BYTES) { /* Look at utf8 we got back and count the characters */ char *bend = buffer + count; while (buffer < bend) { diff --git a/regexec.c b/regexec.c index b8659f2..c795a5b 100644 --- a/regexec.c +++ b/regexec.c @@ -966,7 +966,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta else { U8 *r = reghop3((U8*)s, -1, (U8*)startpos); - tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0); + if (s > (char*)r) + tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0); } tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); @@ -1009,7 +1010,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta else { U8 *r = reghop3((U8*)s, -1, (U8*)startpos); - tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0); + if (s > (char*)r) + tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0); } tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); diff --git a/sv.c b/sv.c index 2279ea0..6b933b1 100644 --- a/sv.c +++ b/sv.c @@ -3067,7 +3067,7 @@ Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok) if (fail_ok) return FALSE; #ifdef USE_BYTES_DOWNGRADES - else if (IN_BYTE) { + else if (IN_BYTES) { U8 *d = s; U8 *e = (U8 *) SvEND(sv); int first = 1; @@ -4893,7 +4893,7 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2) pv2 = SvPV(sv2, cur2); /* do not utf8ize the comparands as a side-effect */ - if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) { + if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTES) { bool is_utf8 = TRUE; /* UTF-8ness differs */ if (PL_hints & HINT_UTF8_DISTINCT) @@ -4960,7 +4960,7 @@ Perl_sv_cmp(pTHX_ register SV *sv1, register SV *sv2) pv2 = SvPV(sv2, cur2); /* do not utf8ize the comparands as a side-effect */ - if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) { + if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTES) { if (PL_hints & HINT_UTF8_DISTINCT) return SvUTF8(sv1) ? 1 : -1; @@ -7265,7 +7265,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV uv = args ? va_arg(*args, int) : SvIVx(argsv); if ((uv > 255 || (!UNI_IS_INVARIANT(uv) && SvUTF8(sv))) - && !IN_BYTE) { + && !IN_BYTES) { eptr = (char*)utf8buf; elen = uvchr_to_utf8((U8*)eptr, uv) - utf8buf; is_utf = TRUE; diff --git a/toke.c b/toke.c index 54fcd16..851b759 100644 --- a/toke.c +++ b/toke.c @@ -3867,7 +3867,7 @@ Perl_yylex(pTHX) CLINE; yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpv(PL_tokenbuf,0)); yylval.opval->op_private = OPpCONST_BARE; - if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len)) + if (UTF && !IN_BYTES && is_utf8_string((U8*)PL_tokenbuf, len)) SvUTF8_on(((SVOP*)yylval.opval)->op_sv); TERM(WORD); } @@ -4028,7 +4028,7 @@ Perl_yylex(pTHX) if (*s == '=' && s[1] == '>') { CLINE; sv_setpv(((SVOP*)yylval.opval)->op_sv, PL_tokenbuf); - if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len)) + if (UTF && !IN_BYTES && is_utf8_string((U8*)PL_tokenbuf, len)) SvUTF8_on(((SVOP*)yylval.opval)->op_sv); TERM(WORD); } @@ -4204,7 +4204,7 @@ Perl_yylex(pTHX) } #endif #ifdef PERLIO_LAYERS - if (UTF && !IN_BYTE) + if (UTF && !IN_BYTES) PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, ":utf8"); #endif PL_rsfp = Nullfp; @@ -6532,7 +6532,7 @@ retval: Renew(SvPVX(tmpstr), SvLEN(tmpstr), char); } SvREFCNT_dec(herewas); - if (UTF && !IN_BYTE && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr))) + if (UTF && !IN_BYTES && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr))) SvUTF8_on(tmpstr); PL_lex_stuff = tmpstr; yylval.ival = op_type; diff --git a/utf8.c b/utf8.c index f692164..1c1a5d4 100644 --- a/utf8.c +++ b/utf8.c @@ -507,7 +507,7 @@ Perl_utf8_length(pTHX_ U8 *s, U8 *e) U8 t = UTF8SKIP(s); if (e - s < t) - Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t); + Perl_croak(aTHX_ "panic: utf8_length: unaligned end"); s += t; len++; } diff --git a/utf8.h b/utf8.h index 8898cf1..baee5ae 100644 --- a/utf8.h +++ b/utf8.h @@ -111,10 +111,10 @@ END_EXTERN_C * (that is, the two high bits are set). Otherwise we risk loading in the * heavy-duty SWASHINIT and SWASHGET routines unnecessarily. */ -#define isIDFIRST_lazy_if(p,c) ((IN_BYTE || (!c || (*((U8*)p) < 0xc0))) \ +#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || (*((U8*)p) < 0xc0))) \ ? isIDFIRST(*(p)) \ : isIDFIRST_utf8((U8*)p)) -#define isALNUM_lazy_if(p,c) ((IN_BYTE || (!c || (*((U8*)p) < 0xc0))) \ +#define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || (*((U8*)p) < 0xc0))) \ ? isALNUM(*(p)) \ : isALNUM_utf8((U8*)p)) @@ -129,8 +129,8 @@ END_EXTERN_C #define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */ /* #define IN_UTF8 (PL_curcop->op_private & HINT_UTF8) */ -#define IN_BYTE (PL_curcop->op_private & HINT_BYTE) -#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE) +#define IN_BYTES (PL_curcop->op_private & HINT_BYTES) +#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES) #define UTF8_ALLOW_EMPTY 0x0001 #define UTF8_ALLOW_CONTINUATION 0x0002 diff --git a/utfebcdic.h b/utfebcdic.h index 0dd73d2..2c56006 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -234,10 +234,10 @@ END_EXTERN_C * unnecessarily. */ -#define isIDFIRST_lazy_if(p,c) ((IN_BYTE || (!c || UTF8_IS_INVARIANT(*p))) \ +#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || UTF8_IS_INVARIANT(*p))) \ ? isIDFIRST(*(p)) \ : isIDFIRST_utf8((U8*)p)) -#define isALNUM_lazy_if(p,c) ((IN_BYTE || (!c || UTF8_IS_INVARIANT(*p))) \ +#define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || UTF8_IS_INVARIANT(*p))) \ ? isALNUM(*(p)) \ : isALNUM_utf8((U8*)p))