X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=e86b3fd7861170784b67b96ec2ebc90a80f6feda;hb=72e9304675ac276b2ac244c09a40ea9e7b9ea35d;hp=efcc602bb1dd5ddf019d80ef663870a8395fcb35;hpb=3f2394f296786314979c2ad3096f123eb7b22930;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index efcc602..e86b3fd7 100644 --- a/utf8.c +++ b/utf8.c @@ -1,6 +1,6 @@ /* utf8.c * - * Copyright (c) 1998-2002, Larry Wall + * Copyright (C) 2000, 2001, 2002, 2003, by Larry Wall and others * * You may distribute under the terms of either the GNU General Public * License or the Artistic License, as specified in the README file. @@ -217,10 +217,10 @@ Perl_is_utf8_char(pTHX_ U8 *s) /* =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len -Returns true if first C bytes of the given string form a valid UTF8 -string, false otherwise. Note that 'a valid UTF8 string' does not mean -'a string that contains UTF8' because a valid ASCII string is a valid -UTF8 string. +Returns true if first C bytes of the given string form a valid +UTF8 string, false otherwise. Note that 'a valid UTF8 string' does +not mean 'a string that contains code points above 0x7F encoded in +UTF8' because a valid ASCII string is a valid UTF8 string. =cut */ @@ -237,9 +237,17 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) send = s + len; while (x < send) { - c = is_utf8_char(x); - if (!c) - return FALSE; + /* Inline the easy bits of is_utf8_char() here for speed... */ + if (UTF8_IS_INVARIANT(*x)) + c = 1; + else if (!UTF8_IS_START(*x)) + return FALSE; + else { + /* ... and call is_utf8_char() only if really needed. */ + c = is_utf8_char(x); + if (!c) + return FALSE; + } x += c; } if (x != send) @@ -762,6 +770,9 @@ Converts a string C of length C from ASCII into UTF8 encoding. Returns a pointer to the newly-created string, and sets C to reflect the new length. +If you want to convert to UTF8 from other encodings than ASCII, +see sv_recode_to_utf8(). + =cut */ @@ -1224,7 +1235,7 @@ Perl_is_utf8_upper(pTHX_ U8 *p) if (!is_utf8_char(p)) return FALSE; if (!PL_utf8_upper) - PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0); + PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_upper, p, TRUE) != 0; } @@ -1234,7 +1245,7 @@ Perl_is_utf8_lower(pTHX_ U8 *p) if (!is_utf8_char(p)) return FALSE; if (!PL_utf8_lower) - PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0); + PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_lower, p, TRUE) != 0; } @@ -1348,15 +1359,6 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma HE *he; SV *val; -#if defined(UNDER_CE) -/*strange: compiler complaints that I redefine macro UVXf and points where - it was first defined. I copied line from there without any changes. - Nothing should change. - But when I do not do this, there is an error on a line with - Perl_newSVpvf(aTHX_ "%04"UVXf, uv1) -*/ -#define UVXf "lX" /**/ -#endif if ((hv = get_hv(special, FALSE)) && (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) && (he = hv_fetch_ent(hv, keysv, FALSE, 0)) && @@ -1543,6 +1545,7 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none) if (PL_curcop == &PL_compiling) { /* XXX ought to be handled by lex_start */ SAVEI32(PL_in_my); + PL_in_my = 0; sv_setpv(tokenbufsv, PL_tokenbuf); } errsv_save = newSVsv(ERRSV); @@ -1564,8 +1567,8 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none) } if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) { if (SvPOK(retval)) - Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"", - SvPV_nolen(retval)); + Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"", + retval); Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref"); } return retval;