X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=fda99209335c094720d4450a63c8c9eb77f30759;hb=07766739ad671051f274806a83c05be36e7ca89a;hp=66d3fec81c23c3319fbce109e92158883aa3f580;hpb=7d85a32c7dc09903975590ebedb298bcbd436874;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 66d3fec..fda9920 100644 --- a/utf8.c +++ b/utf8.c @@ -27,7 +27,7 @@ /* Unicode support */ /* -=for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv +=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv Adds the UTF8 representation of the Unicode codepoint C to the end of the string C; C should be have at least C free @@ -141,7 +141,8 @@ character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 chara The actual number of bytes in the UTF-8 character will be returned if it is valid, otherwise 0. -=cut */ +=cut +*/ STRLEN Perl_is_utf8_char(pTHX_ U8 *s) { @@ -162,14 +163,14 @@ Perl_is_utf8_char(pTHX_ U8 *s) slen = len - 1; s++; - /* The initial value is dubious */ + u &= UTF_START_MASK(len); uv = u; ouv = uv; while (slen--) { if (!UTF8_IS_CONTINUATION(*s)) return 0; uv = UTF8_ACCUMULATE(uv, *s); - if (uv < ouv) + if (uv < ouv) return 0; ouv = uv; s++; @@ -236,10 +237,11 @@ the strict UTF-8 encoding (see F). Most code should use utf8_to_uvchr() rather than call this directly. -=cut */ +=cut +*/ UV -Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) +Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) { UV uv = *s, ouv; STRLEN len = 1; @@ -439,7 +441,7 @@ malformed: } /* -=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen +=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen Returns the native character value of the first character in the string C which is assumed to be in UTF8 encoding; C will be set to the @@ -452,13 +454,13 @@ returned and retlen is set, if possible, to -1. */ UV -Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen) +Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen) { return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0); } /* -=for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen +=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen Returns the Unicode code point of the first character in the string C which is assumed to be in UTF8 encoding; C will be set to the @@ -474,14 +476,14 @@ returned and retlen is set, if possible, to -1. */ UV -Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen) +Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen) { /* Call the low level routine asking for checks */ return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0); } /* -=for apidoc A|STRLEN|utf8_length|U8* s|U8 *e +=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e Return the length of the UTF-8 char encoded string C in characters. Stops at C (inclusive). If C s> or if the scan would end @@ -491,7 +493,7 @@ up past C, croaks. */ STRLEN -Perl_utf8_length(pTHX_ U8* s, U8* e) +Perl_utf8_length(pTHX_ U8 *s, U8 *e) { STRLEN len = 0; @@ -522,7 +524,8 @@ and C. WARNING: use only if you *know* that the pointers point inside the same UTF-8 buffer. -=cut */ +=cut +*/ IV Perl_utf8_distance(pTHX_ U8 *a, U8 *b) @@ -558,7 +561,7 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) } /* -=for apidoc A|U8*|utf8_hop|U8 *s|I32 off +=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off Return the UTF-8 pointer C displaced by C characters, either forward or backward. @@ -567,7 +570,8 @@ WARNING: do not use the following unless you *know* C is within the UTF-8 data pointed to by C *and* that on entry C is aligned on the first byte of character or just after the last byte of a character. -=cut */ +=cut +*/ U8 * Perl_utf8_hop(pTHX_ U8 *s, I32 off) @@ -602,7 +606,7 @@ Returns zero on failure, setting C to -1. */ U8 * -Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) +Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) { U8 *send; U8 *d; @@ -641,10 +645,11 @@ length. Returns the original string if no conversion occurs, C is unchanged. Do nothing if C points to 0. Sets C to 0 if C is converted or contains all 7bit characters. -=cut */ +=cut +*/ U8 * -Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8) +Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8) { U8 *d; U8 *start = s; @@ -695,7 +700,7 @@ reflect the new length. */ U8* -Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len) +Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len) { U8 *send; U8 *d; @@ -1342,7 +1347,8 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) PUSHMARK(SP); EXTEND(SP,3); PUSHs((SV*)sv); - PUSHs(sv_2mortal(newSViv(code_point & ~(needents - 1)))); + /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */ + PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0))); PUSHs(sv_2mortal(newSViv(needents))); PUTBACK; if (call_method("SWASHGET", G_SCALAR)) @@ -1389,7 +1395,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) /* -=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv +=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv Adds the UTF8 representation of the Native codepoint C to the end of the string C; C should be have at least C free @@ -1417,7 +1423,7 @@ Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) /* -=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags +=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags Returns the native character value of the first character in the string C which is assumed to be in UTF8 encoding; C will be set to the @@ -1432,7 +1438,7 @@ Allows length and flags to be passed to low level routine. */ #undef Perl_utf8n_to_uvchr UV -Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) +Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) { UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); return UNI_TO_NATIVE(uv);