X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=25cd0fdc4184135aa4e324e2ff8753a337202c27;hb=7d3fb23018f73b213481a8b6b108e1dc03cefcff;hp=7302bb77e2256f72ca6d2d5b71c0bd06c009ce88;hpb=c7f1f0165ac822994a67426c2d8003c3974e49ef;p=p5sagit%2Fp5-mst-13.2.git

diff --git a/utf8.c b/utf8.c
index 7302bb7..25cd0fd 100644
--- a/utf8.c
+++ b/utf8.c
@@ -46,10 +46,22 @@ is the recommended Unicode-aware way of saying
 U8 *
 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 {
-    if (uv < 0x80) {
-	*d++ = uv;
+    if (UNI_IS_INVARIANT(uv)) {
+	*d++ = UTF_TO_NATIVE(uv);
 	return d;
     }
+#if defined(EBCDIC) || 1 /* always for testing */
+    else {
+	STRLEN len  = UNISKIP(uv);
+	U8 *p = d+len-1;
+	while (p > d) {
+	    *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
+	    uv >>= UTF_ACCUMULATION_SHIFT;
+	}
+	*p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
+	return d+len;
+    }
+#else /* Non loop style */
     if (uv < 0x800) {
 	*d++ = (( uv >>  6)         | 0xc0);
 	*d++ = (( uv        & 0x3f) | 0x80);
@@ -116,39 +128,16 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 	return d;
     }
 #endif
+#endif /* Loop style */
 }
 
-/*
-=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
-
-Adds the UTF8 representation of the Native codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
-
-    d = uvchr_to_utf8(d, uv);
-
-is the recommended wide native character-aware way of saying
-
-    *(d++) = uv;
-
-=cut
-*/
-
-U8 *
-Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
-{
-    if (uv < 0x100)
-	uv = NATIVE_TO_ASCII(uv);
-    return Perl_uvuni_to_utf8(aTHX_ d, uv);
-}
 
 
 /*
 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 
 Tests if some arbitrary number of bytes begins in a valid UTF-8
-character.  Note that an ASCII character is a valid UTF-8 character.
+character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 The actual number of bytes in the UTF-8 character will be returned if
 it is valid, otherwise 0.
 
@@ -160,7 +149,7 @@ Perl_is_utf8_char(pTHX_ U8 *s)
     STRLEN slen, len;
     UV uv, ouv;
 
-    if (UTF8_IS_ASCII(u))
+    if (UTF8_IS_INVARIANT(u))
 	return 1;
 
     if (!UTF8_IS_START(u))
@@ -173,7 +162,8 @@ Perl_is_utf8_char(pTHX_ U8 *s)
 
     slen = len - 1;
     s++;
-    uv = u;
+    /* The initial value is dubious */
+    uv  = u;
     ouv = uv;
     while (slen--) {
 	if (!UTF8_IS_CONTINUATION(*s))
@@ -276,10 +266,10 @@ Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 	goto malformed;
     }
 
-    if (UTF8_IS_ASCII(uv)) {
+    if (UTF8_IS_INVARIANT(uv)) {
 	if (retlen)
 	    *retlen = 1;
-	return (UV) (*s);
+	return (UV) (NATIVE_TO_UTF(*s));
     }
 
     if (UTF8_IS_CONTINUATION(uv) &&
@@ -294,20 +284,29 @@ Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 	goto malformed;
     }
 
+#ifdef EBCDIC
+    uv = NATIVE_TO_UTF(uv);
+#else
     if ((uv == 0xfe || uv == 0xff) &&
 	!(flags & UTF8_ALLOW_FE_FF)) {
 	warning = UTF8_WARN_FE_FF;
 	goto malformed;
     }
-	
+#endif
+
     if      (!(uv & 0x20))	{ len =  2; uv &= 0x1f; }
     else if (!(uv & 0x10))	{ len =  3; uv &= 0x0f; }
     else if (!(uv & 0x08))	{ len =  4; uv &= 0x07; }
     else if (!(uv & 0x04))	{ len =  5; uv &= 0x03; }
+#ifdef EBCDIC
+    else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
+    else			{ len =  7; uv &= 0x01; }
+#else
     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
     else if (!(uv & 0x01))	{ len =  7; uv = 0; }
-    else 			{ len = 13; uv = 0; } /* whoa! */
-	
+    else			{ len = 13; uv = 0; } /* whoa! */
+#endif
+
     if (retlen)
 	*retlen = len;
 
@@ -440,27 +439,6 @@ malformed:
 }
 
 /*
-=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
-
-Returns the native character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Allows length and flags to be passed to low level routine.
-
-=cut
-*/
-
-UV
-Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
-{
-    UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
-    if (uv < 0x100)
-        return (UV) ASCII_TO_NATIVE(uv);
-    return uv;
-}
-
-/*
 =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
 
 Returns the native character value of the first character in the string C<s>
@@ -527,7 +505,7 @@ Perl_utf8_length(pTHX_ U8* s, U8* e)
 	U8 t = UTF8SKIP(s);
 
 	if (e - s < t)
-	    Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
+	    Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
 	s += t;
 	len++;
     }
@@ -634,9 +612,9 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
     for (send = s + *len; s < send; ) {
         U8 c = *s++;
 
-        if (c >= 0x80 &&
-            ((s >= send) ||
-	     ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
+        if (!UTF8_IS_INVARIANT(c) &&
+            (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
+	     || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
             *len = -1;
             return 0;
         }
@@ -668,9 +646,9 @@ is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 U8 *
 Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
 {
-    U8 *send;
     U8 *d;
     U8 *start = s;
+    U8 *send;
     I32 count = 0;
 
     if (!*is_utf8)
@@ -679,28 +657,27 @@ Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
     /* ensure valid UTF8 and chars < 256 before converting string */
     for (send = s + *len; s < send;) {
 	U8 c = *s++;
-        if (!UTF8_IS_ASCII(c)) {
-	    if (UTF8_IS_CONTINUATION(c) || s >= send ||
-		!UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c))
+	if (!UTF8_IS_INVARIANT(c)) {
+	    if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
+                (c = *s++) && UTF8_IS_CONTINUATION(c))
+		count++;
+	    else
 		return start;
-	    s++, count++;
-        }
+	}
     }
 
     *is_utf8 = 0;		
 
-    if (!count)
-	return start;
-
     Newz(801, d, (*len) - count + 1, U8);
     s = start; start = d;
     while (s < send) {
 	U8 c = *s++;
-
-	if (UTF8_IS_ASCII(c))
-	    *d++ = c;
-	else
-	    *d++ = UTF8_ACCUMULATE(c, *s++);
+	if (!UTF8_IS_INVARIANT(c)) {
+	    /* Then it is two-byte encoded */
+	    c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
+	    c = ASCII_TO_NATIVE(c);
+	}
+	*d++ = c;
     }
     *d = '\0';
     *len = d - start;
@@ -729,11 +706,10 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
     dst = d;
 
     while (s < send) {
-        if (UTF8_IS_ASCII(*s))
-            *d++ = *s++;
+        UV uv = NATIVE_TO_ASCII(*s++);
+        if (UNI_IS_INVARIANT(uv))
+            *d++ = UTF_TO_NATIVE(uv);
         else {
-            UV uv = *s++;
-
             *d++ = UTF8_EIGHT_BIT_HI(uv);
             *d++ = UTF8_EIGHT_BIT_LO(uv);
         }
@@ -818,7 +794,7 @@ bool
 Perl_is_uni_alnum(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_alnum(tmpbuf);
 }
 
@@ -826,7 +802,7 @@ bool
 Perl_is_uni_alnumc(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_alnumc(tmpbuf);
 }
 
@@ -834,7 +810,7 @@ bool
 Perl_is_uni_idfirst(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_idfirst(tmpbuf);
 }
 
@@ -842,7 +818,7 @@ bool
 Perl_is_uni_alpha(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_alpha(tmpbuf);
 }
 
@@ -850,7 +826,7 @@ bool
 Perl_is_uni_ascii(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_ascii(tmpbuf);
 }
 
@@ -858,7 +834,7 @@ bool
 Perl_is_uni_space(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_space(tmpbuf);
 }
 
@@ -866,7 +842,7 @@ bool
 Perl_is_uni_digit(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_digit(tmpbuf);
 }
 
@@ -874,7 +850,7 @@ bool
 Perl_is_uni_upper(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_upper(tmpbuf);
 }
 
@@ -882,7 +858,7 @@ bool
 Perl_is_uni_lower(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_lower(tmpbuf);
 }
 
@@ -890,7 +866,7 @@ bool
 Perl_is_uni_cntrl(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_cntrl(tmpbuf);
 }
 
@@ -898,7 +874,7 @@ bool
 Perl_is_uni_graph(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_graph(tmpbuf);
 }
 
@@ -906,7 +882,7 @@ bool
 Perl_is_uni_print(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_print(tmpbuf);
 }
 
@@ -914,7 +890,7 @@ bool
 Perl_is_uni_punct(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_punct(tmpbuf);
 }
 
@@ -922,7 +898,7 @@ bool
 Perl_is_uni_xdigit(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_xdigit(tmpbuf);
 }
 
@@ -930,7 +906,7 @@ U32
 Perl_to_uni_upper(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return to_utf8_upper(tmpbuf);
 }
 
@@ -938,7 +914,7 @@ U32
 Perl_to_uni_title(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return to_utf8_title(tmpbuf);
 }
 
@@ -946,7 +922,7 @@ U32
 Perl_to_uni_lower(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return to_utf8_lower(tmpbuf);
 }
 
@@ -1258,8 +1234,9 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
     SV* retval;
     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
     dSP;
+    HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
 
-    if (!gv_stashpv(pkg, 0)) {	/* demand load utf8 */
+    if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {	/* demand load utf8 */
 	ENTER;
 	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
 	LEAVE;
@@ -1303,14 +1280,34 @@ UV
 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 {
     HV* hv = (HV*)SvRV(sv);
+    /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
+       then the "swatch" is a vec() for al the chars which start
+       with 0xAA..0xYY
+       So the key in the hash is length of encoded char -1
+     */
     U32 klen = UTF8SKIP(ptr) - 1;
-    U32 off = ptr[klen] & 127;  /* NB: 64 bit always 0 when len > 1 */
+    U32 off = ptr[klen];
     STRLEN slen;
-    STRLEN needents = (klen ? 64 : 128);
+    STRLEN needents;
     U8 *tmps;
     U32 bit;
     SV *retval;
 
+    if (klen == 0)
+     {
+      /* If char in invariant then swatch is for all the invariant chars
+       * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
+       */
+      needents = UTF_CONTINUATION_MARK;
+      off      = NATIVE_TO_UTF(ptr[klen]);
+     }
+    else
+     {
+      /* If char is encoded then swatch is for the prefix */
+      needents = (1 << UTF_ACCUMULATION_SHIFT);
+      off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
+     }
+
     /*
      * This single-entry cache saves about 1/3 of the utf8 overhead in test
      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
@@ -1334,6 +1331,10 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 	/* If not cached, generate it via utf8::SWASHGET */
 	if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
 	    dSP;
+	    /* We use utf8n_to_uvuni() as we want an index into
+	       Unicode tables, not a native character number.
+	     */
+	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
 	    ENTER;
 	    SAVETMPS;
 	    save_re_context();
@@ -1341,10 +1342,8 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 	    PUSHMARK(SP);
 	    EXTEND(SP,3);
 	    PUSHs((SV*)sv);
-	    /* We call utf8_to_uni as we want and index into Unicode tables,
-	       not a native character number.
-	     */
-	    PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
+	    /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
+	    PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
 	    PUSHs(sv_2mortal(newSViv(needents)));
 	    PUTBACK;
 	    if (call_method("SWASHGET", G_SCALAR))
@@ -1359,7 +1358,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 
 	    svp = hv_store(hv, (char*)ptr, klen, retval, 0);
 
-	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
+	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
 		Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
 	}
 
@@ -1388,3 +1387,56 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
     Perl_croak(aTHX_ "panic: swash_fetch");
     return 0;
 }
+
+
+/*
+=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
+
+Adds the UTF8 representation of the Native codepoint C<uv> to the end
+of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
+
+    d = uvchr_to_utf8(d, uv);
+
+is the recommended wide native character-aware way of saying
+
+    *(d++) = uv;
+
+=cut
+*/
+
+/* On ASCII machines this is normally a macro but we want a
+   real function in case XS code wants it
+*/
+#undef Perl_uvchr_to_utf8
+U8 *
+Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
+}
+
+
+/*
+=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
+
+Returns the native character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Allows length and flags to be passed to low level routine.
+
+=cut
+*/
+/* On ASCII machines this is normally a macro but we want a
+   real function in case XS code wants it
+*/
+#undef Perl_utf8n_to_uvchr
+UV
+Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
+{
+    UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
+    return UNI_TO_NATIVE(uv);
+}
+
+