Fix for "a\x{100}" =~ /A/i.
[p5sagit/p5-mst-13.2.git] / utf8.c
diff --git a/utf8.c b/utf8.c
index ac90a38..d7b0784 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1180,6 +1180,30 @@ Perl_is_utf8_mark(pTHX_ U8 *p)
     return swash_fetch(PL_utf8_mark, p, TRUE);
 }
 
+/*
+=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
+
+The "p" contains the pointer to the UTF-8 string encoding
+the character that is being converted.
+
+The "ustrp" is a pointer to the character buffer to put the
+conversion result to.  The "lenp" is a pointer to the length
+of the result.
+
+The "swash" is a pointer to the swash to use.
+
+The "normal" is a string like "ToLower" which means the swash
+$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
+and loaded by SWASHGET, using lib/utf8_heavy.pl.
+
+The "special" is a string like "utf8::ToSpecLower", which means
+the hash %utf8::ToSpecLower, which is stored in the same file,
+lib/unicore/To/Lower.pl, and also loaded by SWASHGET.  The access
+to the hash is by Perl_to_utf8_case().
+
+=cut
+ */
+
 UV
 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
 {
@@ -1196,6 +1220,8 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal
         HE *he;
 
         uv = utf8_to_uvchr(p, 0);
+        if (uv <= 0xff)
+            uv = NATIVE_TO_UTF(uv);
 
         if ((hv    = get_hv(special, FALSE)) &&
             (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
@@ -1206,6 +1232,7 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal
              if (*lenp > 1 || UNI_IS_INVARIANT(c))
                   Copy(s, ustrp, *lenp, U8);
              else {
+                  c = UTF_TO_NATIVE(c);
                   /* something in the 0x80..0xFF range */
                   ustrp[0] = UTF8_EIGHT_BIT_HI(c);
                   ustrp[1] = UTF8_EIGHT_BIT_LO(c);
@@ -1223,21 +1250,28 @@ UV
 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
-                            &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
+                             &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
 }
 
 UV
 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
-                            &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
+                             &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
 }
 
 UV
 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
-                            &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
+                             &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
+}
+
+UV
+Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
+{
+    return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
+                             &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
 }
 
 /* a "swash" is a swatch hash */
@@ -1313,7 +1347,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
     U32 off;
     STRLEN slen;
     STRLEN needents;
-    U8 *tmps;
+    U8 *tmps = NULL;
     U32 bit;
     SV *retval;
     U8 tmputf8[2];
@@ -1484,4 +1518,76 @@ Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
     return UNI_TO_NATIVE(uv);
 }
 
+char *
+Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
+{
+    int truncated = 0;
+    char *s, *e;
+
+    sv_setpvn(dsv, "", 0);
+    for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
+        UV u;
+        if (pvlim && SvCUR(dsv) >= pvlim) {
+             truncated++;
+             break;
+        }
+        u = utf8_to_uvchr((U8*)s, 0);
+        Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
+    }
+    if (truncated)
+        sv_catpvn(dsv, "...", 3);
+    
+    return SvPVX(dsv);
+}
+
+char *
+Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
+{
+     return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
+                               pvlim, flags);
+}
+
+I32
+Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, const char *s2, bool u2, register I32 len)
+{
+     register U8 *a = (U8*)s1;
+     register U8 *b = (U8*)s2;
+     STRLEN la, lb;
+     UV ca, cb;
+     STRLEN ulen1, ulen2;
+     U8 tmpbuf1[UTF8_MAXLEN*3+1];
+     U8 tmpbuf2[UTF8_MAXLEN*3+1];
+
+     while (len) {
+         if (u1)
+              ca = utf8_to_uvchr((U8*)a, &la);
+         else {
+              ca = *a;
+              la = 1;
+         }
+         if (u2)
+              cb = utf8_to_uvchr((U8*)b, &lb);
+         else {
+              cb = *b;
+              lb = 1;
+         }
+         if (ca != cb) {
+              if (u1)
+                   to_uni_lower(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
+              else
+                   ulen1 = 1;
+              if (u2)
+                   to_uni_lower(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
+              else
+                   ulen2 = 1;
+              if (ulen1 != ulen2
+                  || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
+                  || memNE(tmpbuf1, tmpbuf2, ulen1))
+                   return 1;
+         }
+         a += la;
+         b += lb;
+    }
+    return 0;
+}