utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
  28
  29 /*
  30 =head1 Unicode Support
  31
  32 This file contains various utility functions for manipulating UTF8-encoded
  33 strings. For the uninitiated, this is a method of representing arbitrary
  34 Unicode characters as a variable number of bytes, in such a way that
  35 characters in the ASCII range are unmodified, and a zero byte never appears
  36 within non-zero characters.
  37
  38 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  39
  40 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  41 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  42 bytes available. The return value is the pointer to the byte after the
  43 end of the new character. In other words,
  44
  45     d = uvuni_to_utf8_flags(d, uv, flags);
  46
  47 or, in most cases,
  48
  49     d = uvuni_to_utf8(d, uv);
  50
  51 (which is equivalent to)
  52
  53     d = uvuni_to_utf8_flags(d, uv, 0);
  54
  55 is the recommended Unicode-aware way of saying
  56
  57     *(d++) = uv;
  58
  59 =cut
  60 */
  61
  62 U8 *
  63 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  64 {
  65     if (ckWARN(WARN_UTF8)) {
  66          if (UNICODE_IS_SURROGATE(uv) &&
  67              !(flags & UNICODE_ALLOW_SURROGATE))
  68               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  69          else if (
  70                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  71                     !(flags & UNICODE_ALLOW_FDD0))
  72                    ||
  73                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  74                     !(flags & UNICODE_ALLOW_FFFF))) &&
  75                   /* UNICODE_ALLOW_SUPER includes
  76                    * FFFEs and FFFFs beyond 0x10FFFF. */
  77                   ((uv <= PERL_UNICODE_MAX) ||
  78                    !(flags & UNICODE_ALLOW_SUPER))
  79                   )
  80               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  81                          "Unicode character 0x%04"UVxf" is illegal", uv);
  82     }
  83     if (UNI_IS_INVARIANT(uv)) {
  84         *d++ = (U8)UTF_TO_NATIVE(uv);
  85         return d;
  86     }
  87 #if defined(EBCDIC)
  88     else {
  89         STRLEN len  = UNISKIP(uv);
  90         U8 *p = d+len-1;
  91         while (p > d) {
  92             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  93             uv >>= UTF_ACCUMULATION_SHIFT;
  94         }
  95         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  96         return d+len;
  97     }
  98 #else /* Non loop style */
  99     if (uv < 0x800) {
 100         *d++ = (U8)(( uv >>  6)         | 0xc0);
 101         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 102         return d;
 103     }
 104     if (uv < 0x10000) {
 105         *d++ = (U8)(( uv >> 12)         | 0xe0);
 106         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 107         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 108         return d;
 109     }
 110     if (uv < 0x200000) {
 111         *d++ = (U8)(( uv >> 18)         | 0xf0);
 112         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 113         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 114         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 115         return d;
 116     }
 117     if (uv < 0x4000000) {
 118         *d++ = (U8)(( uv >> 24)         | 0xf8);
 119         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 120         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 121         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 122         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 123         return d;
 124     }
 125     if (uv < 0x80000000) {
 126         *d++ = (U8)(( uv >> 30)         | 0xfc);
 127         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 128         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 129         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 131         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 132         return d;
 133     }
 134 #ifdef HAS_QUAD
 135     if (uv < UTF8_QUAD_MAX)
 136 #endif
 137     {
 138         *d++ =                            0xfe; /* Can't match U+FEFF! */
 139         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 140         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 141         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 144         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 145         return d;
 146     }
 147 #ifdef HAS_QUAD
 148     {
 149         *d++ =                            0xff;         /* Can't match U+FFFE! */
 150         *d++ =                            0x80;         /* 6 Reserved bits */
 151         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 152         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 153         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 154         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 161         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 162         return d;
 163     }
 164 #endif
 165 #endif /* Loop style */
 166 }
 167
 168 U8 *
 169 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 170 {
 171     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 172 }
 173
 174
 175 /*
 176 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 177
 178 Tests if some arbitrary number of bytes begins in a valid UTF-8
 179 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 180 UTF-8 character.  The actual number of bytes in the UTF-8 character
 181 will be returned if it is valid, otherwise 0.
 182
 183 =cut */
 184 STRLEN
 185 Perl_is_utf8_char(pTHX_ U8 *s)
 186 {
 187     U8 u = *s;
 188     STRLEN slen, len;
 189     UV uv, ouv;
 190
 191     if (UTF8_IS_INVARIANT(u))
 192         return 1;
 193
 194     if (!UTF8_IS_START(u))
 195         return 0;
 196
 197     len = UTF8SKIP(s);
 198
 199     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 200         return 0;
 201
 202     slen = len - 1;
 203     s++;
 204     u &= UTF_START_MASK(len);
 205     uv  = u;
 206     ouv = uv;
 207     while (slen--) {
 208         if (!UTF8_IS_CONTINUATION(*s))
 209             return 0;
 210         uv = UTF8_ACCUMULATE(uv, *s);
 211         if (uv < ouv)
 212             return 0;
 213         ouv = uv;
 214         s++;
 215     }
 216
 217     if ((STRLEN)UNISKIP(uv) < len)
 218         return 0;
 219
 220     return len;
 221 }
 222
 223 /*
 224 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 225
 226 Returns true if first C<len> bytes of the given string form a valid
 227 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 228 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 229 because a valid ASCII string is a valid UTF-8 string.
 230
 231 =cut
 232 */
 233
 234 bool
 235 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 236 {
 237     U8* x = s;
 238     U8* send;
 239     STRLEN c;
 240
 241     if (!len && s)
 242         len = strlen((char *)s);
 243     send = s + len;
 244
 245     while (x < send) {
 246          /* Inline the easy bits of is_utf8_char() here for speed... */
 247          if (UTF8_IS_INVARIANT(*x))
 248               c = 1;
 249          else if (!UTF8_IS_START(*x))
 250               return FALSE;
 251          else {
 252               /* ... and call is_utf8_char() only if really needed. */
 253               c = is_utf8_char(x);
 254               if (!c)
 255                    return FALSE;
 256          }
 257         x += c;
 258     }
 259     if (x != send)
 260         return FALSE;
 261
 262     return TRUE;
 263 }
 264
 265 /*
 266 =for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
 267
 268 Like is_ut8_string but store the location of the failure in
 269 the last argument.
 270
 271 =cut
 272 */
 273
 274 bool
 275 Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p)
 276 {
 277     U8* x = s;
 278     U8* send;
 279     STRLEN c;
 280
 281     if (!len && s)
 282         len = strlen((char *)s);
 283     send = s + len;
 284
 285     while (x < send) {
 286          /* Inline the easy bits of is_utf8_char() here for speed... */
 287          if (UTF8_IS_INVARIANT(*x))
 288               c = 1;
 289          else if (!UTF8_IS_START(*x)) {
 290               if (p)
 291                   *p = x;
 292               return FALSE;
 293          }
 294          else {
 295               /* ... and call is_utf8_char() only if really needed. */
 296               c = is_utf8_char(x);
 297               if (!c) {
 298                    if (p)
 299                       *p = x;
 300                    return FALSE;
 301               }
 302          }
 303         x += c;
 304     }
 305     if (x != send) {
 306        if (p)
 307            *p = x;
 308         return FALSE;
 309     }
 310
 311     return TRUE;
 312 }
 313
 314 /*
 315 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 316
 317 Bottom level UTF-8 decode routine.
 318 Returns the unicode code point value of the first character in the string C<s>
 319 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 320 C<retlen> will be set to the length, in bytes, of that character.
 321
 322 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 323 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 324 it is assumed that the caller will raise a warning, and this function
 325 will silently just set C<retlen> to C<-1> and return zero.  If the
 326 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 327 malformations will be given, C<retlen> will be set to the expected
 328 length of the UTF-8 character in bytes, and zero will be returned.
 329
 330 The C<flags> can also contain various flags to allow deviations from
 331 the strict UTF-8 encoding (see F<utf8.h>).
 332
 333 Most code should use utf8_to_uvchr() rather than call this directly.
 334
 335 =cut
 336 */
 337
 338 UV
 339 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 340 {
 341     U8 *s0 = s;
 342     UV uv = *s, ouv = 0;
 343     STRLEN len = 1;
 344     bool dowarn = ckWARN_d(WARN_UTF8);
 345     UV startbyte = *s;
 346     STRLEN expectlen = 0;
 347     U32 warning = 0;
 348
 349 /* This list is a superset of the UTF8_ALLOW_XXX. */
 350
 351 #define UTF8_WARN_EMPTY                          1
 352 #define UTF8_WARN_CONTINUATION                   2
 353 #define UTF8_WARN_NON_CONTINUATION               3
 354 #define UTF8_WARN_FE_FF                          4
 355 #define UTF8_WARN_SHORT                          5
 356 #define UTF8_WARN_OVERFLOW                       6
 357 #define UTF8_WARN_SURROGATE                      7
 358 #define UTF8_WARN_LONG                           8
 359 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 360
 361     if (curlen == 0 &&
 362         !(flags & UTF8_ALLOW_EMPTY)) {
 363         warning = UTF8_WARN_EMPTY;
 364         goto malformed;
 365     }
 366
 367     if (UTF8_IS_INVARIANT(uv)) {
 368         if (retlen)
 369             *retlen = 1;
 370         return (UV) (NATIVE_TO_UTF(*s));
 371     }
 372
 373     if (UTF8_IS_CONTINUATION(uv) &&
 374         !(flags & UTF8_ALLOW_CONTINUATION)) {
 375         warning = UTF8_WARN_CONTINUATION;
 376         goto malformed;
 377     }
 378
 379     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 380         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 381         warning = UTF8_WARN_NON_CONTINUATION;
 382         goto malformed;
 383     }
 384
 385 #ifdef EBCDIC
 386     uv = NATIVE_TO_UTF(uv);
 387 #else
 388     if ((uv == 0xfe || uv == 0xff) &&
 389         !(flags & UTF8_ALLOW_FE_FF)) {
 390         warning = UTF8_WARN_FE_FF;
 391         goto malformed;
 392     }
 393 #endif
 394
 395     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 396     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 397     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 398     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 399 #ifdef EBCDIC
 400     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 401     else                        { len =  7; uv &= 0x01; }
 402 #else
 403     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 404     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 405     else                        { len = 13; uv = 0; } /* whoa! */
 406 #endif
 407
 408     if (retlen)
 409         *retlen = len;
 410
 411     expectlen = len;
 412
 413     if ((curlen < expectlen) &&
 414         !(flags & UTF8_ALLOW_SHORT)) {
 415         warning = UTF8_WARN_SHORT;
 416         goto malformed;
 417     }
 418
 419     len--;
 420     s++;
 421     ouv = uv;
 422
 423     while (len--) {
 424         if (!UTF8_IS_CONTINUATION(*s) &&
 425             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 426             s--;
 427             warning = UTF8_WARN_NON_CONTINUATION;
 428             goto malformed;
 429         }
 430         else
 431             uv = UTF8_ACCUMULATE(uv, *s);
 432         if (!(uv > ouv)) {
 433             /* These cannot be allowed. */
 434             if (uv == ouv) {
 435                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 436                     warning = UTF8_WARN_LONG;
 437                     goto malformed;
 438                 }
 439             }
 440             else { /* uv < ouv */
 441                 /* This cannot be allowed. */
 442                 warning = UTF8_WARN_OVERFLOW;
 443                 goto malformed;
 444             }
 445         }
 446         s++;
 447         ouv = uv;
 448     }
 449
 450     if (UNICODE_IS_SURROGATE(uv) &&
 451         !(flags & UTF8_ALLOW_SURROGATE)) {
 452         warning = UTF8_WARN_SURROGATE;
 453         goto malformed;
 454     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 455                !(flags & UTF8_ALLOW_LONG)) {
 456         warning = UTF8_WARN_LONG;
 457         goto malformed;
 458     } else if (UNICODE_IS_ILLEGAL(uv) &&
 459                !(flags & UTF8_ALLOW_FFFF)) {
 460         warning = UTF8_WARN_FFFF;
 461         goto malformed;
 462     }
 463
 464     return uv;
 465
 466 malformed:
 467
 468     if (flags & UTF8_CHECK_ONLY) {
 469         if (retlen)
 470             *retlen = -1;
 471         return 0;
 472     }
 473
 474     if (dowarn) {
 475         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 476
 477         switch (warning) {
 478         case 0: /* Intentionally empty. */ break;
 479         case UTF8_WARN_EMPTY:
 480             Perl_sv_catpv(aTHX_ sv, "(empty string)");
 481             break;
 482         case UTF8_WARN_CONTINUATION:
 483             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 484             break;
 485         case UTF8_WARN_NON_CONTINUATION:
 486             if (s == s0)
 487                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 488                            (UV)s[1], startbyte);
 489             else
 490                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 491                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 492
 493             break;
 494         case UTF8_WARN_FE_FF:
 495             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 496             break;
 497         case UTF8_WARN_SHORT:
 498             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 499                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 500             expectlen = curlen;         /* distance for caller to skip */
 501             break;
 502         case UTF8_WARN_OVERFLOW:
 503             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 504                            ouv, *s, startbyte);
 505             break;
 506         case UTF8_WARN_SURROGATE:
 507             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 508             break;
 509         case UTF8_WARN_LONG:
 510             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 511                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 512             break;
 513         case UTF8_WARN_FFFF:
 514             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 515             break;
 516         default:
 517             Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
 518             break;
 519         }
 520
 521         if (warning) {
 522             char *s = SvPVX(sv);
 523
 524             if (PL_op)
 525                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 526                             "%s in %s", s,  OP_DESC(PL_op));
 527             else
 528                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 529         }
 530     }
 531
 532     if (retlen)
 533         *retlen = expectlen ? expectlen : len;
 534
 535     return 0;
 536 }
 537
 538 /*
 539 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 540
 541 Returns the native character value of the first character in the string C<s>
 542 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 543 length, in bytes, of that character.
 544
 545 If C<s> does not point to a well-formed UTF-8 character, zero is
 546 returned and retlen is set, if possible, to -1.
 547
 548 =cut
 549 */
 550
 551 UV
 552 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 553 {
 554     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
 555                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 556 }
 557
 558 /*
 559 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 560
 561 Returns the Unicode code point of the first character in the string C<s>
 562 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 563 length, in bytes, of that character.
 564
 565 This function should only be used when returned UV is considered
 566 an index into the Unicode semantic tables (e.g. swashes).
 567
 568 If C<s> does not point to a well-formed UTF-8 character, zero is
 569 returned and retlen is set, if possible, to -1.
 570
 571 =cut
 572 */
 573
 574 UV
 575 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 576 {
 577     /* Call the low level routine asking for checks */
 578     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 579                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 580 }
 581
 582 /*
 583 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 584
 585 Return the length of the UTF-8 char encoded string C<s> in characters.
 586 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 587 up past C<e>, croaks.
 588
 589 =cut
 590 */
 591
 592 STRLEN
 593 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 594 {
 595     STRLEN len = 0;
 596
 597     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 598      * the bitops (especially ~) can create illegal UTF-8.
 599      * In other words: in Perl UTF-8 is not just for Unicode. */
 600
 601     if (e < s) {
 602         if (ckWARN_d(WARN_UTF8)) {
 603             if (PL_op)
 604                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 605                             "%s in %s", unees, OP_DESC(PL_op));
 606             else
 607                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 608         }
 609         return 0;
 610     }
 611     while (s < e) {
 612         U8 t = UTF8SKIP(s);
 613
 614         if (e - s < t) {
 615             if (ckWARN_d(WARN_UTF8)) {
 616                 if (PL_op)
 617                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 618                                 unees, OP_DESC(PL_op));
 619                 else
 620                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 621             }
 622             return len;
 623         }
 624         s += t;
 625         len++;
 626     }
 627
 628     return len;
 629 }
 630
 631 /*
 632 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 633
 634 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 635 and C<b>.
 636
 637 WARNING: use only if you *know* that the pointers point inside the
 638 same UTF-8 buffer.
 639
 640 =cut
 641 */
 642
 643 IV
 644 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 645 {
 646     IV off = 0;
 647
 648     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 649      * the bitops (especially ~) can create illegal UTF-8.
 650      * In other words: in Perl UTF-8 is not just for Unicode. */
 651
 652     if (a < b) {
 653         while (a < b) {
 654             U8 c = UTF8SKIP(a);
 655
 656             if (b - a < c) {
 657                 if (ckWARN_d(WARN_UTF8)) {
 658                     if (PL_op)
 659                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 660                                     "%s in %s", unees, OP_DESC(PL_op));
 661                     else
 662                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 663                 }
 664                 return off;
 665             }
 666             a += c;
 667             off--;
 668         }
 669     }
 670     else {
 671         while (b < a) {
 672             U8 c = UTF8SKIP(b);
 673
 674             if (a - b < c) {
 675                 if (ckWARN_d(WARN_UTF8)) {
 676                     if (PL_op)
 677                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 678                                     "%s in %s", unees, OP_DESC(PL_op));
 679                     else
 680                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 681                 }
 682                 return off;
 683             }
 684             b += c;
 685             off++;
 686         }
 687     }
 688
 689     return off;
 690 }
 691
 692 /*
 693 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 694
 695 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 696 forward or backward.
 697
 698 WARNING: do not use the following unless you *know* C<off> is within
 699 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 700 on the first byte of character or just after the last byte of a character.
 701
 702 =cut
 703 */
 704
 705 U8 *
 706 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 707 {
 708     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 709      * the bitops (especially ~) can create illegal UTF-8.
 710      * In other words: in Perl UTF-8 is not just for Unicode. */
 711
 712     if (off >= 0) {
 713         while (off--)
 714             s += UTF8SKIP(s);
 715     }
 716     else {
 717         while (off++) {
 718             s--;
 719             while (UTF8_IS_CONTINUATION(*s))
 720                 s--;
 721         }
 722     }
 723     return s;
 724 }
 725
 726 /*
 727 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 728
 729 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 730 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 731 updates len to contain the new length.
 732 Returns zero on failure, setting C<len> to -1.
 733
 734 =cut
 735 */
 736
 737 U8 *
 738 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 739 {
 740     U8 *send;
 741     U8 *d;
 742     U8 *save = s;
 743
 744     /* ensure valid UTF-8 and chars < 256 before updating string */
 745     for (send = s + *len; s < send; ) {
 746         U8 c = *s++;
 747
 748         if (!UTF8_IS_INVARIANT(c) &&
 749             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 750              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 751             *len = -1;
 752             return 0;
 753         }
 754     }
 755
 756     d = s = save;
 757     while (s < send) {
 758         STRLEN ulen;
 759         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 760         s += ulen;
 761     }
 762     *d = '\0';
 763     *len = d - save;
 764     return save;
 765 }
 766
 767 /*
 768 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 769
 770 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 771 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 772 the newly-created string, and updates C<len> to contain the new
 773 length.  Returns the original string if no conversion occurs, C<len>
 774 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 775 0 if C<s> is converted or contains all 7bit characters.
 776
 777 =cut
 778 */
 779
 780 U8 *
 781 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 782 {
 783     U8 *d;
 784     U8 *start = s;
 785     U8 *send;
 786     I32 count = 0;
 787
 788     if (!*is_utf8)
 789         return start;
 790
 791     /* ensure valid UTF-8 and chars < 256 before converting string */
 792     for (send = s + *len; s < send;) {
 793         U8 c = *s++;
 794         if (!UTF8_IS_INVARIANT(c)) {
 795             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 796                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 797                 count++;
 798             else
 799                 return start;
 800         }
 801     }
 802
 803     *is_utf8 = 0;
 804
 805     Newz(801, d, (*len) - count + 1, U8);
 806     s = start; start = d;
 807     while (s < send) {
 808         U8 c = *s++;
 809         if (!UTF8_IS_INVARIANT(c)) {
 810             /* Then it is two-byte encoded */
 811             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 812             c = ASCII_TO_NATIVE(c);
 813         }
 814         *d++ = c;
 815     }
 816     *d = '\0';
 817     *len = d - start;
 818     return start;
 819 }
 820
 821 /*
 822 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 823
 824 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 825 Returns a pointer to the newly-created string, and sets C<len> to
 826 reflect the new length.
 827
 828 If you want to convert to UTF-8 from other encodings than ASCII,
 829 see sv_recode_to_utf8().
 830
 831 =cut
 832 */
 833
 834 U8*
 835 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 836 {
 837     U8 *send;
 838     U8 *d;
 839     U8 *dst;
 840     send = s + (*len);
 841
 842     Newz(801, d, (*len) * 2 + 1, U8);
 843     dst = d;
 844
 845     while (s < send) {
 846         UV uv = NATIVE_TO_ASCII(*s++);
 847         if (UNI_IS_INVARIANT(uv))
 848             *d++ = (U8)UTF_TO_NATIVE(uv);
 849         else {
 850             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 851             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 852         }
 853     }
 854     *d = '\0';
 855     *len = d-dst;
 856     return dst;
 857 }
 858
 859 /*
 860  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 861  *
 862  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 863  * We optimize for native, for obvious reasons. */
 864
 865 U8*
 866 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 867 {
 868     U8* pend;
 869     U8* dstart = d;
 870
 871     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 872          d[0] = 0;
 873          *newlen = 1;
 874          return d;
 875     }
 876
 877     if (bytelen & 1)
 878         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 879
 880     pend = p + bytelen;
 881
 882     while (p < pend) {
 883         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 884         p += 2;
 885         if (uv < 0x80) {
 886             *d++ = (U8)uv;
 887             continue;
 888         }
 889         if (uv < 0x800) {
 890             *d++ = (U8)(( uv >>  6)         | 0xc0);
 891             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 892             continue;
 893         }
 894         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 895             UV low = (p[0] << 8) + p[1];
 896             p += 2;
 897             if (low < 0xdc00 || low >= 0xdfff)
 898                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 899             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 900         }
 901         if (uv < 0x10000) {
 902             *d++ = (U8)(( uv >> 12)         | 0xe0);
 903             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 904             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 905             continue;
 906         }
 907         else {
 908             *d++ = (U8)(( uv >> 18)         | 0xf0);
 909             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 910             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 911             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 912             continue;
 913         }
 914     }
 915     *newlen = d - dstart;
 916     return d;
 917 }
 918
 919 /* Note: this one is slightly destructive of the source. */
 920
 921 U8*
 922 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 923 {
 924     U8* s = (U8*)p;
 925     U8* send = s + bytelen;
 926     while (s < send) {
 927         U8 tmp = s[0];
 928         s[0] = s[1];
 929         s[1] = tmp;
 930         s += 2;
 931     }
 932     return utf16_to_utf8(p, d, bytelen, newlen);
 933 }
 934
 935 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 936
 937 bool
 938 Perl_is_uni_alnum(pTHX_ UV c)
 939 {
 940     U8 tmpbuf[UTF8_MAXBYTES+1];
 941     uvchr_to_utf8(tmpbuf, c);
 942     return is_utf8_alnum(tmpbuf);
 943 }
 944
 945 bool
 946 Perl_is_uni_alnumc(pTHX_ UV c)
 947 {
 948     U8 tmpbuf[UTF8_MAXBYTES+1];
 949     uvchr_to_utf8(tmpbuf, c);
 950     return is_utf8_alnumc(tmpbuf);
 951 }
 952
 953 bool
 954 Perl_is_uni_idfirst(pTHX_ UV c)
 955 {
 956     U8 tmpbuf[UTF8_MAXBYTES+1];
 957     uvchr_to_utf8(tmpbuf, c);
 958     return is_utf8_idfirst(tmpbuf);
 959 }
 960
 961 bool
 962 Perl_is_uni_alpha(pTHX_ UV c)
 963 {
 964     U8 tmpbuf[UTF8_MAXBYTES+1];
 965     uvchr_to_utf8(tmpbuf, c);
 966     return is_utf8_alpha(tmpbuf);
 967 }
 968
 969 bool
 970 Perl_is_uni_ascii(pTHX_ UV c)
 971 {
 972     U8 tmpbuf[UTF8_MAXBYTES+1];
 973     uvchr_to_utf8(tmpbuf, c);
 974     return is_utf8_ascii(tmpbuf);
 975 }
 976
 977 bool
 978 Perl_is_uni_space(pTHX_ UV c)
 979 {
 980     U8 tmpbuf[UTF8_MAXBYTES+1];
 981     uvchr_to_utf8(tmpbuf, c);
 982     return is_utf8_space(tmpbuf);
 983 }
 984
 985 bool
 986 Perl_is_uni_digit(pTHX_ UV c)
 987 {
 988     U8 tmpbuf[UTF8_MAXBYTES+1];
 989     uvchr_to_utf8(tmpbuf, c);
 990     return is_utf8_digit(tmpbuf);
 991 }
 992
 993 bool
 994 Perl_is_uni_upper(pTHX_ UV c)
 995 {
 996     U8 tmpbuf[UTF8_MAXBYTES+1];
 997     uvchr_to_utf8(tmpbuf, c);
 998     return is_utf8_upper(tmpbuf);
 999 }
1000
1001 bool
1002 Perl_is_uni_lower(pTHX_ UV c)
1003 {
1004     U8 tmpbuf[UTF8_MAXBYTES+1];
1005     uvchr_to_utf8(tmpbuf, c);
1006     return is_utf8_lower(tmpbuf);
1007 }
1008
1009 bool
1010 Perl_is_uni_cntrl(pTHX_ UV c)
1011 {
1012     U8 tmpbuf[UTF8_MAXBYTES+1];
1013     uvchr_to_utf8(tmpbuf, c);
1014     return is_utf8_cntrl(tmpbuf);
1015 }
1016
1017 bool
1018 Perl_is_uni_graph(pTHX_ UV c)
1019 {
1020     U8 tmpbuf[UTF8_MAXBYTES+1];
1021     uvchr_to_utf8(tmpbuf, c);
1022     return is_utf8_graph(tmpbuf);
1023 }
1024
1025 bool
1026 Perl_is_uni_print(pTHX_ UV c)
1027 {
1028     U8 tmpbuf[UTF8_MAXBYTES+1];
1029     uvchr_to_utf8(tmpbuf, c);
1030     return is_utf8_print(tmpbuf);
1031 }
1032
1033 bool
1034 Perl_is_uni_punct(pTHX_ UV c)
1035 {
1036     U8 tmpbuf[UTF8_MAXBYTES+1];
1037     uvchr_to_utf8(tmpbuf, c);
1038     return is_utf8_punct(tmpbuf);
1039 }
1040
1041 bool
1042 Perl_is_uni_xdigit(pTHX_ UV c)
1043 {
1044     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1045     uvchr_to_utf8(tmpbuf, c);
1046     return is_utf8_xdigit(tmpbuf);
1047 }
1048
1049 UV
1050 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1051 {
1052     uvchr_to_utf8(p, c);
1053     return to_utf8_upper(p, p, lenp);
1054 }
1055
1056 UV
1057 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1058 {
1059     uvchr_to_utf8(p, c);
1060     return to_utf8_title(p, p, lenp);
1061 }
1062
1063 UV
1064 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1065 {
1066     uvchr_to_utf8(p, c);
1067     return to_utf8_lower(p, p, lenp);
1068 }
1069
1070 UV
1071 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1072 {
1073     uvchr_to_utf8(p, c);
1074     return to_utf8_fold(p, p, lenp);
1075 }
1076
1077 /* for now these all assume no locale info available for Unicode > 255 */
1078
1079 bool
1080 Perl_is_uni_alnum_lc(pTHX_ UV c)
1081 {
1082     return is_uni_alnum(c);     /* XXX no locale support yet */
1083 }
1084
1085 bool
1086 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1087 {
1088     return is_uni_alnumc(c);    /* XXX no locale support yet */
1089 }
1090
1091 bool
1092 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1093 {
1094     return is_uni_idfirst(c);   /* XXX no locale support yet */
1095 }
1096
1097 bool
1098 Perl_is_uni_alpha_lc(pTHX_ UV c)
1099 {
1100     return is_uni_alpha(c);     /* XXX no locale support yet */
1101 }
1102
1103 bool
1104 Perl_is_uni_ascii_lc(pTHX_ UV c)
1105 {
1106     return is_uni_ascii(c);     /* XXX no locale support yet */
1107 }
1108
1109 bool
1110 Perl_is_uni_space_lc(pTHX_ UV c)
1111 {
1112     return is_uni_space(c);     /* XXX no locale support yet */
1113 }
1114
1115 bool
1116 Perl_is_uni_digit_lc(pTHX_ UV c)
1117 {
1118     return is_uni_digit(c);     /* XXX no locale support yet */
1119 }
1120
1121 bool
1122 Perl_is_uni_upper_lc(pTHX_ UV c)
1123 {
1124     return is_uni_upper(c);     /* XXX no locale support yet */
1125 }
1126
1127 bool
1128 Perl_is_uni_lower_lc(pTHX_ UV c)
1129 {
1130     return is_uni_lower(c);     /* XXX no locale support yet */
1131 }
1132
1133 bool
1134 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1135 {
1136     return is_uni_cntrl(c);     /* XXX no locale support yet */
1137 }
1138
1139 bool
1140 Perl_is_uni_graph_lc(pTHX_ UV c)
1141 {
1142     return is_uni_graph(c);     /* XXX no locale support yet */
1143 }
1144
1145 bool
1146 Perl_is_uni_print_lc(pTHX_ UV c)
1147 {
1148     return is_uni_print(c);     /* XXX no locale support yet */
1149 }
1150
1151 bool
1152 Perl_is_uni_punct_lc(pTHX_ UV c)
1153 {
1154     return is_uni_punct(c);     /* XXX no locale support yet */
1155 }
1156
1157 bool
1158 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1159 {
1160     return is_uni_xdigit(c);    /* XXX no locale support yet */
1161 }
1162
1163 U32
1164 Perl_to_uni_upper_lc(pTHX_ U32 c)
1165 {
1166     /* XXX returns only the first character -- do not use XXX */
1167     /* XXX no locale support yet */
1168     STRLEN len;
1169     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1170     return (U32)to_uni_upper(c, tmpbuf, &len);
1171 }
1172
1173 U32
1174 Perl_to_uni_title_lc(pTHX_ U32 c)
1175 {
1176     /* XXX returns only the first character XXX -- do not use XXX */
1177     /* XXX no locale support yet */
1178     STRLEN len;
1179     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1180     return (U32)to_uni_title(c, tmpbuf, &len);
1181 }
1182
1183 U32
1184 Perl_to_uni_lower_lc(pTHX_ U32 c)
1185 {
1186     /* XXX returns only the first character -- do not use XXX */
1187     /* XXX no locale support yet */
1188     STRLEN len;
1189     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1190     return (U32)to_uni_lower(c, tmpbuf, &len);
1191 }
1192
1193 bool
1194 Perl_is_utf8_alnum(pTHX_ U8 *p)
1195 {
1196     if (!is_utf8_char(p))
1197         return FALSE;
1198     if (!PL_utf8_alnum)
1199         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1200          * descendant of isalnum(3), in other words, it doesn't
1201          * contain the '_'. --jhi */
1202         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1203     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1204 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1205 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1206     if (!PL_utf8_alnum)
1207         PL_utf8_alnum = swash_init("utf8", "",
1208             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1209     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1210 #endif
1211 }
1212
1213 bool
1214 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1215 {
1216     if (!is_utf8_char(p))
1217         return FALSE;
1218     if (!PL_utf8_alnum)
1219         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1220     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1221 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1222 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1223     if (!PL_utf8_alnum)
1224         PL_utf8_alnum = swash_init("utf8", "",
1225             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1226     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1227 #endif
1228 }
1229
1230 bool
1231 Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1232 {
1233     if (*p == '_')
1234         return TRUE;
1235     if (!is_utf8_char(p))
1236         return FALSE;
1237     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1238         PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1239     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1240 }
1241
1242 bool
1243 Perl_is_utf8_idcont(pTHX_ U8 *p)
1244 {
1245     if (*p == '_')
1246         return TRUE;
1247     if (!is_utf8_char(p))
1248         return FALSE;
1249     if (!PL_utf8_idcont)
1250         PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1251     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1252 }
1253
1254 bool
1255 Perl_is_utf8_alpha(pTHX_ U8 *p)
1256 {
1257     if (!is_utf8_char(p))
1258         return FALSE;
1259     if (!PL_utf8_alpha)
1260         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1261     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1262 }
1263
1264 bool
1265 Perl_is_utf8_ascii(pTHX_ U8 *p)
1266 {
1267     if (!is_utf8_char(p))
1268         return FALSE;
1269     if (!PL_utf8_ascii)
1270         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1271     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1272 }
1273
1274 bool
1275 Perl_is_utf8_space(pTHX_ U8 *p)
1276 {
1277     if (!is_utf8_char(p))
1278         return FALSE;
1279     if (!PL_utf8_space)
1280         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1281     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1282 }
1283
1284 bool
1285 Perl_is_utf8_digit(pTHX_ U8 *p)
1286 {
1287     if (!is_utf8_char(p))
1288         return FALSE;
1289     if (!PL_utf8_digit)
1290         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1291     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1292 }
1293
1294 bool
1295 Perl_is_utf8_upper(pTHX_ U8 *p)
1296 {
1297     if (!is_utf8_char(p))
1298         return FALSE;
1299     if (!PL_utf8_upper)
1300         PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1301     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1302 }
1303
1304 bool
1305 Perl_is_utf8_lower(pTHX_ U8 *p)
1306 {
1307     if (!is_utf8_char(p))
1308         return FALSE;
1309     if (!PL_utf8_lower)
1310         PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1311     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1312 }
1313
1314 bool
1315 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1316 {
1317     if (!is_utf8_char(p))
1318         return FALSE;
1319     if (!PL_utf8_cntrl)
1320         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1321     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1322 }
1323
1324 bool
1325 Perl_is_utf8_graph(pTHX_ U8 *p)
1326 {
1327     if (!is_utf8_char(p))
1328         return FALSE;
1329     if (!PL_utf8_graph)
1330         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1331     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1332 }
1333
1334 bool
1335 Perl_is_utf8_print(pTHX_ U8 *p)
1336 {
1337     if (!is_utf8_char(p))
1338         return FALSE;
1339     if (!PL_utf8_print)
1340         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1341     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1342 }
1343
1344 bool
1345 Perl_is_utf8_punct(pTHX_ U8 *p)
1346 {
1347     if (!is_utf8_char(p))
1348         return FALSE;
1349     if (!PL_utf8_punct)
1350         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1351     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1352 }
1353
1354 bool
1355 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1356 {
1357     if (!is_utf8_char(p))
1358         return FALSE;
1359     if (!PL_utf8_xdigit)
1360         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1361     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1362 }
1363
1364 bool
1365 Perl_is_utf8_mark(pTHX_ U8 *p)
1366 {
1367     if (!is_utf8_char(p))
1368         return FALSE;
1369     if (!PL_utf8_mark)
1370         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1371     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1372 }
1373
1374 /*
1375 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1376
1377 The "p" contains the pointer to the UTF-8 string encoding
1378 the character that is being converted.
1379
1380 The "ustrp" is a pointer to the character buffer to put the
1381 conversion result to.  The "lenp" is a pointer to the length
1382 of the result.
1383
1384 The "swashp" is a pointer to the swash to use.
1385
1386 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1387 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1388 but not always, a multicharacter mapping), is tried first.
1389
1390 The "special" is a string like "utf8::ToSpecLower", which means the
1391 hash %utf8::ToSpecLower.  The access to the hash is through
1392 Perl_to_utf8_case().
1393
1394 The "normal" is a string like "ToLower" which means the swash
1395 %utf8::ToLower.
1396
1397 =cut */
1398
1399 UV
1400 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1401 {
1402     UV uv0, uv1;
1403     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1404     STRLEN len = 0;
1405
1406     uv0 = utf8_to_uvchr(p, 0);
1407     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1408      * are necessary in EBCDIC, they are redundant no-ops
1409      * in ASCII-ish platforms, and hopefully optimized away. */
1410     uv1 = NATIVE_TO_UNI(uv0);
1411     uvuni_to_utf8(tmpbuf, uv1);
1412
1413     if (!*swashp) /* load on-demand */
1414          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1415
1416     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1417     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1418          /* It might be "special" (sometimes, but not always,
1419           * a multicharacter mapping) */
1420          HV *hv;
1421          SV **svp;
1422
1423          if ((hv  = get_hv(special, FALSE)) &&
1424              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1425              (*svp)) {
1426               char *s;
1427
1428               s = SvPV(*svp, len);
1429               if (len == 1)
1430                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1431               else {
1432 #ifdef EBCDIC
1433                    /* If we have EBCDIC we need to remap the characters
1434                     * since any characters in the low 256 are Unicode
1435                     * code points, not EBCDIC. */
1436                    U8 *t = (U8*)s, *tend = t + len, *d;
1437
1438                    d = tmpbuf;
1439                    if (SvUTF8(*svp)) {
1440                         STRLEN tlen = 0;
1441
1442                         while (t < tend) {
1443                              UV c = utf8_to_uvchr(t, &tlen);
1444                              if (tlen > 0) {
1445                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1446                                   t += tlen;
1447                              }
1448                              else
1449                                   break;
1450                         }
1451                    }
1452                    else {
1453                         while (t < tend) {
1454                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1455                              t++;
1456                         }
1457                    }
1458                    len = d - tmpbuf;
1459                    Copy(tmpbuf, ustrp, len, U8);
1460 #else
1461                    Copy(s, ustrp, len, U8);
1462 #endif
1463               }
1464          }
1465     }
1466
1467     if (!len && *swashp) {
1468          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1469
1470          if (uv2) {
1471               /* It was "normal" (a single character mapping). */
1472               UV uv3 = UNI_TO_NATIVE(uv2);
1473
1474               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1475          }
1476     }
1477
1478     if (!len) /* Neither: just copy. */
1479          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1480
1481     if (lenp)
1482          *lenp = len;
1483
1484     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1485 }
1486
1487 /*
1488 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1489
1490 Convert the UTF-8 encoded character at p to its uppercase version and
1491 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1492 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1493 the uppercase version may be longer than the original character.
1494
1495 The first character of the uppercased version is returned
1496 (but note, as explained above, that there may be more.)
1497
1498 =cut */
1499
1500 UV
1501 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1502 {
1503     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1504                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1505 }
1506
1507 /*
1508 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1509
1510 Convert the UTF-8 encoded character at p to its titlecase version and
1511 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1512 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1513 titlecase version may be longer than the original character.
1514
1515 The first character of the titlecased version is returned
1516 (but note, as explained above, that there may be more.)
1517
1518 =cut */
1519
1520 UV
1521 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1522 {
1523     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1524                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1525 }
1526
1527 /*
1528 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1529
1530 Convert the UTF-8 encoded character at p to its lowercase version and
1531 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1532 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1533 lowercase version may be longer than the original character.
1534
1535 The first character of the lowercased version is returned
1536 (but note, as explained above, that there may be more.)
1537
1538 =cut */
1539
1540 UV
1541 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1542 {
1543     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1544                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1545 }
1546
1547 /*
1548 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1549
1550 Convert the UTF-8 encoded character at p to its foldcase version and
1551 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1552 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1553 foldcase version may be longer than the original character (up to
1554 three characters).
1555
1556 The first character of the foldcased version is returned
1557 (but note, as explained above, that there may be more.)
1558
1559 =cut */
1560
1561 UV
1562 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1563 {
1564     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1565                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1566 }
1567
1568 /* a "swash" is a swatch hash */
1569
1570 SV*
1571 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1572 {
1573     SV* retval;
1574     SV* tokenbufsv = sv_newmortal();
1575     dSP;
1576     size_t pkg_len = strlen(pkg);
1577     size_t name_len = strlen(name);
1578     HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
1579     SV* errsv_save;
1580
1581     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1582         ENTER;
1583         errsv_save = newSVsv(ERRSV);
1584         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1585                          Nullsv);
1586         if (!SvTRUE(ERRSV))
1587             sv_setsv(ERRSV, errsv_save);
1588         SvREFCNT_dec(errsv_save);
1589         LEAVE;
1590     }
1591     SPAGAIN;
1592     PUSHSTACKi(PERLSI_MAGIC);
1593     PUSHMARK(SP);
1594     EXTEND(SP,5);
1595     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1596     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1597     PUSHs(listsv);
1598     PUSHs(sv_2mortal(newSViv(minbits)));
1599     PUSHs(sv_2mortal(newSViv(none)));
1600     PUTBACK;
1601     ENTER;
1602     SAVEI32(PL_hints);
1603     PL_hints = 0;
1604     save_re_context();
1605     if (IN_PERL_COMPILETIME) {
1606         /* XXX ought to be handled by lex_start */
1607         SAVEI32(PL_in_my);
1608         PL_in_my = 0;
1609         sv_setpv(tokenbufsv, PL_tokenbuf);
1610     }
1611     errsv_save = newSVsv(ERRSV);
1612     if (call_method("SWASHNEW", G_SCALAR))
1613         retval = newSVsv(*PL_stack_sp--);
1614     else
1615         retval = &PL_sv_undef;
1616     if (!SvTRUE(ERRSV))
1617         sv_setsv(ERRSV, errsv_save);
1618     SvREFCNT_dec(errsv_save);
1619     LEAVE;
1620     POPSTACK;
1621     if (IN_PERL_COMPILETIME) {
1622         STRLEN len;
1623         char* pv = SvPV(tokenbufsv, len);
1624
1625         Copy(pv, PL_tokenbuf, len+1, char);
1626         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1627     }
1628     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1629         if (SvPOK(retval))
1630             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1631                        retval);
1632         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1633     }
1634     return retval;
1635 }
1636
1637
1638 /* This API is wrong for special case conversions since we may need to
1639  * return several Unicode characters for a single Unicode character
1640  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1641  * the lower-level routine, and it is similarly broken for returning
1642  * multiple values.  --jhi */
1643 UV
1644 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1645 {
1646     HV* hv = (HV*)SvRV(sv);
1647     U32 klen;
1648     U32 off;
1649     STRLEN slen;
1650     STRLEN needents;
1651     U8 *tmps = NULL;
1652     U32 bit;
1653     SV *retval;
1654     U8 tmputf8[2];
1655     UV c = NATIVE_TO_ASCII(*ptr);
1656
1657     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1658         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1659         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1660         ptr = tmputf8;
1661     }
1662     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1663      * then the "swatch" is a vec() for al the chars which start
1664      * with 0xAA..0xYY
1665      * So the key in the hash (klen) is length of encoded char -1
1666      */
1667     klen = UTF8SKIP(ptr) - 1;
1668     off  = ptr[klen];
1669
1670     if (klen == 0)
1671      {
1672       /* If char in invariant then swatch is for all the invariant chars
1673        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1674        */
1675       needents = UTF_CONTINUATION_MARK;
1676       off      = NATIVE_TO_UTF(ptr[klen]);
1677      }
1678     else
1679      {
1680       /* If char is encoded then swatch is for the prefix */
1681       needents = (1 << UTF_ACCUMULATION_SHIFT);
1682       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1683      }
1684
1685     /*
1686      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1687      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1688      * it's nothing to sniff at.)  Pity we usually come through at least
1689      * two function calls to get here...
1690      *
1691      * NB: this code assumes that swatches are never modified, once generated!
1692      */
1693
1694     if (hv   == PL_last_swash_hv &&
1695         klen == PL_last_swash_klen &&
1696         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1697     {
1698         tmps = PL_last_swash_tmps;
1699         slen = PL_last_swash_slen;
1700     }
1701     else {
1702         /* Try our second-level swatch cache, kept in a hash. */
1703         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1704
1705         /* If not cached, generate it via utf8::SWASHGET */
1706         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1707             dSP;
1708             /* We use utf8n_to_uvuni() as we want an index into
1709                Unicode tables, not a native character number.
1710              */
1711             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1712                                            ckWARN(WARN_UTF8) ?
1713                                            0 : UTF8_ALLOW_ANY);
1714             SV *errsv_save;
1715             ENTER;
1716             SAVETMPS;
1717             save_re_context();
1718             PUSHSTACKi(PERLSI_MAGIC);
1719             PUSHMARK(SP);
1720             EXTEND(SP,3);
1721             PUSHs((SV*)sv);
1722             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1723             PUSHs(sv_2mortal(newSViv((klen) ?
1724                                      (code_point & ~(needents - 1)) : 0)));
1725             PUSHs(sv_2mortal(newSViv(needents)));
1726             PUTBACK;
1727             errsv_save = newSVsv(ERRSV);
1728             if (call_method("SWASHGET", G_SCALAR))
1729                 retval = newSVsv(*PL_stack_sp--);
1730             else
1731                 retval = &PL_sv_undef;
1732             if (!SvTRUE(ERRSV))
1733                 sv_setsv(ERRSV, errsv_save);
1734             SvREFCNT_dec(errsv_save);
1735             POPSTACK;
1736             FREETMPS;
1737             LEAVE;
1738             if (IN_PERL_COMPILETIME)
1739                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1740
1741             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1742
1743             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1744                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1745         }
1746
1747         PL_last_swash_hv = hv;
1748         PL_last_swash_klen = klen;
1749         PL_last_swash_tmps = tmps;
1750         PL_last_swash_slen = slen;
1751         if (klen)
1752             Copy(ptr, PL_last_swash_key, klen, U8);
1753     }
1754
1755     switch ((int)((slen << 3) / needents)) {
1756     case 1:
1757         bit = 1 << (off & 7);
1758         off >>= 3;
1759         return (tmps[off] & bit) != 0;
1760     case 8:
1761         return tmps[off];
1762     case 16:
1763         off <<= 1;
1764         return (tmps[off] << 8) + tmps[off + 1] ;
1765     case 32:
1766         off <<= 2;
1767         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1768     }
1769     Perl_croak(aTHX_ "panic: swash_fetch");
1770     return 0;
1771 }
1772
1773
1774 /*
1775 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1776
1777 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1778 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
1779 bytes available. The return value is the pointer to the byte after the
1780 end of the new character. In other words,
1781
1782     d = uvchr_to_utf8(d, uv);
1783
1784 is the recommended wide native character-aware way of saying
1785
1786     *(d++) = uv;
1787
1788 =cut
1789 */
1790
1791 /* On ASCII machines this is normally a macro but we want a
1792    real function in case XS code wants it
1793 */
1794 #undef Perl_uvchr_to_utf8
1795 U8 *
1796 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1797 {
1798     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1799 }
1800
1801 U8 *
1802 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1803 {
1804     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1805 }
1806
1807 /*
1808 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1809
1810 Returns the native character value of the first character in the string C<s>
1811 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1812 length, in bytes, of that character.
1813
1814 Allows length and flags to be passed to low level routine.
1815
1816 =cut
1817 */
1818 /* On ASCII machines this is normally a macro but we want
1819    a real function in case XS code wants it
1820 */
1821 #undef Perl_utf8n_to_uvchr
1822 UV
1823 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1824 {
1825     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1826     return UNI_TO_NATIVE(uv);
1827 }
1828
1829 /*
1830 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1831
1832 Build to the scalar dsv a displayable version of the string spv,
1833 length len, the displayable version being at most pvlim bytes long
1834 (if longer, the rest is truncated and "..." will be appended).
1835
1836 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1837 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1838 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1839 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1840 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1841 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1842
1843 The pointer to the PV of the dsv is returned.
1844
1845 =cut */
1846 char *
1847 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1848 {
1849     int truncated = 0;
1850     char *s, *e;
1851
1852     sv_setpvn(dsv, "", 0);
1853     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1854          UV u;
1855           /* This serves double duty as a flag and a character to print after
1856              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1857           */
1858          char ok = 0;
1859
1860          if (pvlim && SvCUR(dsv) >= pvlim) {
1861               truncated++;
1862               break;
1863          }
1864          u = utf8_to_uvchr((U8*)s, 0);
1865          if (u < 256) {
1866              unsigned char c = u & 0xFF;
1867              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1868                  switch (c) {
1869                  case '\n':
1870                      ok = 'n'; break;
1871                  case '\r':
1872                      ok = 'r'; break;
1873                  case '\t':
1874                      ok = 't'; break;
1875                  case '\f':
1876                      ok = 'f'; break;
1877                  case '\a':
1878                      ok = 'a'; break;
1879                  case '\\':
1880                      ok = '\\'; break;
1881                  default: break;
1882                  }
1883                  if (ok) {
1884                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1885                  }
1886              }
1887              /* isPRINT() is the locale-blind version. */
1888              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1889                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1890                  ok = 1;
1891              }
1892          }
1893          if (!ok)
1894              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1895     }
1896     if (truncated)
1897          sv_catpvn(dsv, "...", 3);
1898
1899     return SvPVX(dsv);
1900 }
1901
1902 /*
1903 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1904
1905 Build to the scalar dsv a displayable version of the scalar sv,
1906 the displayable version being at most pvlim bytes long
1907 (if longer, the rest is truncated and "..." will be appended).
1908
1909 The flags argument is as in pv_uni_display().
1910
1911 The pointer to the PV of the dsv is returned.
1912
1913 =cut */
1914 char *
1915 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1916 {
1917      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1918                                 pvlim, flags);
1919 }
1920
1921 /*
1922 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1923
1924 Return true if the strings s1 and s2 differ case-insensitively, false
1925 if not (if they are equal case-insensitively).  If u1 is true, the
1926 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1927 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1928 are false, the respective string is assumed to be in native 8-bit
1929 encoding.
1930
1931 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1932 in there (they will point at the beginning of the I<next> character).
1933 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1934 pointers beyond which scanning will not continue under any
1935 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1936 s2+l2 will be used as goal end pointers that will also stop the scan,
1937 and which qualify towards defining a successful match: all the scans
1938 that define an explicit length must reach their goal pointers for
1939 a match to succeed).
1940
1941 For case-insensitiveness, the "casefolding" of Unicode is used
1942 instead of upper/lowercasing both the characters, see
1943 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1944
1945 =cut */
1946 I32
1947 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1948 {
1949      register U8 *p1  = (U8*)s1;
1950      register U8 *p2  = (U8*)s2;
1951      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1952      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1953      STRLEN n1 = 0, n2 = 0;
1954      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1955      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
1956      U8 natbuf[1+1];
1957      STRLEN foldlen1, foldlen2;
1958      bool match;
1959
1960      if (pe1)
1961           e1 = *(U8**)pe1;
1962      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
1963           f1 = (U8*)s1 + l1;
1964      if (pe2)
1965           e2 = *(U8**)pe2;
1966      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
1967           f2 = (U8*)s2 + l2;
1968
1969      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1970           return 1; /* mismatch; possible infinite loop or false positive */
1971
1972      if (!u1 || !u2)
1973           natbuf[1] = 0; /* Need to terminate the buffer. */
1974
1975      while ((e1 == 0 || p1 < e1) &&
1976             (f1 == 0 || p1 < f1) &&
1977             (e2 == 0 || p2 < e2) &&
1978             (f2 == 0 || p2 < f2)) {
1979           if (n1 == 0) {
1980                if (u1)
1981                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1982                else {
1983                     natbuf[0] = *p1;
1984                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1985                }
1986                q1 = foldbuf1;
1987                n1 = foldlen1;
1988           }
1989           if (n2 == 0) {
1990                if (u2)
1991                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1992                else {
1993                     natbuf[0] = *p2;
1994                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1995                }
1996                q2 = foldbuf2;
1997                n2 = foldlen2;
1998           }
1999           while (n1 && n2) {
2000                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2001                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2002                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2003                    return 1; /* mismatch */
2004                n1 -= UTF8SKIP(q1);
2005                q1 += UTF8SKIP(q1);
2006                n2 -= UTF8SKIP(q2);
2007                q2 += UTF8SKIP(q2);
2008           }
2009           if (n1 == 0)
2010                p1 += u1 ? UTF8SKIP(p1) : 1;
2011           if (n2 == 0)
2012                p2 += u2 ? UTF8SKIP(p2) : 1;
2013
2014      }
2015
2016      /* A match is defined by all the scans that specified
2017       * an explicit length reaching their final goals. */
2018      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2019
2020      if (match) {
2021           if (pe1)
2022                *pe1 = (char*)p1;
2023           if (pe2)
2024                *pe2 = (char*)p2;
2025      }
2026
2027      return match ? 0 : 1; /* 0 match, 1 mismatch */
2028 }
2029
2030 /*
2031  * Local variables:
2032  * c-indentation-style: bsd
2033  * c-basic-offset: 4
2034  * indent-tabs-mode: t
2035  * End:
2036  *
2037  * vim: shiftwidth=4:
2038 */