utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
   4  *    others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
  29
  30 /*
  31 =head1 Unicode Support
  32
  33 This file contains various utility functions for manipulating UTF8-encoded
  34 strings. For the uninitiated, this is a method of representing arbitrary
  35 Unicode characters as a variable number of bytes, in such a way that
  36 characters in the ASCII range are unmodified, and a zero byte never appears
  37 within non-zero characters.
  38
  39 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  40
  41 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
   42 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
  43 bytes available. The return value is the pointer to the byte after the
  44 end of the new character. In other words,
  45
  46     d = uvuni_to_utf8_flags(d, uv, flags);
  47
  48 or, in most cases,
  49
  50     d = uvuni_to_utf8(d, uv);
  51
  52 (which is equivalent to)
  53
  54     d = uvuni_to_utf8_flags(d, uv, 0);
  55
  56 is the recommended Unicode-aware way of saying
  57
  58     *(d++) = uv;
  59
  60 =cut
  61 */
  62
  63 U8 *
  64 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  65 {
  66     if (ckWARN(WARN_UTF8)) {
  67          if (UNICODE_IS_SURROGATE(uv) &&
  68              !(flags & UNICODE_ALLOW_SURROGATE))
  69               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  70          else if (
  71                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  72                     !(flags & UNICODE_ALLOW_FDD0))
  73                    ||
  74                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  75                     !(flags & UNICODE_ALLOW_FFFF))) &&
  76                   /* UNICODE_ALLOW_SUPER includes
  77                    * FFFEs and FFFFs beyond 0x10FFFF. */
  78                   ((uv <= PERL_UNICODE_MAX) ||
  79                    !(flags & UNICODE_ALLOW_SUPER))
  80                   )
  81               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  82                          "Unicode character 0x%04"UVxf" is illegal", uv);
  83     }
  84     if (UNI_IS_INVARIANT(uv)) {
  85         *d++ = (U8)UTF_TO_NATIVE(uv);
  86         return d;
  87     }
  88 #if defined(EBCDIC)
  89     else {
  90         STRLEN len  = UNISKIP(uv);
  91         U8 *p = d+len-1;
  92         while (p > d) {
  93             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  94             uv >>= UTF_ACCUMULATION_SHIFT;
  95         }
  96         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  97         return d+len;
  98     }
  99 #else /* Non loop style */
 100     if (uv < 0x800) {
 101         *d++ = (U8)(( uv >>  6)         | 0xc0);
 102         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 103         return d;
 104     }
 105     if (uv < 0x10000) {
 106         *d++ = (U8)(( uv >> 12)         | 0xe0);
 107         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 108         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 109         return d;
 110     }
 111     if (uv < 0x200000) {
 112         *d++ = (U8)(( uv >> 18)         | 0xf0);
 113         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 114         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 115         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 116         return d;
 117     }
 118     if (uv < 0x4000000) {
 119         *d++ = (U8)(( uv >> 24)         | 0xf8);
 120         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 121         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 123         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 124         return d;
 125     }
 126     if (uv < 0x80000000) {
 127         *d++ = (U8)(( uv >> 30)         | 0xfc);
 128         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 129         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 132         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 133         return d;
 134     }
 135 #ifdef HAS_QUAD
 136     if (uv < UTF8_QUAD_MAX)
 137 #endif
 138     {
 139         *d++ =                            0xfe; /* Can't match U+FEFF! */
 140         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 141         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 145         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 146         return d;
 147     }
 148 #ifdef HAS_QUAD
 149     {
 150         *d++ =                            0xff;         /* Can't match U+FFFE! */
 151         *d++ =                            0x80;         /* 6 Reserved bits */
 152         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 153         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 154         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 162         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 163         return d;
 164     }
 165 #endif
 166 #endif /* Loop style */
 167 }
 168
 169 U8 *
 170 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 171 {
 172     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 173 }
 174
 175
 176 /*
 177 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 178
 179 Tests if some arbitrary number of bytes begins in a valid UTF-8
 180 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 181 UTF-8 character.  The actual number of bytes in the UTF-8 character
 182 will be returned if it is valid, otherwise 0.
 183
 184 =cut */
 185 STRLEN
 186 Perl_is_utf8_char(pTHX_ const U8 *s)
 187 {
 188     U8 u = *s;
 189     STRLEN slen, len;
 190     UV uv, ouv;
 191
 192     if (UTF8_IS_INVARIANT(u))
 193         return 1;
 194
 195     if (!UTF8_IS_START(u))
 196         return 0;
 197
 198     len = UTF8SKIP(s);
 199
 200     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 201         return 0;
 202
 203     slen = len - 1;
 204     s++;
 205     u &= UTF_START_MASK(len);
 206     uv  = u;
 207     ouv = uv;
 208     while (slen--) {
 209         if (!UTF8_IS_CONTINUATION(*s))
 210             return 0;
 211         uv = UTF8_ACCUMULATE(uv, *s);
 212         if (uv < ouv)
 213             return 0;
 214         ouv = uv;
 215         s++;
 216     }
 217
 218     if ((STRLEN)UNISKIP(uv) < len)
 219         return 0;
 220
 221     return len;
 222 }
 223
 224 /*
 225 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 226
 227 Returns true if first C<len> bytes of the given string form a valid
 228 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 229 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 230 because a valid ASCII string is a valid UTF-8 string.
 231
 232 =cut
 233 */
 234
 235 bool
 236 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 237 {
 238     const U8* x = s;
 239     const U8* send;
 240     STRLEN c;
 241
 242     if (!len && s)
 243         len = strlen((const char *)s);
 244     send = s + len;
 245
 246     while (x < send) {
 247          /* Inline the easy bits of is_utf8_char() here for speed... */
 248          if (UTF8_IS_INVARIANT(*x))
 249               c = 1;
 250          else if (!UTF8_IS_START(*x))
 251               return FALSE;
 252          else {
 253               /* ... and call is_utf8_char() only if really needed. */
 254               c = is_utf8_char(x);
 255               if (!c)
 256                    return FALSE;
 257          }
 258         x += c;
 259     }
 260     if (x != send)
 261         return FALSE;
 262
 263     return TRUE;
 264 }
 265
 266 /*
 267 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **p
 268
  269 Like is_utf8_string but stores the location of the failure in
 270 the last argument.
 271
 272 =cut
 273 */
 274
 275 bool
 276 Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p)
 277 {
 278     const U8* x = s;
 279     const U8* send;
 280     STRLEN c;
 281
 282     if (!len && s)
 283         len = strlen((const char *)s);
 284     send = s + len;
 285
 286     while (x < send) {
 287          /* Inline the easy bits of is_utf8_char() here for speed... */
 288          if (UTF8_IS_INVARIANT(*x))
 289               c = 1;
 290          else if (!UTF8_IS_START(*x)) {
 291               if (p)
 292                   *p = x;
 293               return FALSE;
 294          }
 295          else {
 296               /* ... and call is_utf8_char() only if really needed. */
 297               c = is_utf8_char(x);
 298               if (!c) {
 299                    if (p)
 300                       *p = x;
 301                    return FALSE;
 302               }
 303          }
 304         x += c;
 305     }
 306     if (x != send) {
 307        if (p)
 308            *p = x;
 309         return FALSE;
 310     }
 311
 312     return TRUE;
 313 }
 314
 315 /*
 316 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 317
 318 Bottom level UTF-8 decode routine.
 319 Returns the unicode code point value of the first character in the string C<s>
 320 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 321 C<retlen> will be set to the length, in bytes, of that character.
 322
 323 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 324 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 325 it is assumed that the caller will raise a warning, and this function
 326 will silently just set C<retlen> to C<-1> and return zero.  If the
 327 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 328 malformations will be given, C<retlen> will be set to the expected
 329 length of the UTF-8 character in bytes, and zero will be returned.
 330
 331 The C<flags> can also contain various flags to allow deviations from
 332 the strict UTF-8 encoding (see F<utf8.h>).
 333
 334 Most code should use utf8_to_uvchr() rather than call this directly.
 335
 336 =cut
 337 */
 338
 339 UV
 340 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 341 {
 342     const U8 *s0 = s;
 343     UV uv = *s, ouv = 0;
 344     STRLEN len = 1;
 345     const bool dowarn = ckWARN_d(WARN_UTF8);
 346     const UV startbyte = *s;
 347     STRLEN expectlen = 0;
 348     U32 warning = 0;
 349
 350 /* This list is a superset of the UTF8_ALLOW_XXX. */
 351
 352 #define UTF8_WARN_EMPTY                          1
 353 #define UTF8_WARN_CONTINUATION                   2
 354 #define UTF8_WARN_NON_CONTINUATION               3
 355 #define UTF8_WARN_FE_FF                          4
 356 #define UTF8_WARN_SHORT                          5
 357 #define UTF8_WARN_OVERFLOW                       6
 358 #define UTF8_WARN_SURROGATE                      7
 359 #define UTF8_WARN_LONG                           8
 360 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 361
 362     if (curlen == 0 &&
 363         !(flags & UTF8_ALLOW_EMPTY)) {
 364         warning = UTF8_WARN_EMPTY;
 365         goto malformed;
 366     }
 367
 368     if (UTF8_IS_INVARIANT(uv)) {
 369         if (retlen)
 370             *retlen = 1;
 371         return (UV) (NATIVE_TO_UTF(*s));
 372     }
 373
 374     if (UTF8_IS_CONTINUATION(uv) &&
 375         !(flags & UTF8_ALLOW_CONTINUATION)) {
 376         warning = UTF8_WARN_CONTINUATION;
 377         goto malformed;
 378     }
 379
 380     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 381         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 382         warning = UTF8_WARN_NON_CONTINUATION;
 383         goto malformed;
 384     }
 385
 386 #ifdef EBCDIC
 387     uv = NATIVE_TO_UTF(uv);
 388 #else
 389     if ((uv == 0xfe || uv == 0xff) &&
 390         !(flags & UTF8_ALLOW_FE_FF)) {
 391         warning = UTF8_WARN_FE_FF;
 392         goto malformed;
 393     }
 394 #endif
 395
 396     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 397     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 398     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 399     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 400 #ifdef EBCDIC
 401     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 402     else                        { len =  7; uv &= 0x01; }
 403 #else
 404     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 405     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 406     else                        { len = 13; uv = 0; } /* whoa! */
 407 #endif
 408
 409     if (retlen)
 410         *retlen = len;
 411
 412     expectlen = len;
 413
 414     if ((curlen < expectlen) &&
 415         !(flags & UTF8_ALLOW_SHORT)) {
 416         warning = UTF8_WARN_SHORT;
 417         goto malformed;
 418     }
 419
 420     len--;
 421     s++;
 422     ouv = uv;
 423
 424     while (len--) {
 425         if (!UTF8_IS_CONTINUATION(*s) &&
 426             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 427             s--;
 428             warning = UTF8_WARN_NON_CONTINUATION;
 429             goto malformed;
 430         }
 431         else
 432             uv = UTF8_ACCUMULATE(uv, *s);
 433         if (!(uv > ouv)) {
 434             /* These cannot be allowed. */
 435             if (uv == ouv) {
 436                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 437                     warning = UTF8_WARN_LONG;
 438                     goto malformed;
 439                 }
 440             }
 441             else { /* uv < ouv */
 442                 /* This cannot be allowed. */
 443                 warning = UTF8_WARN_OVERFLOW;
 444                 goto malformed;
 445             }
 446         }
 447         s++;
 448         ouv = uv;
 449     }
 450
 451     if (UNICODE_IS_SURROGATE(uv) &&
 452         !(flags & UTF8_ALLOW_SURROGATE)) {
 453         warning = UTF8_WARN_SURROGATE;
 454         goto malformed;
 455     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 456                !(flags & UTF8_ALLOW_LONG)) {
 457         warning = UTF8_WARN_LONG;
 458         goto malformed;
 459     } else if (UNICODE_IS_ILLEGAL(uv) &&
 460                !(flags & UTF8_ALLOW_FFFF)) {
 461         warning = UTF8_WARN_FFFF;
 462         goto malformed;
 463     }
 464
 465     return uv;
 466
 467 malformed:
 468
 469     if (flags & UTF8_CHECK_ONLY) {
 470         if (retlen)
 471             *retlen = -1;
 472         return 0;
 473     }
 474
 475     if (dowarn) {
 476         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 477
 478         switch (warning) {
 479         case 0: /* Intentionally empty. */ break;
 480         case UTF8_WARN_EMPTY:
 481             Perl_sv_catpv(aTHX_ sv, "(empty string)");
 482             break;
 483         case UTF8_WARN_CONTINUATION:
 484             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 485             break;
 486         case UTF8_WARN_NON_CONTINUATION:
 487             if (s == s0)
 488                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 489                            (UV)s[1], startbyte);
 490             else
 491                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 492                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 493
 494             break;
 495         case UTF8_WARN_FE_FF:
 496             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 497             break;
 498         case UTF8_WARN_SHORT:
 499             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 500                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 501             expectlen = curlen;         /* distance for caller to skip */
 502             break;
 503         case UTF8_WARN_OVERFLOW:
 504             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 505                            ouv, *s, startbyte);
 506             break;
 507         case UTF8_WARN_SURROGATE:
 508             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 509             break;
 510         case UTF8_WARN_LONG:
 511             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 512                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 513             break;
 514         case UTF8_WARN_FFFF:
 515             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 516             break;
 517         default:
 518             Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
 519             break;
 520         }
 521
 522         if (warning) {
 523             char *s = SvPVX(sv);
 524
 525             if (PL_op)
 526                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 527                             "%s in %s", s,  OP_DESC(PL_op));
 528             else
 529                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 530         }
 531     }
 532
 533     if (retlen)
 534         *retlen = expectlen ? expectlen : len;
 535
 536     return 0;
 537 }
 538
 539 /*
 540 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 541
 542 Returns the native character value of the first character in the string C<s>
 543 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 544 length, in bytes, of that character.
 545
 546 If C<s> does not point to a well-formed UTF-8 character, zero is
 547 returned and retlen is set, if possible, to -1.
 548
 549 =cut
 550 */
 551
 552 UV
 553 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 554 {
 555     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
 556                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 557 }
 558
 559 /*
 560 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 561
 562 Returns the Unicode code point of the first character in the string C<s>
 563 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 564 length, in bytes, of that character.
 565
 566 This function should only be used when returned UV is considered
 567 an index into the Unicode semantic tables (e.g. swashes).
 568
 569 If C<s> does not point to a well-formed UTF-8 character, zero is
 570 returned and retlen is set, if possible, to -1.
 571
 572 =cut
 573 */
 574
 575 UV
 576 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 577 {
 578     /* Call the low level routine asking for checks */
 579     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 580                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 581 }
 582
 583 /*
 584 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 585
  586 Return the length of the UTF-8 char encoded string C<s> in characters.
  587 Stops at C<e> (exclusive).  If C<e E<lt> s> or if the scan would end
  588 up past C<e>, warns and returns the count so far.
 589
 590 =cut
 591 */
 592
 593 STRLEN
 594 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 595 {
 596     STRLEN len = 0;
 597
 598     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 599      * the bitops (especially ~) can create illegal UTF-8.
 600      * In other words: in Perl UTF-8 is not just for Unicode. */
 601
 602     if (e < s) {
 603         if (ckWARN_d(WARN_UTF8)) {
 604             if (PL_op)
 605                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 606                             "%s in %s", unees, OP_DESC(PL_op));
 607             else
 608                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 609         }
 610         return 0;
 611     }
 612     while (s < e) {
 613         U8 t = UTF8SKIP(s);
 614
 615         if (e - s < t) {
 616             if (ckWARN_d(WARN_UTF8)) {
 617                 if (PL_op)
 618                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 619                                 unees, OP_DESC(PL_op));
 620                 else
 621                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 622             }
 623             return len;
 624         }
 625         s += t;
 626         len++;
 627     }
 628
 629     return len;
 630 }
 631
 632 /*
 633 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 634
 635 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 636 and C<b>.
 637
 638 WARNING: use only if you *know* that the pointers point inside the
 639 same UTF-8 buffer.
 640
 641 =cut
 642 */
 643
 644 IV
 645 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 646 {
 647     IV off = 0;
 648
 649     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 650      * the bitops (especially ~) can create illegal UTF-8.
 651      * In other words: in Perl UTF-8 is not just for Unicode. */
 652
 653     if (a < b) {
 654         while (a < b) {
 655             const U8 c = UTF8SKIP(a);
 656
 657             if (b - a < c) {
 658                 if (ckWARN_d(WARN_UTF8)) {
 659                     if (PL_op)
 660                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 661                                     "%s in %s", unees, OP_DESC(PL_op));
 662                     else
 663                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 664                 }
 665                 return off;
 666             }
 667             a += c;
 668             off--;
 669         }
 670     }
 671     else {
 672         while (b < a) {
 673             U8 c = UTF8SKIP(b);
 674
 675             if (a - b < c) {
 676                 if (ckWARN_d(WARN_UTF8)) {
 677                     if (PL_op)
 678                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 679                                     "%s in %s", unees, OP_DESC(PL_op));
 680                     else
 681                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 682                 }
 683                 return off;
 684             }
 685             b += c;
 686             off++;
 687         }
 688     }
 689
 690     return off;
 691 }
 692
 693 /*
 694 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 695
 696 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 697 forward or backward.
 698
 699 WARNING: do not use the following unless you *know* C<off> is within
 700 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 701 on the first byte of character or just after the last byte of a character.
 702
 703 =cut
 704 */
 705
 706 U8 *
 707 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 708 {
 709     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 710      * the bitops (especially ~) can create illegal UTF-8.
 711      * In other words: in Perl UTF-8 is not just for Unicode. */
 712
 713     if (off >= 0) {
 714         while (off--)
 715             s += UTF8SKIP(s);
 716     }
 717     else {
 718         while (off++) {
 719             s--;
 720             while (UTF8_IS_CONTINUATION(*s))
 721                 s--;
 722         }
 723     }
 724     return s;
 725 }
 726
 727 /*
 728 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 729
 730 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 731 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 732 updates len to contain the new length.
 733 Returns zero on failure, setting C<len> to -1.
 734
 735 =cut
 736 */
 737
 738 U8 *
 739 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 740 {
 741     U8 *send;
 742     U8 *d;
 743     U8 *save = s;
 744
 745     /* ensure valid UTF-8 and chars < 256 before updating string */
 746     for (send = s + *len; s < send; ) {
 747         U8 c = *s++;
 748
 749         if (!UTF8_IS_INVARIANT(c) &&
 750             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 751              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 752             *len = -1;
 753             return 0;
 754         }
 755     }
 756
 757     d = s = save;
 758     while (s < send) {
 759         STRLEN ulen;
 760         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 761         s += ulen;
 762     }
 763     *d = '\0';
 764     *len = d - save;
 765     return save;
 766 }
 767
 768 /*
 769 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 770
 771 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 772 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 773 the newly-created string, and updates C<len> to contain the new
 774 length.  Returns the original string if no conversion occurs, C<len>
 775 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 776 0 if C<s> is converted or contains all 7bit characters.
 777
 778 =cut
 779 */
 780
 781 U8 *
 782 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 783 {
 784     U8 *d;
 785     const U8 *start = s;
 786     const U8 *send;
 787     I32 count = 0;
 788
 789     if (!*is_utf8)
 790         return (U8 *)start;
 791
 792     /* ensure valid UTF-8 and chars < 256 before converting string */
 793     for (send = s + *len; s < send;) {
 794         U8 c = *s++;
 795         if (!UTF8_IS_INVARIANT(c)) {
 796             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 797                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 798                 count++;
 799             else
 800                 return (U8 *)start;
 801         }
 802     }
 803
 804     *is_utf8 = 0;
 805
 806     Newz(801, d, (*len) - count + 1, U8);
 807     s = start; start = d;
 808     while (s < send) {
 809         U8 c = *s++;
 810         if (!UTF8_IS_INVARIANT(c)) {
 811             /* Then it is two-byte encoded */
 812             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 813             c = ASCII_TO_NATIVE(c);
 814         }
 815         *d++ = c;
 816     }
 817     *d = '\0';
 818     *len = d - start;
 819     return (U8 *)start;
 820 }
 821
 822 /*
 823 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 824
 825 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 826 Returns a pointer to the newly-created string, and sets C<len> to
 827 reflect the new length.
 828
 829 If you want to convert to UTF-8 from other encodings than ASCII,
 830 see sv_recode_to_utf8().
 831
 832 =cut
 833 */
 834
 835 U8*
 836 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 837 {
 838     const U8 * const send = s + (*len);
 839     U8 *d;
 840     U8 *dst;
 841
 842     Newz(801, d, (*len) * 2 + 1, U8);
 843     dst = d;
 844
 845     while (s < send) {
 846         const UV uv = NATIVE_TO_ASCII(*s++);
 847         if (UNI_IS_INVARIANT(uv))
 848             *d++ = (U8)UTF_TO_NATIVE(uv);
 849         else {
 850             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 851             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 852         }
 853     }
 854     *d = '\0';
 855     *len = d-dst;
 856     return dst;
 857 }
 858
 859 /*
 860  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 861  *
 862  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 863  * We optimize for native, for obvious reasons. */
 864
 865 U8*
 866 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 867 {
 868     U8* pend;
 869     U8* dstart = d;
 870
 871     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 872          d[0] = 0;
 873          *newlen = 1;
 874          return d;
 875     }
 876
 877     if (bytelen & 1)
 878         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 879
 880     pend = p + bytelen;
 881
 882     while (p < pend) {
 883         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 884         p += 2;
 885         if (uv < 0x80) {
 886             *d++ = (U8)uv;
 887             continue;
 888         }
 889         if (uv < 0x800) {
 890             *d++ = (U8)(( uv >>  6)         | 0xc0);
 891             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 892             continue;
 893         }
 894         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 895             UV low = (p[0] << 8) + p[1];
 896             p += 2;
 897             if (low < 0xdc00 || low >= 0xdfff)
 898                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 899             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 900         }
 901         if (uv < 0x10000) {
 902             *d++ = (U8)(( uv >> 12)         | 0xe0);
 903             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 904             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 905             continue;
 906         }
 907         else {
 908             *d++ = (U8)(( uv >> 18)         | 0xf0);
 909             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 910             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 911             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 912             continue;
 913         }
 914     }
 915     *newlen = d - dstart;
 916     return d;
 917 }
 918
 919 /* Note: this one is slightly destructive of the source. */
 920
 921 U8*
 922 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 923 {
 924     U8* s = (U8*)p;
 925     U8* send = s + bytelen;
 926     while (s < send) {
 927         U8 tmp = s[0];
 928         s[0] = s[1];
 929         s[1] = tmp;
 930         s += 2;
 931     }
 932     return utf16_to_utf8(p, d, bytelen, newlen);
 933 }
 934
 935 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 936
 937 bool
 938 Perl_is_uni_alnum(pTHX_ UV c)
 939 {
 940     U8 tmpbuf[UTF8_MAXBYTES+1];
 941     uvchr_to_utf8(tmpbuf, c);
 942     return is_utf8_alnum(tmpbuf);
 943 }
 944
 945 bool
 946 Perl_is_uni_alnumc(pTHX_ UV c)
 947 {
 948     U8 tmpbuf[UTF8_MAXBYTES+1];
 949     uvchr_to_utf8(tmpbuf, c);
 950     return is_utf8_alnumc(tmpbuf);
 951 }
 952
 953 bool
 954 Perl_is_uni_idfirst(pTHX_ UV c)
 955 {
 956     U8 tmpbuf[UTF8_MAXBYTES+1];
 957     uvchr_to_utf8(tmpbuf, c);
 958     return is_utf8_idfirst(tmpbuf);
 959 }
 960
 961 bool
 962 Perl_is_uni_alpha(pTHX_ UV c)
 963 {
 964     U8 tmpbuf[UTF8_MAXBYTES+1];
 965     uvchr_to_utf8(tmpbuf, c);
 966     return is_utf8_alpha(tmpbuf);
 967 }
 968
 969 bool
 970 Perl_is_uni_ascii(pTHX_ UV c)
 971 {
 972     U8 tmpbuf[UTF8_MAXBYTES+1];
 973     uvchr_to_utf8(tmpbuf, c);
 974     return is_utf8_ascii(tmpbuf);
 975 }
 976
 977 bool
 978 Perl_is_uni_space(pTHX_ UV c)
 979 {
 980     U8 tmpbuf[UTF8_MAXBYTES+1];
 981     uvchr_to_utf8(tmpbuf, c);
 982     return is_utf8_space(tmpbuf);
 983 }
 984
 985 bool
 986 Perl_is_uni_digit(pTHX_ UV c)
 987 {
 988     U8 tmpbuf[UTF8_MAXBYTES+1];
 989     uvchr_to_utf8(tmpbuf, c);
 990     return is_utf8_digit(tmpbuf);
 991 }
 992
 993 bool
 994 Perl_is_uni_upper(pTHX_ UV c)
 995 {
 996     U8 tmpbuf[UTF8_MAXBYTES+1];
 997     uvchr_to_utf8(tmpbuf, c);
 998     return is_utf8_upper(tmpbuf);
 999 }
1000
1001 bool
1002 Perl_is_uni_lower(pTHX_ UV c)
1003 {
1004     U8 tmpbuf[UTF8_MAXBYTES+1];
1005     uvchr_to_utf8(tmpbuf, c);
1006     return is_utf8_lower(tmpbuf);
1007 }
1008
1009 bool
1010 Perl_is_uni_cntrl(pTHX_ UV c)
1011 {
1012     U8 tmpbuf[UTF8_MAXBYTES+1];
1013     uvchr_to_utf8(tmpbuf, c);
1014     return is_utf8_cntrl(tmpbuf);
1015 }
1016
1017 bool
1018 Perl_is_uni_graph(pTHX_ UV c)
1019 {
1020     U8 tmpbuf[UTF8_MAXBYTES+1];
1021     uvchr_to_utf8(tmpbuf, c);
1022     return is_utf8_graph(tmpbuf);
1023 }
1024
1025 bool
1026 Perl_is_uni_print(pTHX_ UV c)
1027 {
1028     U8 tmpbuf[UTF8_MAXBYTES+1];
1029     uvchr_to_utf8(tmpbuf, c);
1030     return is_utf8_print(tmpbuf);
1031 }
1032
1033 bool
1034 Perl_is_uni_punct(pTHX_ UV c)
1035 {
1036     U8 tmpbuf[UTF8_MAXBYTES+1];
1037     uvchr_to_utf8(tmpbuf, c);
1038     return is_utf8_punct(tmpbuf);
1039 }
1040
1041 bool
1042 Perl_is_uni_xdigit(pTHX_ UV c)
1043 {
1044     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1045     uvchr_to_utf8(tmpbuf, c);
1046     return is_utf8_xdigit(tmpbuf);
1047 }
1048
1049 UV
1050 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1051 {
1052     uvchr_to_utf8(p, c);
1053     return to_utf8_upper(p, p, lenp);
1054 }
1055
1056 UV
1057 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1058 {
1059     uvchr_to_utf8(p, c);
1060     return to_utf8_title(p, p, lenp);
1061 }
1062
1063 UV
1064 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1065 {
1066     uvchr_to_utf8(p, c);
1067     return to_utf8_lower(p, p, lenp);
1068 }
1069
1070 UV
1071 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1072 {
1073     uvchr_to_utf8(p, c);
1074     return to_utf8_fold(p, p, lenp);
1075 }
1076
1077 /* for now these all assume no locale info available for Unicode > 255 */
1078
1079 bool
1080 Perl_is_uni_alnum_lc(pTHX_ UV c)
1081 {
1082     return is_uni_alnum(c);     /* XXX no locale support yet */
1083 }
1084
1085 bool
1086 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1087 {
1088     return is_uni_alnumc(c);    /* XXX no locale support yet */
1089 }
1090
1091 bool
1092 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1093 {
1094     return is_uni_idfirst(c);   /* XXX no locale support yet */
1095 }
1096
1097 bool
1098 Perl_is_uni_alpha_lc(pTHX_ UV c)
1099 {
1100     return is_uni_alpha(c);     /* XXX no locale support yet */
1101 }
1102
1103 bool
1104 Perl_is_uni_ascii_lc(pTHX_ UV c)
1105 {
1106     return is_uni_ascii(c);     /* XXX no locale support yet */
1107 }
1108
1109 bool
1110 Perl_is_uni_space_lc(pTHX_ UV c)
1111 {
1112     return is_uni_space(c);     /* XXX no locale support yet */
1113 }
1114
1115 bool
1116 Perl_is_uni_digit_lc(pTHX_ UV c)
1117 {
1118     return is_uni_digit(c);     /* XXX no locale support yet */
1119 }
1120
1121 bool
1122 Perl_is_uni_upper_lc(pTHX_ UV c)
1123 {
1124     return is_uni_upper(c);     /* XXX no locale support yet */
1125 }
1126
1127 bool
1128 Perl_is_uni_lower_lc(pTHX_ UV c)
1129 {
1130     return is_uni_lower(c);     /* XXX no locale support yet */
1131 }
1132
1133 bool
1134 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1135 {
1136     return is_uni_cntrl(c);     /* XXX no locale support yet */
1137 }
1138
1139 bool
1140 Perl_is_uni_graph_lc(pTHX_ UV c)
1141 {
1142     return is_uni_graph(c);     /* XXX no locale support yet */
1143 }
1144
1145 bool
1146 Perl_is_uni_print_lc(pTHX_ UV c)
1147 {
1148     return is_uni_print(c);     /* XXX no locale support yet */
1149 }
1150
1151 bool
1152 Perl_is_uni_punct_lc(pTHX_ UV c)
1153 {
1154     return is_uni_punct(c);     /* XXX no locale support yet */
1155 }
1156
1157 bool
1158 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1159 {
1160     return is_uni_xdigit(c);    /* XXX no locale support yet */
1161 }
1162
1163 U32
1164 Perl_to_uni_upper_lc(pTHX_ U32 c)
1165 {
1166     /* XXX returns only the first character -- do not use XXX */
1167     /* XXX no locale support yet */
1168     STRLEN len;
1169     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1170     return (U32)to_uni_upper(c, tmpbuf, &len);
1171 }
1172
1173 U32
1174 Perl_to_uni_title_lc(pTHX_ U32 c)
1175 {
1176     /* XXX returns only the first character XXX -- do not use XXX */
1177     /* XXX no locale support yet */
1178     STRLEN len;
1179     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1180     return (U32)to_uni_title(c, tmpbuf, &len);
1181 }
1182
1183 U32
1184 Perl_to_uni_lower_lc(pTHX_ U32 c)
1185 {
1186     /* XXX returns only the first character -- do not use XXX */
1187     /* XXX no locale support yet */
1188     STRLEN len;
1189     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1190     return (U32)to_uni_lower(c, tmpbuf, &len);
1191 }
1192
1193 bool
1194 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1195 {
1196     if (!is_utf8_char(p))
1197         return FALSE;
1198     if (!PL_utf8_alnum)
1199         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1200          * descendant of isalnum(3), in other words, it doesn't
1201          * contain the '_'. --jhi */
1202         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1203     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1204 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1205 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1206     if (!PL_utf8_alnum)
1207         PL_utf8_alnum = swash_init("utf8", "",
1208             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1209     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1210 #endif
1211 }
1212
1213 bool
1214 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1215 {
1216     if (!is_utf8_char(p))
1217         return FALSE;
1218     if (!PL_utf8_alnum)
1219         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1220     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1221 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1222 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1223     if (!PL_utf8_alnum)
1224         PL_utf8_alnum = swash_init("utf8", "",
1225             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1226     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1227 #endif
1228 }
1229
1230 bool
1231 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1232 {
1233     if (*p == '_')
1234         return TRUE;
1235     if (!is_utf8_char(p))
1236         return FALSE;
1237     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1238         PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1239     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1240 }
1241
1242 bool
1243 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1244 {
1245     if (*p == '_')
1246         return TRUE;
1247     if (!is_utf8_char(p))
1248         return FALSE;
1249     if (!PL_utf8_idcont)
1250         PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1251     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1252 }
1253
1254 bool
1255 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1256 {
1257     if (!is_utf8_char(p))
1258         return FALSE;
1259     if (!PL_utf8_alpha)
1260         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1261     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1262 }
1263
1264 bool
1265 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1266 {
1267     if (!is_utf8_char(p))
1268         return FALSE;
1269     if (!PL_utf8_ascii)
1270         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1271     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1272 }
1273
1274 bool
1275 Perl_is_utf8_space(pTHX_ const U8 *p)
1276 {
1277     if (!is_utf8_char(p))
1278         return FALSE;
1279     if (!PL_utf8_space)
1280         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1281     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1282 }
1283
1284 bool
1285 Perl_is_utf8_digit(pTHX_ const U8 *p)
1286 {
1287     if (!is_utf8_char(p))
1288         return FALSE;
1289     if (!PL_utf8_digit)
1290         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1291     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1292 }
1293
1294 bool
1295 Perl_is_utf8_upper(pTHX_ const U8 *p)
1296 {
1297     if (!is_utf8_char(p))
1298         return FALSE;
1299     if (!PL_utf8_upper)
1300         PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1301     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1302 }
1303
1304 bool
1305 Perl_is_utf8_lower(pTHX_ const U8 *p)
1306 {
1307     if (!is_utf8_char(p))
1308         return FALSE;
1309     if (!PL_utf8_lower)
1310         PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1311     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1312 }
1313
1314 bool
1315 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1316 {
1317     if (!is_utf8_char(p))
1318         return FALSE;
1319     if (!PL_utf8_cntrl)
1320         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1321     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1322 }
1323
1324 bool
1325 Perl_is_utf8_graph(pTHX_ const U8 *p)
1326 {
1327     if (!is_utf8_char(p))
1328         return FALSE;
1329     if (!PL_utf8_graph)
1330         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1331     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1332 }
1333
1334 bool
1335 Perl_is_utf8_print(pTHX_ const U8 *p)
1336 {
1337     if (!is_utf8_char(p))
1338         return FALSE;
1339     if (!PL_utf8_print)
1340         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1341     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1342 }
1343
1344 bool
1345 Perl_is_utf8_punct(pTHX_ const U8 *p)
1346 {
1347     if (!is_utf8_char(p))
1348         return FALSE;
1349     if (!PL_utf8_punct)
1350         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1351     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1352 }
1353
1354 bool
1355 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1356 {
1357     if (!is_utf8_char(p))
1358         return FALSE;
1359     if (!PL_utf8_xdigit)
1360         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1361     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1362 }
1363
1364 bool
1365 Perl_is_utf8_mark(pTHX_ const U8 *p)
1366 {
1367     if (!is_utf8_char(p))
1368         return FALSE;
1369     if (!PL_utf8_mark)
1370         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1371     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1372 }
1373
1374 /*
=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swashp|char *normal|char *special
1376
1377 The "p" contains the pointer to the UTF-8 string encoding
1378 the character that is being converted.
1379
1380 The "ustrp" is a pointer to the character buffer to put the
1381 conversion result to.  The "lenp" is a pointer to the length
1382 of the result.
1383
1384 The "swashp" is a pointer to the swash to use.
1385
Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
1387 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1388 but not always, a multicharacter mapping), is tried first.
1389
1390 The "special" is a string like "utf8::ToSpecLower", which means the
1391 hash %utf8::ToSpecLower.  The access to the hash is through
1392 Perl_to_utf8_case().
1393
1394 The "normal" is a string like "ToLower" which means the swash
1395 %utf8::ToLower.
1396
1397 =cut */
1398
1399 UV
1400 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special)
1401 {
1402     UV uv1;
1403     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1404     STRLEN len = 0;
1405
1406     const UV uv0 = utf8_to_uvchr(p, 0);
1407     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1408      * are necessary in EBCDIC, they are redundant no-ops
1409      * in ASCII-ish platforms, and hopefully optimized away. */
1410     uv1 = NATIVE_TO_UNI(uv0);
1411     uvuni_to_utf8(tmpbuf, uv1);
1412
1413     if (!*swashp) /* load on-demand */
1414          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1415
1416     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1417     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1418          /* It might be "special" (sometimes, but not always,
1419           * a multicharacter mapping) */
1420          HV *hv;
1421          SV **svp;
1422
1423          if ((hv  = get_hv(special, FALSE)) &&
1424              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1425              (*svp)) {
1426               char *s;
1427
1428               s = SvPV(*svp, len);
1429               if (len == 1)
1430                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1431               else {
1432 #ifdef EBCDIC
1433                    /* If we have EBCDIC we need to remap the characters
1434                     * since any characters in the low 256 are Unicode
1435                     * code points, not EBCDIC. */
1436                    U8 *t = (U8*)s, *tend = t + len, *d;
1437
1438                    d = tmpbuf;
1439                    if (SvUTF8(*svp)) {
1440                         STRLEN tlen = 0;
1441
1442                         while (t < tend) {
1443                              UV c = utf8_to_uvchr(t, &tlen);
1444                              if (tlen > 0) {
1445                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1446                                   t += tlen;
1447                              }
1448                              else
1449                                   break;
1450                         }
1451                    }
1452                    else {
1453                         while (t < tend) {
1454                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1455                              t++;
1456                         }
1457                    }
1458                    len = d - tmpbuf;
1459                    Copy(tmpbuf, ustrp, len, U8);
1460 #else
1461                    Copy(s, ustrp, len, U8);
1462 #endif
1463               }
1464          }
1465     }
1466
1467     if (!len && *swashp) {
1468          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1469
1470          if (uv2) {
1471               /* It was "normal" (a single character mapping). */
1472               UV uv3 = UNI_TO_NATIVE(uv2);
1473
1474               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1475          }
1476     }
1477
1478     if (!len) /* Neither: just copy. */
1479          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1480
1481     if (lenp)
1482          *lenp = len;
1483
1484     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1485 }
1486
1487 /*
1488 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1489
1490 Convert the UTF-8 encoded character at p to its uppercase version and
1491 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1492 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1493 the uppercase version may be longer than the original character.
1494
1495 The first character of the uppercased version is returned
1496 (but note, as explained above, that there may be more.)
1497
1498 =cut */
1499
1500 UV
1501 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1502 {
1503     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1504                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1505 }
1506
1507 /*
1508 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1509
1510 Convert the UTF-8 encoded character at p to its titlecase version and
1511 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1512 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1513 titlecase version may be longer than the original character.
1514
1515 The first character of the titlecased version is returned
1516 (but note, as explained above, that there may be more.)
1517
1518 =cut */
1519
1520 UV
1521 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1522 {
1523     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1524                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1525 }
1526
1527 /*
1528 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1529
1530 Convert the UTF-8 encoded character at p to its lowercase version and
1531 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1532 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1533 lowercase version may be longer than the original character.
1534
1535 The first character of the lowercased version is returned
1536 (but note, as explained above, that there may be more.)
1537
1538 =cut */
1539
1540 UV
1541 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1542 {
1543     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1544                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1545 }
1546
1547 /*
1548 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1549
1550 Convert the UTF-8 encoded character at p to its foldcase version and
1551 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1552 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1553 foldcase version may be longer than the original character (up to
1554 three characters).
1555
1556 The first character of the foldcased version is returned
1557 (but note, as explained above, that there may be more.)
1558
1559 =cut */
1560
1561 UV
1562 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1563 {
1564     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1565                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1566 }
1567
1568 /* a "swash" is a swatch hash */
1569
1570 SV*
1571 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1572 {
1573     SV* retval;
1574     SV* tokenbufsv = sv_newmortal();
1575     dSP;
1576     const size_t pkg_len = strlen(pkg);
1577     const size_t name_len = strlen(name);
1578     HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
1579     SV* errsv_save;
1580
1581     PUSHSTACKi(PERLSI_MAGIC);
1582     ENTER;
1583     SAVEI32(PL_hints);
1584     PL_hints = 0;
1585     save_re_context();
1586     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1587         ENTER;
1588         errsv_save = newSVsv(ERRSV);
1589         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1590                          Nullsv);
1591         if (!SvTRUE(ERRSV))
1592             sv_setsv(ERRSV, errsv_save);
1593         SvREFCNT_dec(errsv_save);
1594         LEAVE;
1595     }
1596     SPAGAIN;
1597     PUSHMARK(SP);
1598     EXTEND(SP,5);
1599     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1600     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1601     PUSHs(listsv);
1602     PUSHs(sv_2mortal(newSViv(minbits)));
1603     PUSHs(sv_2mortal(newSViv(none)));
1604     PUTBACK;
1605     if (IN_PERL_COMPILETIME) {
1606         /* XXX ought to be handled by lex_start */
1607         SAVEI32(PL_in_my);
1608         PL_in_my = 0;
1609         sv_setpv(tokenbufsv, PL_tokenbuf);
1610     }
1611     errsv_save = newSVsv(ERRSV);
1612     if (call_method("SWASHNEW", G_SCALAR))
1613         retval = newSVsv(*PL_stack_sp--);
1614     else
1615         retval = &PL_sv_undef;
1616     if (!SvTRUE(ERRSV))
1617         sv_setsv(ERRSV, errsv_save);
1618     SvREFCNT_dec(errsv_save);
1619     LEAVE;
1620     POPSTACK;
1621     if (IN_PERL_COMPILETIME) {
1622         STRLEN len;
1623         const char* pv = SvPV(tokenbufsv, len);
1624
1625         Copy(pv, PL_tokenbuf, len+1, char);
1626         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1627     }
1628     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1629         if (SvPOK(retval))
1630             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1631                        retval);
1632         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1633     }
1634     return retval;
1635 }
1636
1637
1638 /* This API is wrong for special case conversions since we may need to
1639  * return several Unicode characters for a single Unicode character
1640  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1641  * the lower-level routine, and it is similarly broken for returning
1642  * multiple values.  --jhi */
1643 UV
1644 Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
1645 {
1646     HV* hv = (HV*)SvRV(sv);
1647     U32 klen;
1648     U32 off;
1649     STRLEN slen;
1650     STRLEN needents;
1651     U8 *tmps = NULL;
1652     U32 bit;
1653     SV *retval;
1654     U8 tmputf8[2];
1655     UV c = NATIVE_TO_ASCII(*ptr);
1656
1657     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1658         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1659         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1660         ptr = tmputf8;
1661     }
1662     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1663      * then the "swatch" is a vec() for al the chars which start
1664      * with 0xAA..0xYY
1665      * So the key in the hash (klen) is length of encoded char -1
1666      */
1667     klen = UTF8SKIP(ptr) - 1;
1668     off  = ptr[klen];
1669
1670     if (klen == 0)
1671      {
1672       /* If char in invariant then swatch is for all the invariant chars
1673        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1674        */
1675       needents = UTF_CONTINUATION_MARK;
1676       off      = NATIVE_TO_UTF(ptr[klen]);
1677      }
1678     else
1679      {
1680       /* If char is encoded then swatch is for the prefix */
1681       needents = (1 << UTF_ACCUMULATION_SHIFT);
1682       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1683      }
1684
1685     /*
1686      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1687      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1688      * it's nothing to sniff at.)  Pity we usually come through at least
1689      * two function calls to get here...
1690      *
1691      * NB: this code assumes that swatches are never modified, once generated!
1692      */
1693
1694     if (hv   == PL_last_swash_hv &&
1695         klen == PL_last_swash_klen &&
1696         (!klen || memEQ(ptr, PL_last_swash_key, klen)) )
1697     {
1698         tmps = PL_last_swash_tmps;
1699         slen = PL_last_swash_slen;
1700     }
1701     else {
1702         /* Try our second-level swatch cache, kept in a hash. */
1703         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1704
1705         /* If not cached, generate it via utf8::SWASHGET */
1706         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1707             dSP;
1708             /* We use utf8n_to_uvuni() as we want an index into
1709                Unicode tables, not a native character number.
1710              */
1711             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1712                                            ckWARN(WARN_UTF8) ?
1713                                            0 : UTF8_ALLOW_ANY);
1714             SV *errsv_save;
1715             ENTER;
1716             SAVETMPS;
1717             save_re_context();
1718             PUSHSTACKi(PERLSI_MAGIC);
1719             PUSHMARK(SP);
1720             EXTEND(SP,3);
1721             PUSHs((SV*)sv);
1722             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1723             PUSHs(sv_2mortal(newSViv((klen) ?
1724                                      (code_point & ~(needents - 1)) : 0)));
1725             PUSHs(sv_2mortal(newSViv(needents)));
1726             PUTBACK;
1727             errsv_save = newSVsv(ERRSV);
1728             if (call_method("SWASHGET", G_SCALAR))
1729                 retval = newSVsv(*PL_stack_sp--);
1730             else
1731                 retval = &PL_sv_undef;
1732             if (!SvTRUE(ERRSV))
1733                 sv_setsv(ERRSV, errsv_save);
1734             SvREFCNT_dec(errsv_save);
1735             POPSTACK;
1736             FREETMPS;
1737             LEAVE;
1738             if (IN_PERL_COMPILETIME)
1739                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1740
1741             svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
1742
1743             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1744                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1745         }
1746
1747         PL_last_swash_hv = hv;
1748         PL_last_swash_klen = klen;
1749         PL_last_swash_tmps = tmps;
1750         PL_last_swash_slen = slen;
1751         if (klen)
1752             Copy(ptr, PL_last_swash_key, klen, U8);
1753     }
1754
1755     switch ((int)((slen << 3) / needents)) {
1756     case 1:
1757         bit = 1 << (off & 7);
1758         off >>= 3;
1759         return (tmps[off] & bit) != 0;
1760     case 8:
1761         return tmps[off];
1762     case 16:
1763         off <<= 1;
1764         return (tmps[off] << 8) + tmps[off + 1] ;
1765     case 32:
1766         off <<= 2;
1767         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1768     }
1769     Perl_croak(aTHX_ "panic: swash_fetch");
1770     return 0;
1771 }
1772
1773
1774 /*
1775 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1776
1777 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
1779 bytes available. The return value is the pointer to the byte after the
1780 end of the new character. In other words,
1781
1782     d = uvchr_to_utf8(d, uv);
1783
1784 is the recommended wide native character-aware way of saying
1785
1786     *(d++) = uv;
1787
1788 =cut
1789 */
1790
1791 /* On ASCII machines this is normally a macro but we want a
1792    real function in case XS code wants it
1793 */
1794 #undef Perl_uvchr_to_utf8
1795 U8 *
1796 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1797 {
1798     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1799 }
1800
1801 U8 *
1802 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1803 {
1804     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1805 }
1806
1807 /*
1808 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1809
1810 Returns the native character value of the first character in the string C<s>
1811 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1812 length, in bytes, of that character.
1813
1814 Allows length and flags to be passed to low level routine.
1815
1816 =cut
1817 */
1818 /* On ASCII machines this is normally a macro but we want
1819    a real function in case XS code wants it
1820 */
1821 #undef Perl_utf8n_to_uvchr
1822 UV
1823 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1824 {
1825     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1826     return UNI_TO_NATIVE(uv);
1827 }
1828
1829 /*
1830 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1831
1832 Build to the scalar dsv a displayable version of the string spv,
1833 length len, the displayable version being at most pvlim bytes long
1834 (if longer, the rest is truncated and "..." will be appended).
1835
1836 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1837 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1838 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1839 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1840 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1841 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1842
1843 The pointer to the PV of the dsv is returned.
1844
1845 =cut */
1846 char *
1847 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1848 {
1849     int truncated = 0;
1850     const char *s, *e;
1851
1852     sv_setpvn(dsv, "", 0);
1853     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1854          UV u;
1855           /* This serves double duty as a flag and a character to print after
1856              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1857           */
1858          char ok = 0;
1859
1860          if (pvlim && SvCUR(dsv) >= pvlim) {
1861               truncated++;
1862               break;
1863          }
1864          u = utf8_to_uvchr((U8*)s, 0);
1865          if (u < 256) {
1866              unsigned char c = (unsigned char)u & 0xFF;
1867              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1868                  switch (c) {
1869                  case '\n':
1870                      ok = 'n'; break;
1871                  case '\r':
1872                      ok = 'r'; break;
1873                  case '\t':
1874                      ok = 't'; break;
1875                  case '\f':
1876                      ok = 'f'; break;
1877                  case '\a':
1878                      ok = 'a'; break;
1879                  case '\\':
1880                      ok = '\\'; break;
1881                  default: break;
1882                  }
1883                  if (ok) {
1884                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1885                  }
1886              }
1887              /* isPRINT() is the locale-blind version. */
1888              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1889                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1890                  ok = 1;
1891              }
1892          }
1893          if (!ok)
1894              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1895     }
1896     if (truncated)
1897          sv_catpvn(dsv, "...", 3);
1898
1899     return SvPVX(dsv);
1900 }
1901
1902 /*
1903 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1904
Build to the scalar dsv a displayable version of the scalar ssv,
1906 the displayable version being at most pvlim bytes long
1907 (if longer, the rest is truncated and "..." will be appended).
1908
1909 The flags argument is as in pv_uni_display().
1910
1911 The pointer to the PV of the dsv is returned.
1912
1913 =cut */
1914 char *
1915 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1916 {
1917      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1918                                 pvlim, flags);
1919 }
1920
1921 /*
1922 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1923
1924 Return true if the strings s1 and s2 differ case-insensitively, false
1925 if not (if they are equal case-insensitively).  If u1 is true, the
1926 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1927 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1928 are false, the respective string is assumed to be in native 8-bit
1929 encoding.
1930
1931 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1932 in there (they will point at the beginning of the I<next> character).
1933 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1934 pointers beyond which scanning will not continue under any
1935 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1936 s2+l2 will be used as goal end pointers that will also stop the scan,
1937 and which qualify towards defining a successful match: all the scans
1938 that define an explicit length must reach their goal pointers for
1939 a match to succeed).
1940
1941 For case-insensitiveness, the "casefolding" of Unicode is used
1942 instead of upper/lowercasing both the characters, see
1943 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1944
1945 =cut */
1946 I32
1947 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1948 {
1949      register const U8 *p1  = (const U8*)s1;
1950      register const U8 *p2  = (const U8*)s2;
1951      register const U8 *f1 = 0, *f2 = 0;
1952      register U8 *e1 = 0, *q1 = 0;
1953      register U8 *e2 = 0, *q2 = 0;
1954      STRLEN n1 = 0, n2 = 0;
1955      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1956      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
1957      U8 natbuf[1+1];
1958      STRLEN foldlen1, foldlen2;
1959      bool match;
1960
1961      if (pe1)
1962           e1 = *(U8**)pe1;
1963      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1964           f1 = (const U8*)s1 + l1;
1965      if (pe2)
1966           e2 = *(U8**)pe2;
1967      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1968           f2 = (const U8*)s2 + l2;
1969
1970      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1971           return 1; /* mismatch; possible infinite loop or false positive */
1972
1973      if (!u1 || !u2)
1974           natbuf[1] = 0; /* Need to terminate the buffer. */
1975
1976      while ((e1 == 0 || p1 < e1) &&
1977             (f1 == 0 || p1 < f1) &&
1978             (e2 == 0 || p2 < e2) &&
1979             (f2 == 0 || p2 < f2)) {
1980           if (n1 == 0) {
1981                if (u1)
1982                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1983                else {
1984                     natbuf[0] = *p1;
1985                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1986                }
1987                q1 = foldbuf1;
1988                n1 = foldlen1;
1989           }
1990           if (n2 == 0) {
1991                if (u2)
1992                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1993                else {
1994                     natbuf[0] = *p2;
1995                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1996                }
1997                q2 = foldbuf2;
1998                n2 = foldlen2;
1999           }
2000           while (n1 && n2) {
2001                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2002                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2003                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2004                    return 1; /* mismatch */
2005                n1 -= UTF8SKIP(q1);
2006                q1 += UTF8SKIP(q1);
2007                n2 -= UTF8SKIP(q2);
2008                q2 += UTF8SKIP(q2);
2009           }
2010           if (n1 == 0)
2011                p1 += u1 ? UTF8SKIP(p1) : 1;
2012           if (n2 == 0)
2013                p2 += u2 ? UTF8SKIP(p2) : 1;
2014
2015      }
2016
2017      /* A match is defined by all the scans that specified
2018       * an explicit length reaching their final goals. */
2019      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2020
2021      if (match) {
2022           if (pe1)
2023                *pe1 = (char*)p1;
2024           if (pe2)
2025                *pe2 = (char*)p2;
2026      }
2027
2028      return match ? 0 : 1; /* 0 match, 1 mismatch */
2029 }
2030
2031 /*
2032  * Local variables:
2033  * c-indentation-style: bsd
2034  * c-basic-offset: 4
2035  * indent-tabs-mode: t
2036  * End:
2037  *
2038  * vim: shiftwidth=4:
2039 */