utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
   4  *    others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 U8 *
 171 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 172 {
 173     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 174 }
 175
 176
 177 /*
 178 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 179
 180 Tests if some arbitrary number of bytes begins in a valid UTF-8
 181 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 182 UTF-8 character.  The actual number of bytes in the UTF-8 character
 183 will be returned if it is valid, otherwise 0.
 184
 185 =cut */
 186 STRLEN
 187 Perl_is_utf8_char(pTHX_ const U8 *s)
 188 {
 189     U8 u = *s;
 190     STRLEN slen, len;
 191     UV uv, ouv;
 192
 193     if (UTF8_IS_INVARIANT(u))
 194         return 1;
 195
 196     if (!UTF8_IS_START(u))
 197         return 0;
 198
 199     len = UTF8SKIP(s);
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206     u &= UTF_START_MASK(len);
 207     uv  = u;
 208     ouv = uv;
 209     while (slen--) {
 210         if (!UTF8_IS_CONTINUATION(*s))
 211             return 0;
 212         uv = UTF8_ACCUMULATE(uv, *s);
 213         if (uv < ouv)
 214             return 0;
 215         ouv = uv;
 216         s++;
 217     }
 218
 219     if ((STRLEN)UNISKIP(uv) < len)
 220         return 0;
 221
 222     return len;
 223 }
 224
 225 /*
 226 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 227
 228 Returns true if first C<len> bytes of the given string form a valid
 229 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 230 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 231 because a valid ASCII string is a valid UTF-8 string.
 232
 233 =cut
 234 */
 235
 236 bool
 237 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 238 {
 239     const U8* x = s;
 240     const U8* send;
 241     STRLEN c;
 242
 243     if (!len && s)
 244         len = strlen((const char *)s);
 245     send = s + len;
 246
 247     while (x < send) {
 248          /* Inline the easy bits of is_utf8_char() here for speed... */
 249          if (UTF8_IS_INVARIANT(*x))
 250               c = 1;
 251          else if (!UTF8_IS_START(*x))
 252               return FALSE;
 253          else {
 254               /* ... and call is_utf8_char() only if really needed. */
 255               c = is_utf8_char(x);
 256               if (!c)
 257                    return FALSE;
 258          }
 259         x += c;
 260     }
 261     if (x != send)
 262         return FALSE;
 263
 264     return TRUE;
 265 }
 266
 267 /*
 268 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **p
 269
 270 Like is_ut8_string but store the location of the failure in
 271 the last argument.
 272
 273 =cut
 274 */
 275
 276 bool
 277 Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p)
 278 {
 279     const U8* x = s;
 280     const U8* send;
 281     STRLEN c;
 282
 283     if (!len && s)
 284         len = strlen((const char *)s);
 285     send = s + len;
 286
 287     while (x < send) {
 288          /* Inline the easy bits of is_utf8_char() here for speed... */
 289          if (UTF8_IS_INVARIANT(*x))
 290               c = 1;
 291          else if (!UTF8_IS_START(*x)) {
 292               if (p)
 293                   *p = x;
 294               return FALSE;
 295          }
 296          else {
 297               /* ... and call is_utf8_char() only if really needed. */
 298               c = is_utf8_char(x);
 299               if (!c) {
 300                    if (p)
 301                       *p = x;
 302                    return FALSE;
 303               }
 304          }
 305         x += c;
 306     }
 307     if (x != send) {
 308        if (p)
 309            *p = x;
 310         return FALSE;
 311     }
 312
 313     return TRUE;
 314 }
 315
 316 /*
 317 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 318
 319 Bottom level UTF-8 decode routine.
 320 Returns the unicode code point value of the first character in the string C<s>
 321 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 322 C<retlen> will be set to the length, in bytes, of that character.
 323
 324 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 325 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 326 it is assumed that the caller will raise a warning, and this function
 327 will silently just set C<retlen> to C<-1> and return zero.  If the
 328 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 329 malformations will be given, C<retlen> will be set to the expected
 330 length of the UTF-8 character in bytes, and zero will be returned.
 331
 332 The C<flags> can also contain various flags to allow deviations from
 333 the strict UTF-8 encoding (see F<utf8.h>).
 334
 335 Most code should use utf8_to_uvchr() rather than call this directly.
 336
 337 =cut
 338 */
 339
 340 UV
 341 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 342 {
 343     const U8 *s0 = s;
 344     UV uv = *s, ouv = 0;
 345     STRLEN len = 1;
 346     const bool dowarn = ckWARN_d(WARN_UTF8);
 347     const UV startbyte = *s;
 348     STRLEN expectlen = 0;
 349     U32 warning = 0;
 350
 351 /* This list is a superset of the UTF8_ALLOW_XXX. */
 352
 353 #define UTF8_WARN_EMPTY                          1
 354 #define UTF8_WARN_CONTINUATION                   2
 355 #define UTF8_WARN_NON_CONTINUATION               3
 356 #define UTF8_WARN_FE_FF                          4
 357 #define UTF8_WARN_SHORT                          5
 358 #define UTF8_WARN_OVERFLOW                       6
 359 #define UTF8_WARN_SURROGATE                      7
 360 #define UTF8_WARN_LONG                           8
 361 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 362
 363     if (curlen == 0 &&
 364         !(flags & UTF8_ALLOW_EMPTY)) {
 365         warning = UTF8_WARN_EMPTY;
 366         goto malformed;
 367     }
 368
 369     if (UTF8_IS_INVARIANT(uv)) {
 370         if (retlen)
 371             *retlen = 1;
 372         return (UV) (NATIVE_TO_UTF(*s));
 373     }
 374
 375     if (UTF8_IS_CONTINUATION(uv) &&
 376         !(flags & UTF8_ALLOW_CONTINUATION)) {
 377         warning = UTF8_WARN_CONTINUATION;
 378         goto malformed;
 379     }
 380
 381     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 382         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 383         warning = UTF8_WARN_NON_CONTINUATION;
 384         goto malformed;
 385     }
 386
 387 #ifdef EBCDIC
 388     uv = NATIVE_TO_UTF(uv);
 389 #else
 390     if ((uv == 0xfe || uv == 0xff) &&
 391         !(flags & UTF8_ALLOW_FE_FF)) {
 392         warning = UTF8_WARN_FE_FF;
 393         goto malformed;
 394     }
 395 #endif
 396
 397     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 398     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 399     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 400     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 401 #ifdef EBCDIC
 402     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 403     else                        { len =  7; uv &= 0x01; }
 404 #else
 405     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 406     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 407     else                        { len = 13; uv = 0; } /* whoa! */
 408 #endif
 409
 410     if (retlen)
 411         *retlen = len;
 412
 413     expectlen = len;
 414
 415     if ((curlen < expectlen) &&
 416         !(flags & UTF8_ALLOW_SHORT)) {
 417         warning = UTF8_WARN_SHORT;
 418         goto malformed;
 419     }
 420
 421     len--;
 422     s++;
 423     ouv = uv;
 424
 425     while (len--) {
 426         if (!UTF8_IS_CONTINUATION(*s) &&
 427             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 428             s--;
 429             warning = UTF8_WARN_NON_CONTINUATION;
 430             goto malformed;
 431         }
 432         else
 433             uv = UTF8_ACCUMULATE(uv, *s);
 434         if (!(uv > ouv)) {
 435             /* These cannot be allowed. */
 436             if (uv == ouv) {
 437                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 438                     warning = UTF8_WARN_LONG;
 439                     goto malformed;
 440                 }
 441             }
 442             else { /* uv < ouv */
 443                 /* This cannot be allowed. */
 444                 warning = UTF8_WARN_OVERFLOW;
 445                 goto malformed;
 446             }
 447         }
 448         s++;
 449         ouv = uv;
 450     }
 451
 452     if (UNICODE_IS_SURROGATE(uv) &&
 453         !(flags & UTF8_ALLOW_SURROGATE)) {
 454         warning = UTF8_WARN_SURROGATE;
 455         goto malformed;
 456     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 457                !(flags & UTF8_ALLOW_LONG)) {
 458         warning = UTF8_WARN_LONG;
 459         goto malformed;
 460     } else if (UNICODE_IS_ILLEGAL(uv) &&
 461                !(flags & UTF8_ALLOW_FFFF)) {
 462         warning = UTF8_WARN_FFFF;
 463         goto malformed;
 464     }
 465
 466     return uv;
 467
 468 malformed:
 469
 470     if (flags & UTF8_CHECK_ONLY) {
 471         if (retlen)
 472             *retlen = -1;
 473         return 0;
 474     }
 475
 476     if (dowarn) {
 477         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 478
 479         switch (warning) {
 480         case 0: /* Intentionally empty. */ break;
 481         case UTF8_WARN_EMPTY:
 482             Perl_sv_catpv(aTHX_ sv, "(empty string)");
 483             break;
 484         case UTF8_WARN_CONTINUATION:
 485             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 486             break;
 487         case UTF8_WARN_NON_CONTINUATION:
 488             if (s == s0)
 489                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 490                            (UV)s[1], startbyte);
 491             else
 492                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 493                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 494
 495             break;
 496         case UTF8_WARN_FE_FF:
 497             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 498             break;
 499         case UTF8_WARN_SHORT:
 500             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 501                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 502             expectlen = curlen;         /* distance for caller to skip */
 503             break;
 504         case UTF8_WARN_OVERFLOW:
 505             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 506                            ouv, *s, startbyte);
 507             break;
 508         case UTF8_WARN_SURROGATE:
 509             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 510             break;
 511         case UTF8_WARN_LONG:
 512             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 513                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 514             break;
 515         case UTF8_WARN_FFFF:
 516             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 517             break;
 518         default:
 519             Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
 520             break;
 521         }
 522
 523         if (warning) {
 524             char *s = SvPVX(sv);
 525
 526             if (PL_op)
 527                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 528                             "%s in %s", s,  OP_DESC(PL_op));
 529             else
 530                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 531         }
 532     }
 533
 534     if (retlen)
 535         *retlen = expectlen ? expectlen : len;
 536
 537     return 0;
 538 }
 539
 540 /*
 541 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 542
 543 Returns the native character value of the first character in the string C<s>
 544 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 545 length, in bytes, of that character.
 546
 547 If C<s> does not point to a well-formed UTF-8 character, zero is
 548 returned and retlen is set, if possible, to -1.
 549
 550 =cut
 551 */
 552
 553 UV
 554 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 555 {
 556     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
 557                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 558 }
 559
 560 /*
 561 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 562
 563 Returns the Unicode code point of the first character in the string C<s>
 564 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 565 length, in bytes, of that character.
 566
 567 This function should only be used when returned UV is considered
 568 an index into the Unicode semantic tables (e.g. swashes).
 569
 570 If C<s> does not point to a well-formed UTF-8 character, zero is
 571 returned and retlen is set, if possible, to -1.
 572
 573 =cut
 574 */
 575
 576 UV
 577 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 578 {
 579     /* Call the low level routine asking for checks */
 580     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 581                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 582 }
 583
 584 /*
 585 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 586
 587 Return the length of the UTF-8 char encoded string C<s> in characters.
 588 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 589 up past C<e>, croaks.
 590
 591 =cut
 592 */
 593
 594 STRLEN
 595 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 596 {
 597     STRLEN len = 0;
 598
 599     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 600      * the bitops (especially ~) can create illegal UTF-8.
 601      * In other words: in Perl UTF-8 is not just for Unicode. */
 602
 603     if (e < s) {
 604         if (ckWARN_d(WARN_UTF8)) {
 605             if (PL_op)
 606                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 607                             "%s in %s", unees, OP_DESC(PL_op));
 608             else
 609                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 610         }
 611         return 0;
 612     }
 613     while (s < e) {
 614         const U8 t = UTF8SKIP(s);
 615
 616         if (e - s < t) {
 617             if (ckWARN_d(WARN_UTF8)) {
 618                 if (PL_op)
 619                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 620                                 unees, OP_DESC(PL_op));
 621                 else
 622                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 623             }
 624             return len;
 625         }
 626         s += t;
 627         len++;
 628     }
 629
 630     return len;
 631 }
 632
 633 /*
 634 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 635
 636 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 637 and C<b>.
 638
 639 WARNING: use only if you *know* that the pointers point inside the
 640 same UTF-8 buffer.
 641
 642 =cut
 643 */
 644
 645 IV
 646 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 647 {
 648     IV off = 0;
 649
 650     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 651      * the bitops (especially ~) can create illegal UTF-8.
 652      * In other words: in Perl UTF-8 is not just for Unicode. */
 653
 654     if (a < b) {
 655         while (a < b) {
 656             const U8 c = UTF8SKIP(a);
 657
 658             if (b - a < c) {
 659                 if (ckWARN_d(WARN_UTF8)) {
 660                     if (PL_op)
 661                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 662                                     "%s in %s", unees, OP_DESC(PL_op));
 663                     else
 664                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 665                 }
 666                 return off;
 667             }
 668             a += c;
 669             off--;
 670         }
 671     }
 672     else {
 673         while (b < a) {
 674             const U8 c = UTF8SKIP(b);
 675
 676             if (a - b < c) {
 677                 if (ckWARN_d(WARN_UTF8)) {
 678                     if (PL_op)
 679                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 680                                     "%s in %s", unees, OP_DESC(PL_op));
 681                     else
 682                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 683                 }
 684                 return off;
 685             }
 686             b += c;
 687             off++;
 688         }
 689     }
 690
 691     return off;
 692 }
 693
 694 /*
 695 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 696
 697 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 698 forward or backward.
 699
 700 WARNING: do not use the following unless you *know* C<off> is within
 701 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 702 on the first byte of character or just after the last byte of a character.
 703
 704 =cut
 705 */
 706
 707 U8 *
 708 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 709 {
 710     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 711      * the bitops (especially ~) can create illegal UTF-8.
 712      * In other words: in Perl UTF-8 is not just for Unicode. */
 713
 714     if (off >= 0) {
 715         while (off--)
 716             s += UTF8SKIP(s);
 717     }
 718     else {
 719         while (off++) {
 720             s--;
 721             while (UTF8_IS_CONTINUATION(*s))
 722                 s--;
 723         }
 724     }
 725     return (U8 *)s;
 726 }
 727
 728 /*
 729 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 730
 731 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 732 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 733 updates len to contain the new length.
 734 Returns zero on failure, setting C<len> to -1.
 735
 736 =cut
 737 */
 738
 739 U8 *
 740 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 741 {
 742     U8 *send;
 743     U8 *d;
 744     U8 *save = s;
 745
 746     /* ensure valid UTF-8 and chars < 256 before updating string */
 747     for (send = s + *len; s < send; ) {
 748         U8 c = *s++;
 749
 750         if (!UTF8_IS_INVARIANT(c) &&
 751             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 752              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 753             *len = -1;
 754             return 0;
 755         }
 756     }
 757
 758     d = s = save;
 759     while (s < send) {
 760         STRLEN ulen;
 761         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 762         s += ulen;
 763     }
 764     *d = '\0';
 765     *len = d - save;
 766     return save;
 767 }
 768
 769 /*
 770 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 771
 772 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 773 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 774 the newly-created string, and updates C<len> to contain the new
 775 length.  Returns the original string if no conversion occurs, C<len>
 776 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 777 0 if C<s> is converted or contains all 7bit characters.
 778
 779 =cut
 780 */
 781
 782 U8 *
 783 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 784 {
 785     U8 *d;
 786     const U8 *start = s;
 787     const U8 *send;
 788     I32 count = 0;
 789
 790     if (!*is_utf8)
 791         return (U8 *)start;
 792
 793     /* ensure valid UTF-8 and chars < 256 before converting string */
 794     for (send = s + *len; s < send;) {
 795         U8 c = *s++;
 796         if (!UTF8_IS_INVARIANT(c)) {
 797             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 798                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 799                 count++;
 800             else
 801                 return (U8 *)start;
 802         }
 803     }
 804
 805     *is_utf8 = 0;
 806
 807     Newz(801, d, (*len) - count + 1, U8);
 808     s = start; start = d;
 809     while (s < send) {
 810         U8 c = *s++;
 811         if (!UTF8_IS_INVARIANT(c)) {
 812             /* Then it is two-byte encoded */
 813             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 814             c = ASCII_TO_NATIVE(c);
 815         }
 816         *d++ = c;
 817     }
 818     *d = '\0';
 819     *len = d - start;
 820     return (U8 *)start;
 821 }
 822
 823 /*
 824 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 825
 826 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 827 Returns a pointer to the newly-created string, and sets C<len> to
 828 reflect the new length.
 829
 830 If you want to convert to UTF-8 from other encodings than ASCII,
 831 see sv_recode_to_utf8().
 832
 833 =cut
 834 */
 835
 836 U8*
 837 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 838 {
 839     const U8 * const send = s + (*len);
 840     U8 *d;
 841     U8 *dst;
 842
 843     Newz(801, d, (*len) * 2 + 1, U8);
 844     dst = d;
 845
 846     while (s < send) {
 847         const UV uv = NATIVE_TO_ASCII(*s++);
 848         if (UNI_IS_INVARIANT(uv))
 849             *d++ = (U8)UTF_TO_NATIVE(uv);
 850         else {
 851             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 852             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 853         }
 854     }
 855     *d = '\0';
 856     *len = d-dst;
 857     return dst;
 858 }
 859
 860 /*
 861  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 862  *
 863  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 864  * We optimize for native, for obvious reasons. */
 865
 866 U8*
 867 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 868 {
 869     U8* pend;
 870     U8* dstart = d;
 871
 872     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 873          d[0] = 0;
 874          *newlen = 1;
 875          return d;
 876     }
 877
 878     if (bytelen & 1)
 879         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 880
 881     pend = p + bytelen;
 882
 883     while (p < pend) {
 884         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 885         p += 2;
 886         if (uv < 0x80) {
 887             *d++ = (U8)uv;
 888             continue;
 889         }
 890         if (uv < 0x800) {
 891             *d++ = (U8)(( uv >>  6)         | 0xc0);
 892             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 893             continue;
 894         }
 895         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 896             UV low = (p[0] << 8) + p[1];
 897             p += 2;
 898             if (low < 0xdc00 || low >= 0xdfff)
 899                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 900             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 901         }
 902         if (uv < 0x10000) {
 903             *d++ = (U8)(( uv >> 12)         | 0xe0);
 904             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 905             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 906             continue;
 907         }
 908         else {
 909             *d++ = (U8)(( uv >> 18)         | 0xf0);
 910             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 911             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 912             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 913             continue;
 914         }
 915     }
 916     *newlen = d - dstart;
 917     return d;
 918 }
 919
 920 /* Note: this one is slightly destructive of the source. */
 921
 922 U8*
 923 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 924 {
 925     U8* s = (U8*)p;
 926     U8* send = s + bytelen;
 927     while (s < send) {
 928         U8 tmp = s[0];
 929         s[0] = s[1];
 930         s[1] = tmp;
 931         s += 2;
 932     }
 933     return utf16_to_utf8(p, d, bytelen, newlen);
 934 }
 935
 936 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 937
 938 bool
 939 Perl_is_uni_alnum(pTHX_ UV c)
 940 {
 941     U8 tmpbuf[UTF8_MAXBYTES+1];
 942     uvchr_to_utf8(tmpbuf, c);
 943     return is_utf8_alnum(tmpbuf);
 944 }
 945
 946 bool
 947 Perl_is_uni_alnumc(pTHX_ UV c)
 948 {
 949     U8 tmpbuf[UTF8_MAXBYTES+1];
 950     uvchr_to_utf8(tmpbuf, c);
 951     return is_utf8_alnumc(tmpbuf);
 952 }
 953
 954 bool
 955 Perl_is_uni_idfirst(pTHX_ UV c)
 956 {
 957     U8 tmpbuf[UTF8_MAXBYTES+1];
 958     uvchr_to_utf8(tmpbuf, c);
 959     return is_utf8_idfirst(tmpbuf);
 960 }
 961
 962 bool
 963 Perl_is_uni_alpha(pTHX_ UV c)
 964 {
 965     U8 tmpbuf[UTF8_MAXBYTES+1];
 966     uvchr_to_utf8(tmpbuf, c);
 967     return is_utf8_alpha(tmpbuf);
 968 }
 969
 970 bool
 971 Perl_is_uni_ascii(pTHX_ UV c)
 972 {
 973     U8 tmpbuf[UTF8_MAXBYTES+1];
 974     uvchr_to_utf8(tmpbuf, c);
 975     return is_utf8_ascii(tmpbuf);
 976 }
 977
 978 bool
 979 Perl_is_uni_space(pTHX_ UV c)
 980 {
 981     U8 tmpbuf[UTF8_MAXBYTES+1];
 982     uvchr_to_utf8(tmpbuf, c);
 983     return is_utf8_space(tmpbuf);
 984 }
 985
 986 bool
 987 Perl_is_uni_digit(pTHX_ UV c)
 988 {
 989     U8 tmpbuf[UTF8_MAXBYTES+1];
 990     uvchr_to_utf8(tmpbuf, c);
 991     return is_utf8_digit(tmpbuf);
 992 }
 993
 994 bool
 995 Perl_is_uni_upper(pTHX_ UV c)
 996 {
 997     U8 tmpbuf[UTF8_MAXBYTES+1];
 998     uvchr_to_utf8(tmpbuf, c);
 999     return is_utf8_upper(tmpbuf);
1000 }
1001
1002 bool
1003 Perl_is_uni_lower(pTHX_ UV c)
1004 {
1005     U8 tmpbuf[UTF8_MAXBYTES+1];
1006     uvchr_to_utf8(tmpbuf, c);
1007     return is_utf8_lower(tmpbuf);
1008 }
1009
1010 bool
1011 Perl_is_uni_cntrl(pTHX_ UV c)
1012 {
1013     U8 tmpbuf[UTF8_MAXBYTES+1];
1014     uvchr_to_utf8(tmpbuf, c);
1015     return is_utf8_cntrl(tmpbuf);
1016 }
1017
1018 bool
1019 Perl_is_uni_graph(pTHX_ UV c)
1020 {
1021     U8 tmpbuf[UTF8_MAXBYTES+1];
1022     uvchr_to_utf8(tmpbuf, c);
1023     return is_utf8_graph(tmpbuf);
1024 }
1025
1026 bool
1027 Perl_is_uni_print(pTHX_ UV c)
1028 {
1029     U8 tmpbuf[UTF8_MAXBYTES+1];
1030     uvchr_to_utf8(tmpbuf, c);
1031     return is_utf8_print(tmpbuf);
1032 }
1033
1034 bool
1035 Perl_is_uni_punct(pTHX_ UV c)
1036 {
1037     U8 tmpbuf[UTF8_MAXBYTES+1];
1038     uvchr_to_utf8(tmpbuf, c);
1039     return is_utf8_punct(tmpbuf);
1040 }
1041
1042 bool
1043 Perl_is_uni_xdigit(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_xdigit(tmpbuf);
1048 }
1049
1050 UV
1051 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1052 {
1053     uvchr_to_utf8(p, c);
1054     return to_utf8_upper(p, p, lenp);
1055 }
1056
1057 UV
1058 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1059 {
1060     uvchr_to_utf8(p, c);
1061     return to_utf8_title(p, p, lenp);
1062 }
1063
1064 UV
1065 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1066 {
1067     uvchr_to_utf8(p, c);
1068     return to_utf8_lower(p, p, lenp);
1069 }
1070
1071 UV
1072 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1073 {
1074     uvchr_to_utf8(p, c);
1075     return to_utf8_fold(p, p, lenp);
1076 }
1077
1078 /* for now these all assume no locale info available for Unicode > 255 */
1079
1080 bool
1081 Perl_is_uni_alnum_lc(pTHX_ UV c)
1082 {
1083     return is_uni_alnum(c);     /* XXX no locale support yet */
1084 }
1085
1086 bool
1087 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1088 {
1089     return is_uni_alnumc(c);    /* XXX no locale support yet */
1090 }
1091
1092 bool
1093 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1094 {
1095     return is_uni_idfirst(c);   /* XXX no locale support yet */
1096 }
1097
1098 bool
1099 Perl_is_uni_alpha_lc(pTHX_ UV c)
1100 {
1101     return is_uni_alpha(c);     /* XXX no locale support yet */
1102 }
1103
1104 bool
1105 Perl_is_uni_ascii_lc(pTHX_ UV c)
1106 {
1107     return is_uni_ascii(c);     /* XXX no locale support yet */
1108 }
1109
1110 bool
1111 Perl_is_uni_space_lc(pTHX_ UV c)
1112 {
1113     return is_uni_space(c);     /* XXX no locale support yet */
1114 }
1115
1116 bool
1117 Perl_is_uni_digit_lc(pTHX_ UV c)
1118 {
1119     return is_uni_digit(c);     /* XXX no locale support yet */
1120 }
1121
1122 bool
1123 Perl_is_uni_upper_lc(pTHX_ UV c)
1124 {
1125     return is_uni_upper(c);     /* XXX no locale support yet */
1126 }
1127
1128 bool
1129 Perl_is_uni_lower_lc(pTHX_ UV c)
1130 {
1131     return is_uni_lower(c);     /* XXX no locale support yet */
1132 }
1133
1134 bool
1135 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1136 {
1137     return is_uni_cntrl(c);     /* XXX no locale support yet */
1138 }
1139
1140 bool
1141 Perl_is_uni_graph_lc(pTHX_ UV c)
1142 {
1143     return is_uni_graph(c);     /* XXX no locale support yet */
1144 }
1145
1146 bool
1147 Perl_is_uni_print_lc(pTHX_ UV c)
1148 {
1149     return is_uni_print(c);     /* XXX no locale support yet */
1150 }
1151
1152 bool
1153 Perl_is_uni_punct_lc(pTHX_ UV c)
1154 {
1155     return is_uni_punct(c);     /* XXX no locale support yet */
1156 }
1157
1158 bool
1159 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1160 {
1161     return is_uni_xdigit(c);    /* XXX no locale support yet */
1162 }
1163
1164 U32
1165 Perl_to_uni_upper_lc(pTHX_ U32 c)
1166 {
1167     /* XXX returns only the first character -- do not use XXX */
1168     /* XXX no locale support yet */
1169     STRLEN len;
1170     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1171     return (U32)to_uni_upper(c, tmpbuf, &len);
1172 }
1173
1174 U32
1175 Perl_to_uni_title_lc(pTHX_ U32 c)
1176 {
1177     /* XXX returns only the first character XXX -- do not use XXX */
1178     /* XXX no locale support yet */
1179     STRLEN len;
1180     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1181     return (U32)to_uni_title(c, tmpbuf, &len);
1182 }
1183
1184 U32
1185 Perl_to_uni_lower_lc(pTHX_ U32 c)
1186 {
1187     /* XXX returns only the first character -- do not use XXX */
1188     /* XXX no locale support yet */
1189     STRLEN len;
1190     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1191     return (U32)to_uni_lower(c, tmpbuf, &len);
1192 }
1193
1194 bool
1195 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1196 {
1197     if (!is_utf8_char(p))
1198         return FALSE;
1199     if (!PL_utf8_alnum)
1200         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1201          * descendant of isalnum(3), in other words, it doesn't
1202          * contain the '_'. --jhi */
1203         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1204     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1205 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1206 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1207     if (!PL_utf8_alnum)
1208         PL_utf8_alnum = swash_init("utf8", "",
1209             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1210     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1211 #endif
1212 }
1213
1214 bool
1215 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1216 {
1217     if (!is_utf8_char(p))
1218         return FALSE;
1219     if (!PL_utf8_alnum)
1220         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1221     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1222 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1223 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1224     if (!PL_utf8_alnum)
1225         PL_utf8_alnum = swash_init("utf8", "",
1226             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1227     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1228 #endif
1229 }
1230
1231 bool
1232 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1233 {
1234     if (*p == '_')
1235         return TRUE;
1236     if (!is_utf8_char(p))
1237         return FALSE;
1238     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1239         PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1240     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1241 }
1242
1243 bool
1244 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1245 {
1246     if (*p == '_')
1247         return TRUE;
1248     if (!is_utf8_char(p))
1249         return FALSE;
1250     if (!PL_utf8_idcont)
1251         PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1252     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1253 }
1254
1255 bool
1256 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1257 {
1258     if (!is_utf8_char(p))
1259         return FALSE;
1260     if (!PL_utf8_alpha)
1261         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1262     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1263 }
1264
1265 bool
1266 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1267 {
1268     if (!is_utf8_char(p))
1269         return FALSE;
1270     if (!PL_utf8_ascii)
1271         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1272     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1273 }
1274
1275 bool
1276 Perl_is_utf8_space(pTHX_ const U8 *p)
1277 {
1278     if (!is_utf8_char(p))
1279         return FALSE;
1280     if (!PL_utf8_space)
1281         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1282     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1283 }
1284
1285 bool
1286 Perl_is_utf8_digit(pTHX_ const U8 *p)
1287 {
1288     if (!is_utf8_char(p))
1289         return FALSE;
1290     if (!PL_utf8_digit)
1291         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1292     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1293 }
1294
1295 bool
1296 Perl_is_utf8_upper(pTHX_ const U8 *p)
1297 {
1298     if (!is_utf8_char(p))
1299         return FALSE;
1300     if (!PL_utf8_upper)
1301         PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1302     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1303 }
1304
1305 bool
1306 Perl_is_utf8_lower(pTHX_ const U8 *p)
1307 {
1308     if (!is_utf8_char(p))
1309         return FALSE;
1310     if (!PL_utf8_lower)
1311         PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1312     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1313 }
1314
1315 bool
1316 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1317 {
1318     if (!is_utf8_char(p))
1319         return FALSE;
1320     if (!PL_utf8_cntrl)
1321         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1322     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1323 }
1324
1325 bool
1326 Perl_is_utf8_graph(pTHX_ const U8 *p)
1327 {
1328     if (!is_utf8_char(p))
1329         return FALSE;
1330     if (!PL_utf8_graph)
1331         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1332     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1333 }
1334
1335 bool
1336 Perl_is_utf8_print(pTHX_ const U8 *p)
1337 {
1338     if (!is_utf8_char(p))
1339         return FALSE;
1340     if (!PL_utf8_print)
1341         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1342     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1343 }
1344
1345 bool
1346 Perl_is_utf8_punct(pTHX_ const U8 *p)
1347 {
1348     if (!is_utf8_char(p))
1349         return FALSE;
1350     if (!PL_utf8_punct)
1351         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1352     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1353 }
1354
1355 bool
1356 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1357 {
1358     if (!is_utf8_char(p))
1359         return FALSE;
1360     if (!PL_utf8_xdigit)
1361         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1362     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1363 }
1364
1365 bool
1366 Perl_is_utf8_mark(pTHX_ const U8 *p)
1367 {
1368     if (!is_utf8_char(p))
1369         return FALSE;
1370     if (!PL_utf8_mark)
1371         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1372     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1373 }
1374
1375 /*
1376 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1377
1378 The "p" contains the pointer to the UTF-8 string encoding
1379 the character that is being converted.
1380
1381 The "ustrp" is a pointer to the character buffer to put the
1382 conversion result to.  The "lenp" is a pointer to the length
1383 of the result.
1384
1385 The "swashp" is a pointer to the swash to use.
1386
1387 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1388 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1389 but not always, a multicharacter mapping), is tried first.
1390
1391 The "special" is a string like "utf8::ToSpecLower", which means the
1392 hash %utf8::ToSpecLower.  The access to the hash is through
1393 Perl_to_utf8_case().
1394
1395 The "normal" is a string like "ToLower" which means the swash
1396 %utf8::ToLower.
1397
1398 =cut */
1399
1400 UV
1401 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special)
1402 {
1403     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1404     STRLEN len = 0;
1405
1406     const UV uv0 = utf8_to_uvchr(p, 0);
1407     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1408      * are necessary in EBCDIC, they are redundant no-ops
1409      * in ASCII-ish platforms, and hopefully optimized away. */
1410     const UV uv1 = NATIVE_TO_UNI(uv0);
1411     uvuni_to_utf8(tmpbuf, uv1);
1412
1413     if (!*swashp) /* load on-demand */
1414          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1415
1416     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1417     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1418          /* It might be "special" (sometimes, but not always,
1419           * a multicharacter mapping) */
1420          HV *hv;
1421          SV **svp;
1422
1423          if ((hv  = get_hv(special, FALSE)) &&
1424              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1425              (*svp)) {
1426               char *s;
1427
1428               s = SvPV(*svp, len);
1429               if (len == 1)
1430                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1431               else {
1432 #ifdef EBCDIC
1433                    /* If we have EBCDIC we need to remap the characters
1434                     * since any characters in the low 256 are Unicode
1435                     * code points, not EBCDIC. */
1436                    U8 *t = (U8*)s, *tend = t + len, *d;
1437
1438                    d = tmpbuf;
1439                    if (SvUTF8(*svp)) {
1440                         STRLEN tlen = 0;
1441
1442                         while (t < tend) {
1443                              UV c = utf8_to_uvchr(t, &tlen);
1444                              if (tlen > 0) {
1445                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1446                                   t += tlen;
1447                              }
1448                              else
1449                                   break;
1450                         }
1451                    }
1452                    else {
1453                         while (t < tend) {
1454                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1455                              t++;
1456                         }
1457                    }
1458                    len = d - tmpbuf;
1459                    Copy(tmpbuf, ustrp, len, U8);
1460 #else
1461                    Copy(s, ustrp, len, U8);
1462 #endif
1463               }
1464          }
1465     }
1466
1467     if (!len && *swashp) {
1468          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1469
1470          if (uv2) {
1471               /* It was "normal" (a single character mapping). */
1472               UV uv3 = UNI_TO_NATIVE(uv2);
1473
1474               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1475          }
1476     }
1477
1478     if (!len) /* Neither: just copy. */
1479          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1480
1481     if (lenp)
1482          *lenp = len;
1483
1484     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1485 }
1486
1487 /*
1488 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1489
1490 Convert the UTF-8 encoded character at p to its uppercase version and
1491 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1492 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1493 the uppercase version may be longer than the original character.
1494
1495 The first character of the uppercased version is returned
1496 (but note, as explained above, that there may be more.)
1497
1498 =cut */
1499
1500 UV
1501 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1502 {
1503     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1504                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1505 }
1506
1507 /*
1508 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1509
1510 Convert the UTF-8 encoded character at p to its titlecase version and
1511 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1512 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1513 titlecase version may be longer than the original character.
1514
1515 The first character of the titlecased version is returned
1516 (but note, as explained above, that there may be more.)
1517
1518 =cut */
1519
1520 UV
1521 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1522 {
1523     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1524                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1525 }
1526
1527 /*
1528 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1529
1530 Convert the UTF-8 encoded character at p to its lowercase version and
1531 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1532 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1533 lowercase version may be longer than the original character.
1534
1535 The first character of the lowercased version is returned
1536 (but note, as explained above, that there may be more.)
1537
1538 =cut */
1539
1540 UV
1541 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1542 {
1543     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1544                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1545 }
1546
1547 /*
1548 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1549
1550 Convert the UTF-8 encoded character at p to its foldcase version and
1551 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1552 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1553 foldcase version may be longer than the original character (up to
1554 three characters).
1555
1556 The first character of the foldcased version is returned
1557 (but note, as explained above, that there may be more.)
1558
1559 =cut */
1560
1561 UV
1562 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1563 {
1564     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1565                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1566 }
1567
1568 /* a "swash" is a swatch hash */
1569
1570 SV*
1571 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1572 {
1573     dVAR;
1574     SV* retval;
1575     SV* tokenbufsv = sv_newmortal();
1576     dSP;
1577     const size_t pkg_len = strlen(pkg);
1578     const size_t name_len = strlen(name);
1579     HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
1580     SV* errsv_save;
1581
1582     PUSHSTACKi(PERLSI_MAGIC);
1583     ENTER;
1584     SAVEI32(PL_hints);
1585     PL_hints = 0;
1586     save_re_context();
1587     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1588         ENTER;
1589         errsv_save = newSVsv(ERRSV);
1590         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1591                          Nullsv);
1592         if (!SvTRUE(ERRSV))
1593             sv_setsv(ERRSV, errsv_save);
1594         SvREFCNT_dec(errsv_save);
1595         LEAVE;
1596     }
1597     SPAGAIN;
1598     PUSHMARK(SP);
1599     EXTEND(SP,5);
1600     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1601     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1602     PUSHs(listsv);
1603     PUSHs(sv_2mortal(newSViv(minbits)));
1604     PUSHs(sv_2mortal(newSViv(none)));
1605     PUTBACK;
1606     if (IN_PERL_COMPILETIME) {
1607         /* XXX ought to be handled by lex_start */
1608         SAVEI32(PL_in_my);
1609         PL_in_my = 0;
1610         sv_setpv(tokenbufsv, PL_tokenbuf);
1611     }
1612     errsv_save = newSVsv(ERRSV);
1613     if (call_method("SWASHNEW", G_SCALAR))
1614         retval = newSVsv(*PL_stack_sp--);
1615     else
1616         retval = &PL_sv_undef;
1617     if (!SvTRUE(ERRSV))
1618         sv_setsv(ERRSV, errsv_save);
1619     SvREFCNT_dec(errsv_save);
1620     LEAVE;
1621     POPSTACK;
1622     if (IN_PERL_COMPILETIME) {
1623         STRLEN len;
1624         const char* pv = SvPV(tokenbufsv, len);
1625
1626         Copy(pv, PL_tokenbuf, len+1, char);
1627         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1628     }
1629     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1630         if (SvPOK(retval))
1631             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1632                        retval);
1633         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1634     }
1635     return retval;
1636 }
1637
1638
1639 /* This API is wrong for special case conversions since we may need to
1640  * return several Unicode characters for a single Unicode character
1641  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1642  * the lower-level routine, and it is similarly broken for returning
1643  * multiple values.  --jhi */
1644 UV
1645 Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
1646 {
1647     dVAR;
1648     HV* hv = (HV*)SvRV(sv);
1649     U32 klen;
1650     U32 off;
1651     STRLEN slen;
1652     STRLEN needents;
1653     U8 *tmps = NULL;
1654     U32 bit;
1655     SV *retval;
1656     U8 tmputf8[2];
1657     UV c = NATIVE_TO_ASCII(*ptr);
1658
1659     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1660         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1661         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1662         ptr = tmputf8;
1663     }
1664     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1665      * then the "swatch" is a vec() for al the chars which start
1666      * with 0xAA..0xYY
1667      * So the key in the hash (klen) is length of encoded char -1
1668      */
1669     klen = UTF8SKIP(ptr) - 1;
1670     off  = ptr[klen];
1671
1672     if (klen == 0)
1673      {
1674       /* If char in invariant then swatch is for all the invariant chars
1675        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1676        */
1677       needents = UTF_CONTINUATION_MARK;
1678       off      = NATIVE_TO_UTF(ptr[klen]);
1679      }
1680     else
1681      {
1682       /* If char is encoded then swatch is for the prefix */
1683       needents = (1 << UTF_ACCUMULATION_SHIFT);
1684       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1685      }
1686
1687     /*
1688      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1689      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1690      * it's nothing to sniff at.)  Pity we usually come through at least
1691      * two function calls to get here...
1692      *
1693      * NB: this code assumes that swatches are never modified, once generated!
1694      */
1695
1696     if (hv   == PL_last_swash_hv &&
1697         klen == PL_last_swash_klen &&
1698         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1699     {
1700         tmps = PL_last_swash_tmps;
1701         slen = PL_last_swash_slen;
1702     }
1703     else {
1704         /* Try our second-level swatch cache, kept in a hash. */
1705         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1706
1707         /* If not cached, generate it via utf8::SWASHGET */
1708         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1709             dSP;
1710             /* We use utf8n_to_uvuni() as we want an index into
1711                Unicode tables, not a native character number.
1712              */
1713             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1714                                            ckWARN(WARN_UTF8) ?
1715                                            0 : UTF8_ALLOW_ANY);
1716             SV *errsv_save;
1717             ENTER;
1718             SAVETMPS;
1719             save_re_context();
1720             PUSHSTACKi(PERLSI_MAGIC);
1721             PUSHMARK(SP);
1722             EXTEND(SP,3);
1723             PUSHs((SV*)sv);
1724             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1725             PUSHs(sv_2mortal(newSViv((klen) ?
1726                                      (code_point & ~(needents - 1)) : 0)));
1727             PUSHs(sv_2mortal(newSViv(needents)));
1728             PUTBACK;
1729             errsv_save = newSVsv(ERRSV);
1730             if (call_method("SWASHGET", G_SCALAR))
1731                 retval = newSVsv(*PL_stack_sp--);
1732             else
1733                 retval = &PL_sv_undef;
1734             if (!SvTRUE(ERRSV))
1735                 sv_setsv(ERRSV, errsv_save);
1736             SvREFCNT_dec(errsv_save);
1737             POPSTACK;
1738             FREETMPS;
1739             LEAVE;
1740             if (IN_PERL_COMPILETIME)
1741                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1742
1743             svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
1744
1745             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1746                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1747         }
1748
1749         PL_last_swash_hv = hv;
1750         PL_last_swash_klen = klen;
1751         PL_last_swash_tmps = tmps;
1752         PL_last_swash_slen = slen;
1753         if (klen)
1754             Copy(ptr, PL_last_swash_key, klen, U8);
1755     }
1756
1757     switch ((int)((slen << 3) / needents)) {
1758     case 1:
1759         bit = 1 << (off & 7);
1760         off >>= 3;
1761         return (tmps[off] & bit) != 0;
1762     case 8:
1763         return tmps[off];
1764     case 16:
1765         off <<= 1;
1766         return (tmps[off] << 8) + tmps[off + 1] ;
1767     case 32:
1768         off <<= 2;
1769         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1770     }
1771     Perl_croak(aTHX_ "panic: swash_fetch");
1772     return 0;
1773 }
1774
1775
1776 /*
1777 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1778
1779 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1780 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
1781 bytes available. The return value is the pointer to the byte after the
1782 end of the new character. In other words,
1783
1784     d = uvchr_to_utf8(d, uv);
1785
1786 is the recommended wide native character-aware way of saying
1787
1788     *(d++) = uv;
1789
1790 =cut
1791 */
1792
1793 /* On ASCII machines this is normally a macro but we want a
1794    real function in case XS code wants it
1795 */
1796 #undef Perl_uvchr_to_utf8
1797 U8 *
1798 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1799 {
1800     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1801 }
1802
1803 U8 *
1804 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1805 {
1806     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1807 }
1808
1809 /*
1810 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1811
1812 Returns the native character value of the first character in the string C<s>
1813 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1814 length, in bytes, of that character.
1815
1816 Allows length and flags to be passed to low level routine.
1817
1818 =cut
1819 */
1820 /* On ASCII machines this is normally a macro but we want
1821    a real function in case XS code wants it
1822 */
1823 #undef Perl_utf8n_to_uvchr
1824 UV
1825 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1826 {
1827     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1828     return UNI_TO_NATIVE(uv);
1829 }
1830
1831 /*
1832 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1833
1834 Build to the scalar dsv a displayable version of the string spv,
1835 length len, the displayable version being at most pvlim bytes long
1836 (if longer, the rest is truncated and "..." will be appended).
1837
1838 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1839 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1840 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1841 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1842 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1843 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1844
1845 The pointer to the PV of the dsv is returned.
1846
1847 =cut */
1848 char *
1849 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1850 {
1851     int truncated = 0;
1852     const char *s, *e;
1853
1854     sv_setpvn(dsv, "", 0);
1855     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1856          UV u;
1857           /* This serves double duty as a flag and a character to print after
1858              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1859           */
1860          char ok = 0;
1861
1862          if (pvlim && SvCUR(dsv) >= pvlim) {
1863               truncated++;
1864               break;
1865          }
1866          u = utf8_to_uvchr((U8*)s, 0);
1867          if (u < 256) {
1868              unsigned char c = (unsigned char)u & 0xFF;
1869              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1870                  switch (c) {
1871                  case '\n':
1872                      ok = 'n'; break;
1873                  case '\r':
1874                      ok = 'r'; break;
1875                  case '\t':
1876                      ok = 't'; break;
1877                  case '\f':
1878                      ok = 'f'; break;
1879                  case '\a':
1880                      ok = 'a'; break;
1881                  case '\\':
1882                      ok = '\\'; break;
1883                  default: break;
1884                  }
1885                  if (ok) {
1886                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1887                  }
1888              }
1889              /* isPRINT() is the locale-blind version. */
1890              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1891                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1892                  ok = 1;
1893              }
1894          }
1895          if (!ok)
1896              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1897     }
1898     if (truncated)
1899          sv_catpvn(dsv, "...", 3);
1900
1901     return SvPVX(dsv);
1902 }
1903
1904 /*
1905 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1906
1907 Build to the scalar dsv a displayable version of the scalar sv,
1908 the displayable version being at most pvlim bytes long
1909 (if longer, the rest is truncated and "..." will be appended).
1910
1911 The flags argument is as in pv_uni_display().
1912
1913 The pointer to the PV of the dsv is returned.
1914
1915 =cut */
1916 char *
1917 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1918 {
1919      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1920                                 pvlim, flags);
1921 }
1922
1923 /*
1924 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1925
1926 Return true if the strings s1 and s2 differ case-insensitively, false
1927 if not (if they are equal case-insensitively).  If u1 is true, the
1928 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1929 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1930 are false, the respective string is assumed to be in native 8-bit
1931 encoding.
1932
1933 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1934 in there (they will point at the beginning of the I<next> character).
1935 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1936 pointers beyond which scanning will not continue under any
1937 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1938 s2+l2 will be used as goal end pointers that will also stop the scan,
1939 and which qualify towards defining a successful match: all the scans
1940 that define an explicit length must reach their goal pointers for
1941 a match to succeed).
1942
1943 For case-insensitiveness, the "casefolding" of Unicode is used
1944 instead of upper/lowercasing both the characters, see
1945 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1946
1947 =cut */
1948 I32
1949 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1950 {
1951      register const U8 *p1  = (const U8*)s1;
1952      register const U8 *p2  = (const U8*)s2;
1953      register const U8 *f1 = 0, *f2 = 0;
1954      register U8 *e1 = 0, *q1 = 0;
1955      register U8 *e2 = 0, *q2 = 0;
1956      STRLEN n1 = 0, n2 = 0;
1957      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1958      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
1959      U8 natbuf[1+1];
1960      STRLEN foldlen1, foldlen2;
1961      bool match;
1962
1963      if (pe1)
1964           e1 = *(U8**)pe1;
1965      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1966           f1 = (const U8*)s1 + l1;
1967      if (pe2)
1968           e2 = *(U8**)pe2;
1969      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1970           f2 = (const U8*)s2 + l2;
1971
1972      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1973           return 1; /* mismatch; possible infinite loop or false positive */
1974
1975      if (!u1 || !u2)
1976           natbuf[1] = 0; /* Need to terminate the buffer. */
1977
1978      while ((e1 == 0 || p1 < e1) &&
1979             (f1 == 0 || p1 < f1) &&
1980             (e2 == 0 || p2 < e2) &&
1981             (f2 == 0 || p2 < f2)) {
1982           if (n1 == 0) {
1983                if (u1)
1984                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1985                else {
1986                     natbuf[0] = *p1;
1987                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1988                }
1989                q1 = foldbuf1;
1990                n1 = foldlen1;
1991           }
1992           if (n2 == 0) {
1993                if (u2)
1994                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1995                else {
1996                     natbuf[0] = *p2;
1997                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1998                }
1999                q2 = foldbuf2;
2000                n2 = foldlen2;
2001           }
2002           while (n1 && n2) {
2003                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2004                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2005                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2006                    return 1; /* mismatch */
2007                n1 -= UTF8SKIP(q1);
2008                q1 += UTF8SKIP(q1);
2009                n2 -= UTF8SKIP(q2);
2010                q2 += UTF8SKIP(q2);
2011           }
2012           if (n1 == 0)
2013                p1 += u1 ? UTF8SKIP(p1) : 1;
2014           if (n2 == 0)
2015                p2 += u2 ? UTF8SKIP(p2) : 1;
2016
2017      }
2018
2019      /* A match is defined by all the scans that specified
2020       * an explicit length reaching their final goals. */
2021      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2022
2023      if (match) {
2024           if (pe1)
2025                *pe1 = (char*)p1;
2026           if (pe2)
2027                *pe2 = (char*)p2;
2028      }
2029
2030      return match ? 0 : 1; /* 0 match, 1 mismatch */
2031 }
2032
2033 /*
2034  * Local variables:
2035  * c-indentation-style: bsd
2036  * c-basic-offset: 4
2037  * indent-tabs-mode: t
2038  * End:
2039  *
2040  * vim: shiftwidth=4:
2041 */