utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
   4  *    others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
   43 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 /*
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 This is the "slow" version as opposed to the "fast" version which is
 178 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 179 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 181 you should use the _slow().  In practice this means that the _slow()
 182 will be used very rarely, since the maximum Unicode code point (as of
 183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 185 five bytes or more.
 186
 187 =cut */
 188 STATIC STRLEN
 189 S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
 190 {
 191     U8 u = *s;
 192     STRLEN slen;
 193     UV uv, ouv;
 194
 195     if (UTF8_IS_INVARIANT(u))
 196         return 1;
 197
 198     if (!UTF8_IS_START(u))
 199         return 0;
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206 #ifdef EBCDIC
 207     u = NATIVE_TO_UTF(u);
 208 #endif
 209     u &= UTF_START_MASK(len);
 210     uv  = u;
 211     ouv = uv;
 212     while (slen--) {
 213         if (!UTF8_IS_CONTINUATION(*s))
 214             return 0;
 215         uv = UTF8_ACCUMULATE(uv, *s);
 216         if (uv < ouv)
 217             return 0;
 218         ouv = uv;
 219         s++;
 220     }
 221
 222     if ((STRLEN)UNISKIP(uv) < len)
 223         return 0;
 224
 225     return len;
 226 }
 227
 228 /*
 229 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 230
 231 Tests if some arbitrary number of bytes begins in a valid UTF-8
 232 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 233 UTF-8 character.  The actual number of bytes in the UTF-8 character
 234 will be returned if it is valid, otherwise 0.
 235
 236 =cut */
 237 STRLEN
 238 Perl_is_utf8_char(pTHX_ const U8 *s)
 239 {
 240     const STRLEN len = UTF8SKIP(s);
 241 #ifdef IS_UTF8_CHAR
 242     if (IS_UTF8_CHAR_FAST(len))
 243         return IS_UTF8_CHAR(s, len) ? len : 0;
 244 #endif /* #ifdef IS_UTF8_CHAR */
 245     return is_utf8_char_slow(s, len);
 246 }
 247
 248 /*
 249 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 250
 251 Returns true if first C<len> bytes of the given string form a valid
 252 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 253 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 254 because a valid ASCII string is a valid UTF-8 string.
 255
 256 See also is_utf8_string_loclen() and is_utf8_string_loc().
 257
 258 =cut
 259 */
 260
 261 bool
 262 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 263 {
 264     const U8* x = s;
 265     const U8* send;
 266
 267     if (!len)
 268         len = strlen((const char *)s);
 269     send = s + len;
 270
 271     while (x < send) {
 272         STRLEN c;
 273          /* Inline the easy bits of is_utf8_char() here for speed... */
 274          if (UTF8_IS_INVARIANT(*x))
 275               c = 1;
 276          else if (!UTF8_IS_START(*x))
 277              goto out;
 278          else {
 279               /* ... and call is_utf8_char() only if really needed. */
 280 #ifdef IS_UTF8_CHAR
 281              c = UTF8SKIP(x);
 282              if (IS_UTF8_CHAR_FAST(c)) {
 283                  if (!IS_UTF8_CHAR(x, c))
 284                      goto out;
 285              } else if (!is_utf8_char_slow(x, c))
 286                  goto out;
 287 #else
 288              c = is_utf8_char(x);
 289 #endif /* #ifdef IS_UTF8_CHAR */
 290               if (!c)
 291                   goto out;
 292          }
 293         x += c;
 294     }
 295
 296  out:
 297     if (x != send)
 298         return FALSE;
 299
 300     return TRUE;
 301 }
 302
 303 /*
 304 Implemented as a macro in utf8.h
 305
 306 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 307
 308 Like is_utf8_string() but stores the location of the failure (in the
 309 case of "utf8ness failure") or the location s+len (in the case of
 310 "utf8ness success") in the C<ep>.
 311
 312 See also is_utf8_string_loclen() and is_utf8_string().
 313
  314 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|STRLEN *el
 315
 316 Like is_utf8_string() but stores the location of the failure (in the
 317 case of "utf8ness failure") or the location s+len (in the case of
 318 "utf8ness success") in the C<ep>, and the number of UTF-8
 319 encoded characters in the C<el>.
 320
 321 See also is_utf8_string_loc() and is_utf8_string().
 322
 323 =cut
 324 */
 325
 326 bool
 327 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 328 {
 329     const U8* x = s;
 330     const U8* send;
 331     STRLEN c;
 332
 333     if (!len)
 334         len = strlen((const char *)s);
 335     send = s + len;
 336     if (el)
 337         *el = 0;
 338
 339     while (x < send) {
 340          /* Inline the easy bits of is_utf8_char() here for speed... */
 341          if (UTF8_IS_INVARIANT(*x))
 342              c = 1;
 343          else if (!UTF8_IS_START(*x))
 344              goto out;
 345          else {
 346              /* ... and call is_utf8_char() only if really needed. */
 347 #ifdef IS_UTF8_CHAR
 348              c = UTF8SKIP(x);
 349              if (IS_UTF8_CHAR_FAST(c)) {
 350                  if (!IS_UTF8_CHAR(x, c))
 351                      c = 0;
 352              } else
 353                  c = is_utf8_char_slow(x, c);
 354 #else
 355              c = is_utf8_char(x);
 356 #endif /* #ifdef IS_UTF8_CHAR */
 357              if (!c)
 358                  goto out;
 359          }
 360          x += c;
 361          if (el)
 362              (*el)++;
 363     }
 364
 365  out:
 366     if (ep)
 367         *ep = x;
 368     if (x != send)
 369         return FALSE;
 370
 371     return TRUE;
 372 }
 373
 374 /*
 375
 376 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 377
 378 Bottom level UTF-8 decode routine.
 379 Returns the unicode code point value of the first character in the string C<s>
 380 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 381 C<retlen> will be set to the length, in bytes, of that character.
 382
 383 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 384 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 385 it is assumed that the caller will raise a warning, and this function
 386 will silently just set C<retlen> to C<-1> and return zero.  If the
 387 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 388 malformations will be given, C<retlen> will be set to the expected
 389 length of the UTF-8 character in bytes, and zero will be returned.
 390
 391 The C<flags> can also contain various flags to allow deviations from
 392 the strict UTF-8 encoding (see F<utf8.h>).
 393
 394 Most code should use utf8_to_uvchr() rather than call this directly.
 395
 396 =cut
 397 */
 398
 399 UV
 400 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 401 {
 402     const U8 *s0 = s;
 403     UV uv = *s, ouv = 0;
 404     STRLEN len = 1;
 405     const bool dowarn = ckWARN_d(WARN_UTF8);
 406     const UV startbyte = *s;
 407     STRLEN expectlen = 0;
 408     U32 warning = 0;
 409
 410 /* This list is a superset of the UTF8_ALLOW_XXX. */
 411
 412 #define UTF8_WARN_EMPTY                          1
 413 #define UTF8_WARN_CONTINUATION                   2
 414 #define UTF8_WARN_NON_CONTINUATION               3
 415 #define UTF8_WARN_FE_FF                          4
 416 #define UTF8_WARN_SHORT                          5
 417 #define UTF8_WARN_OVERFLOW                       6
 418 #define UTF8_WARN_SURROGATE                      7
 419 #define UTF8_WARN_LONG                           8
 420 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 421
 422     if (curlen == 0 &&
 423         !(flags & UTF8_ALLOW_EMPTY)) {
 424         warning = UTF8_WARN_EMPTY;
 425         goto malformed;
 426     }
 427
 428     if (UTF8_IS_INVARIANT(uv)) {
 429         if (retlen)
 430             *retlen = 1;
 431         return (UV) (NATIVE_TO_UTF(*s));
 432     }
 433
 434     if (UTF8_IS_CONTINUATION(uv) &&
 435         !(flags & UTF8_ALLOW_CONTINUATION)) {
 436         warning = UTF8_WARN_CONTINUATION;
 437         goto malformed;
 438     }
 439
 440     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 441         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 442         warning = UTF8_WARN_NON_CONTINUATION;
 443         goto malformed;
 444     }
 445
 446 #ifdef EBCDIC
 447     uv = NATIVE_TO_UTF(uv);
 448 #else
 449     if ((uv == 0xfe || uv == 0xff) &&
 450         !(flags & UTF8_ALLOW_FE_FF)) {
 451         warning = UTF8_WARN_FE_FF;
 452         goto malformed;
 453     }
 454 #endif
 455
 456     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 457     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 458     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 459     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 460 #ifdef EBCDIC
 461     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 462     else                        { len =  7; uv &= 0x01; }
 463 #else
 464     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 465     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 466     else                        { len = 13; uv = 0; } /* whoa! */
 467 #endif
 468
 469     if (retlen)
 470         *retlen = len;
 471
 472     expectlen = len;
 473
 474     if ((curlen < expectlen) &&
 475         !(flags & UTF8_ALLOW_SHORT)) {
 476         warning = UTF8_WARN_SHORT;
 477         goto malformed;
 478     }
 479
 480     len--;
 481     s++;
 482     ouv = uv;
 483
 484     while (len--) {
 485         if (!UTF8_IS_CONTINUATION(*s) &&
 486             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 487             s--;
 488             warning = UTF8_WARN_NON_CONTINUATION;
 489             goto malformed;
 490         }
 491         else
 492             uv = UTF8_ACCUMULATE(uv, *s);
 493         if (!(uv > ouv)) {
 494             /* These cannot be allowed. */
 495             if (uv == ouv) {
 496                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 497                     warning = UTF8_WARN_LONG;
 498                     goto malformed;
 499                 }
 500             }
 501             else { /* uv < ouv */
 502                 /* This cannot be allowed. */
 503                 warning = UTF8_WARN_OVERFLOW;
 504                 goto malformed;
 505             }
 506         }
 507         s++;
 508         ouv = uv;
 509     }
 510
 511     if (UNICODE_IS_SURROGATE(uv) &&
 512         !(flags & UTF8_ALLOW_SURROGATE)) {
 513         warning = UTF8_WARN_SURROGATE;
 514         goto malformed;
 515     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 516                !(flags & UTF8_ALLOW_LONG)) {
 517         warning = UTF8_WARN_LONG;
 518         goto malformed;
 519     } else if (UNICODE_IS_ILLEGAL(uv) &&
 520                !(flags & UTF8_ALLOW_FFFF)) {
 521         warning = UTF8_WARN_FFFF;
 522         goto malformed;
 523     }
 524
 525     return uv;
 526
 527 malformed:
 528
 529     if (flags & UTF8_CHECK_ONLY) {
 530         if (retlen)
 531             *retlen = -1;
 532         return 0;
 533     }
 534
 535     if (dowarn) {
 536         SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 537
 538         switch (warning) {
 539         case 0: /* Intentionally empty. */ break;
 540         case UTF8_WARN_EMPTY:
 541             Perl_sv_catpv(aTHX_ sv, "(empty string)");
 542             break;
 543         case UTF8_WARN_CONTINUATION:
 544             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 545             break;
 546         case UTF8_WARN_NON_CONTINUATION:
 547             if (s == s0)
 548                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 549                            (UV)s[1], startbyte);
 550             else {
 551                 const int len = (int)(s-s0);
 552                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 553                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 554             }
 555
 556             break;
 557         case UTF8_WARN_FE_FF:
 558             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 559             break;
 560         case UTF8_WARN_SHORT:
 561             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 562                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 563             expectlen = curlen;         /* distance for caller to skip */
 564             break;
 565         case UTF8_WARN_OVERFLOW:
 566             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 567                            ouv, *s, startbyte);
 568             break;
 569         case UTF8_WARN_SURROGATE:
 570             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 571             break;
 572         case UTF8_WARN_LONG:
 573             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 574                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 575             break;
 576         case UTF8_WARN_FFFF:
 577             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 578             break;
 579         default:
 580             Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
 581             break;
 582         }
 583
 584         if (warning) {
 585             const char * const s = SvPVX_const(sv);
 586
 587             if (PL_op)
 588                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 589                             "%s in %s", s,  OP_DESC(PL_op));
 590             else
 591                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 592         }
 593     }
 594
 595     if (retlen)
 596         *retlen = expectlen ? expectlen : len;
 597
 598     return 0;
 599 }
 600
 601 /*
 602 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 603
 604 Returns the native character value of the first character in the string C<s>
 605 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 606 length, in bytes, of that character.
 607
 608 If C<s> does not point to a well-formed UTF-8 character, zero is
 609 returned and retlen is set, if possible, to -1.
 610
 611 =cut
 612 */
 613
 614 UV
 615 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 616 {
 617     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
 618                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 619 }
 620
 621 /*
 622 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 623
 624 Returns the Unicode code point of the first character in the string C<s>
 625 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 626 length, in bytes, of that character.
 627
 628 This function should only be used when returned UV is considered
 629 an index into the Unicode semantic tables (e.g. swashes).
 630
 631 If C<s> does not point to a well-formed UTF-8 character, zero is
 632 returned and retlen is set, if possible, to -1.
 633
 634 =cut
 635 */
 636
 637 UV
 638 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 639 {
 640     /* Call the low level routine asking for checks */
 641     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 642                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 643 }
 644
 645 /*
 646 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 647
 648 Return the length of the UTF-8 char encoded string C<s> in characters.
  649 Stops just before C<e>.  If C<e E<lt> s> or if the scan would end up past
  650 C<e>, issues a UTF-8 warning (if enabled) and returns the count so far.
 651
 652 =cut
 653 */
 654
 655 STRLEN
 656 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 657 {
 658     STRLEN len = 0;
 659
 660     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 661      * the bitops (especially ~) can create illegal UTF-8.
 662      * In other words: in Perl UTF-8 is not just for Unicode. */
 663
 664     if (e < s)
 665         goto warn_and_return;
 666     while (s < e) {
 667         const U8 t = UTF8SKIP(s);
 668         if (e - s < t) {
 669             warn_and_return:
 670             if (ckWARN_d(WARN_UTF8)) {
 671                 if (PL_op)
 672                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 673                             "%s in %s", unees, OP_DESC(PL_op));
 674                 else
 675                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 676             }
 677             return len;
 678         }
 679         s += t;
 680         len++;
 681     }
 682
 683     return len;
 684 }
 685
 686 /*
 687 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 688
 689 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 690 and C<b>.
 691
 692 WARNING: use only if you *know* that the pointers point inside the
 693 same UTF-8 buffer.
 694
 695 =cut
 696 */
 697
 698 IV
 699 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 700 {
 701     IV off = 0;
 702
 703     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 704      * the bitops (especially ~) can create illegal UTF-8.
 705      * In other words: in Perl UTF-8 is not just for Unicode. */
 706
 707     if (a < b) {
 708         while (a < b) {
 709             const U8 c = UTF8SKIP(a);
 710             if (b - a < c)
 711                 goto warn_and_return;
 712             a += c;
 713             off--;
 714         }
 715     }
 716     else {
 717         while (b < a) {
 718             const U8 c = UTF8SKIP(b);
 719
 720             if (a - b < c) {
 721                 warn_and_return:
 722                 if (ckWARN_d(WARN_UTF8)) {
 723                     if (PL_op)
 724                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 725                                     "%s in %s", unees, OP_DESC(PL_op));
 726                     else
 727                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 728                 }
 729                 return off;
 730             }
 731             b += c;
 732             off++;
 733         }
 734     }
 735
 736     return off;
 737 }
 738
 739 /*
 740 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 741
 742 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 743 forward or backward.
 744
 745 WARNING: do not use the following unless you *know* C<off> is within
 746 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 747 on the first byte of character or just after the last byte of a character.
 748
 749 =cut
 750 */
 751
 752 U8 *
 753 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 754 {
 755     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 756      * the bitops (especially ~) can create illegal UTF-8.
 757      * In other words: in Perl UTF-8 is not just for Unicode. */
 758
 759     if (off >= 0) {
 760         while (off--)
 761             s += UTF8SKIP(s);
 762     }
 763     else {
 764         while (off++) {
 765             s--;
 766             while (UTF8_IS_CONTINUATION(*s))
 767                 s--;
 768         }
 769     }
 770     return (U8 *)s;
 771 }
 772
 773 /*
 774 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 775
 776 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 777 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 778 updates len to contain the new length.
 779 Returns zero on failure, setting C<len> to -1.
 780
 781 =cut
 782 */
 783
 784 U8 *
 785 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 786 {
 787     U8 *send;
 788     U8 *d;
 789     U8 *save = s;
 790
 791     /* ensure valid UTF-8 and chars < 256 before updating string */
 792     for (send = s + *len; s < send; ) {
 793         U8 c = *s++;
 794
 795         if (!UTF8_IS_INVARIANT(c) &&
 796             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 797              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 798             *len = -1;
 799             return 0;
 800         }
 801     }
 802
 803     d = s = save;
 804     while (s < send) {
 805         STRLEN ulen;
 806         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 807         s += ulen;
 808     }
 809     *d = '\0';
 810     *len = d - save;
 811     return save;
 812 }
 813
 814 /*
 815 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 816
 817 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 818 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 819 the newly-created string, and updates C<len> to contain the new
 820 length.  Returns the original string if no conversion occurs, C<len>
 821 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 822 0 if C<s> is converted or contains all 7bit characters.
 823
 824 =cut
 825 */
 826
 827 U8 *
 828 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 829 {
 830     U8 *d;
 831     const U8 *start = s;
 832     const U8 *send;
 833     I32 count = 0;
 834
 835     if (!*is_utf8)
 836         return (U8 *)start;
 837
 838     /* ensure valid UTF-8 and chars < 256 before converting string */
 839     for (send = s + *len; s < send;) {
 840         U8 c = *s++;
 841         if (!UTF8_IS_INVARIANT(c)) {
 842             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 843                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 844                 count++;
 845             else
 846                 return (U8 *)start;
 847         }
 848     }
 849
 850     *is_utf8 = 0;
 851
 852     Newxz(d, (*len) - count + 1, U8);
 853     s = start; start = d;
 854     while (s < send) {
 855         U8 c = *s++;
 856         if (!UTF8_IS_INVARIANT(c)) {
 857             /* Then it is two-byte encoded */
 858             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 859             c = ASCII_TO_NATIVE(c);
 860         }
 861         *d++ = c;
 862     }
 863     *d = '\0';
 864     *len = d - start;
 865     return (U8 *)start;
 866 }
 867
 868 /*
 869 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 870
 871 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 872 Returns a pointer to the newly-created string, and sets C<len> to
 873 reflect the new length.
 874
 875 If you want to convert to UTF-8 from other encodings than ASCII,
 876 see sv_recode_to_utf8().
 877
 878 =cut
 879 */
 880
 881 U8*
 882 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 883 {
 884     const U8 * const send = s + (*len);
 885     U8 *d;
 886     U8 *dst;
 887
 888     Newxz(d, (*len) * 2 + 1, U8);
 889     dst = d;
 890
 891     while (s < send) {
 892         const UV uv = NATIVE_TO_ASCII(*s++);
 893         if (UNI_IS_INVARIANT(uv))
 894             *d++ = (U8)UTF_TO_NATIVE(uv);
 895         else {
 896             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 897             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 898         }
 899     }
 900     *d = '\0';
 901     *len = d-dst;
 902     return dst;
 903 }
 904
 905 /*
 906  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 907  *
 908  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 909  * We optimize for native, for obvious reasons. */
 910
 911 U8*
 912 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 913 {
 914     U8* pend;
 915     U8* dstart = d;
 916
 917     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 918          d[0] = 0;
 919          *newlen = 1;
 920          return d;
 921     }
 922
 923     if (bytelen & 1)
 924         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 925
 926     pend = p + bytelen;
 927
 928     while (p < pend) {
 929         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 930         p += 2;
 931         if (uv < 0x80) {
 932             *d++ = (U8)uv;
 933             continue;
 934         }
 935         if (uv < 0x800) {
 936             *d++ = (U8)(( uv >>  6)         | 0xc0);
 937             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 938             continue;
 939         }
 940         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 941             UV low = (p[0] << 8) + p[1];
 942             p += 2;
 943             if (low < 0xdc00 || low >= 0xdfff)
 944                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 945             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 946         }
 947         if (uv < 0x10000) {
 948             *d++ = (U8)(( uv >> 12)         | 0xe0);
 949             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 950             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 951             continue;
 952         }
 953         else {
 954             *d++ = (U8)(( uv >> 18)         | 0xf0);
 955             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 956             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 957             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 958             continue;
 959         }
 960     }
 961     *newlen = d - dstart;
 962     return d;
 963 }
 964
 965 /* Note: this one is slightly destructive of the source. */
 966
 967 U8*
 968 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 969 {
 970     U8* s = (U8*)p;
 971     U8* send = s + bytelen;
 972     while (s < send) {
 973         U8 tmp = s[0];
 974         s[0] = s[1];
 975         s[1] = tmp;
 976         s += 2;
 977     }
 978     return utf16_to_utf8(p, d, bytelen, newlen);
 979 }
 980
 981 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 982
 983 bool
 984 Perl_is_uni_alnum(pTHX_ UV c)
 985 {
 986     U8 tmpbuf[UTF8_MAXBYTES+1];
 987     uvchr_to_utf8(tmpbuf, c);
 988     return is_utf8_alnum(tmpbuf);
 989 }
 990
 991 bool
 992 Perl_is_uni_alnumc(pTHX_ UV c)
 993 {
 994     U8 tmpbuf[UTF8_MAXBYTES+1];
 995     uvchr_to_utf8(tmpbuf, c);
 996     return is_utf8_alnumc(tmpbuf);
 997 }
 998
 999 bool
1000 Perl_is_uni_idfirst(pTHX_ UV c)
1001 {
1002     U8 tmpbuf[UTF8_MAXBYTES+1];
1003     uvchr_to_utf8(tmpbuf, c);
1004     return is_utf8_idfirst(tmpbuf);
1005 }
1006
1007 bool
1008 Perl_is_uni_alpha(pTHX_ UV c)
1009 {
1010     U8 tmpbuf[UTF8_MAXBYTES+1];
1011     uvchr_to_utf8(tmpbuf, c);
1012     return is_utf8_alpha(tmpbuf);
1013 }
1014
1015 bool
1016 Perl_is_uni_ascii(pTHX_ UV c)
1017 {
1018     U8 tmpbuf[UTF8_MAXBYTES+1];
1019     uvchr_to_utf8(tmpbuf, c);
1020     return is_utf8_ascii(tmpbuf);
1021 }
1022
1023 bool
1024 Perl_is_uni_space(pTHX_ UV c)
1025 {
1026     U8 tmpbuf[UTF8_MAXBYTES+1];
1027     uvchr_to_utf8(tmpbuf, c);
1028     return is_utf8_space(tmpbuf);
1029 }
1030
1031 bool
1032 Perl_is_uni_digit(pTHX_ UV c)
1033 {
1034     U8 tmpbuf[UTF8_MAXBYTES+1];
1035     uvchr_to_utf8(tmpbuf, c);
1036     return is_utf8_digit(tmpbuf);
1037 }
1038
1039 bool
1040 Perl_is_uni_upper(pTHX_ UV c)
1041 {
1042     U8 tmpbuf[UTF8_MAXBYTES+1];
1043     uvchr_to_utf8(tmpbuf, c);
1044     return is_utf8_upper(tmpbuf);
1045 }
1046
1047 bool
1048 Perl_is_uni_lower(pTHX_ UV c)
1049 {
1050     U8 tmpbuf[UTF8_MAXBYTES+1];
1051     uvchr_to_utf8(tmpbuf, c);
1052     return is_utf8_lower(tmpbuf);
1053 }
1054
1055 bool
1056 Perl_is_uni_cntrl(pTHX_ UV c)
1057 {
1058     U8 tmpbuf[UTF8_MAXBYTES+1];
1059     uvchr_to_utf8(tmpbuf, c);
1060     return is_utf8_cntrl(tmpbuf);
1061 }
1062
1063 bool
1064 Perl_is_uni_graph(pTHX_ UV c)
1065 {
1066     U8 tmpbuf[UTF8_MAXBYTES+1];
1067     uvchr_to_utf8(tmpbuf, c);
1068     return is_utf8_graph(tmpbuf);
1069 }
1070
1071 bool
1072 Perl_is_uni_print(pTHX_ UV c)
1073 {
1074     U8 tmpbuf[UTF8_MAXBYTES+1];
1075     uvchr_to_utf8(tmpbuf, c);
1076     return is_utf8_print(tmpbuf);
1077 }
1078
1079 bool
1080 Perl_is_uni_punct(pTHX_ UV c)
1081 {
1082     U8 tmpbuf[UTF8_MAXBYTES+1];
1083     uvchr_to_utf8(tmpbuf, c);
1084     return is_utf8_punct(tmpbuf);
1085 }
1086
1087 bool
1088 Perl_is_uni_xdigit(pTHX_ UV c)
1089 {
1090     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1091     uvchr_to_utf8(tmpbuf, c);
1092     return is_utf8_xdigit(tmpbuf);
1093 }
1094
1095 UV
1096 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1097 {
1098     uvchr_to_utf8(p, c);
1099     return to_utf8_upper(p, p, lenp);
1100 }
1101
1102 UV
1103 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1104 {
1105     uvchr_to_utf8(p, c);
1106     return to_utf8_title(p, p, lenp);
1107 }
1108
1109 UV
1110 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1111 {
1112     uvchr_to_utf8(p, c);
1113     return to_utf8_lower(p, p, lenp);
1114 }
1115
1116 UV
1117 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1118 {
1119     uvchr_to_utf8(p, c);
1120     return to_utf8_fold(p, p, lenp);
1121 }
1122
1123 /* for now these all assume no locale info available for Unicode > 255 */
1124
1125 bool
1126 Perl_is_uni_alnum_lc(pTHX_ UV c)
1127 {
1128     return is_uni_alnum(c);     /* XXX no locale support yet */
1129 }
1130
1131 bool
1132 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1133 {
1134     return is_uni_alnumc(c);    /* XXX no locale support yet */
1135 }
1136
1137 bool
1138 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1139 {
1140     return is_uni_idfirst(c);   /* XXX no locale support yet */
1141 }
1142
1143 bool
1144 Perl_is_uni_alpha_lc(pTHX_ UV c)
1145 {
1146     return is_uni_alpha(c);     /* XXX no locale support yet */
1147 }
1148
1149 bool
1150 Perl_is_uni_ascii_lc(pTHX_ UV c)
1151 {
1152     return is_uni_ascii(c);     /* XXX no locale support yet */
1153 }
1154
1155 bool
1156 Perl_is_uni_space_lc(pTHX_ UV c)
1157 {
1158     return is_uni_space(c);     /* XXX no locale support yet */
1159 }
1160
1161 bool
1162 Perl_is_uni_digit_lc(pTHX_ UV c)
1163 {
1164     return is_uni_digit(c);     /* XXX no locale support yet */
1165 }
1166
1167 bool
1168 Perl_is_uni_upper_lc(pTHX_ UV c)
1169 {
1170     return is_uni_upper(c);     /* XXX no locale support yet */
1171 }
1172
1173 bool
1174 Perl_is_uni_lower_lc(pTHX_ UV c)
1175 {
1176     return is_uni_lower(c);     /* XXX no locale support yet */
1177 }
1178
1179 bool
1180 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1181 {
1182     return is_uni_cntrl(c);     /* XXX no locale support yet */
1183 }
1184
1185 bool
1186 Perl_is_uni_graph_lc(pTHX_ UV c)
1187 {
1188     return is_uni_graph(c);     /* XXX no locale support yet */
1189 }
1190
1191 bool
1192 Perl_is_uni_print_lc(pTHX_ UV c)
1193 {
1194     return is_uni_print(c);     /* XXX no locale support yet */
1195 }
1196
1197 bool
1198 Perl_is_uni_punct_lc(pTHX_ UV c)
1199 {
1200     return is_uni_punct(c);     /* XXX no locale support yet */
1201 }
1202
1203 bool
1204 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1205 {
1206     return is_uni_xdigit(c);    /* XXX no locale support yet */
1207 }
1208
1209 U32
1210 Perl_to_uni_upper_lc(pTHX_ U32 c)
1211 {
1212     /* XXX returns only the first character -- do not use XXX */
1213     /* XXX no locale support yet */
1214     STRLEN len;
1215     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1216     return (U32)to_uni_upper(c, tmpbuf, &len);
1217 }
1218
1219 U32
1220 Perl_to_uni_title_lc(pTHX_ U32 c)
1221 {
1222     /* XXX returns only the first character XXX -- do not use XXX */
1223     /* XXX no locale support yet */
1224     STRLEN len;
1225     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1226     return (U32)to_uni_title(c, tmpbuf, &len);
1227 }
1228
1229 U32
1230 Perl_to_uni_lower_lc(pTHX_ U32 c)
1231 {
1232     /* XXX returns only the first character -- do not use XXX */
1233     /* XXX no locale support yet */
1234     STRLEN len;
1235     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1236     return (U32)to_uni_lower(c, tmpbuf, &len);
1237 }
1238
1239 bool
1240 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1241 {
1242     if (!is_utf8_char(p))
1243         return FALSE;
1244     if (!PL_utf8_alnum)
1245         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1246          * descendant of isalnum(3), in other words, it doesn't
1247          * contain the '_'. --jhi */
1248         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1249     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1250 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1251 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1252     if (!PL_utf8_alnum)
1253         PL_utf8_alnum = swash_init("utf8", "",
1254             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1255     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1256 #endif
1257 }
1258
1259 bool
1260 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1261 {
1262     if (!is_utf8_char(p))
1263         return FALSE;
1264     if (!PL_utf8_alnum)
1265         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1266     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1267 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1268 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1269     if (!PL_utf8_alnum)
1270         PL_utf8_alnum = swash_init("utf8", "",
1271             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1272     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1273 #endif
1274 }
1275
1276 bool
1277 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1278 {
1279     if (*p == '_')
1280         return TRUE;
1281     if (!is_utf8_char(p))
1282         return FALSE;
1283     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1284         PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1285     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1286 }
1287
1288 bool
1289 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1290 {
1291     if (*p == '_')
1292         return TRUE;
1293     if (!is_utf8_char(p))
1294         return FALSE;
1295     if (!PL_utf8_idcont)
1296         PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1297     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1298 }
1299
1300 bool
1301 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1302 {
1303     if (!is_utf8_char(p))
1304         return FALSE;
1305     if (!PL_utf8_alpha)
1306         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1307     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1308 }
1309
1310 bool
1311 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1312 {
1313     if (!is_utf8_char(p))
1314         return FALSE;
1315     if (!PL_utf8_ascii)
1316         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1317     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1318 }
1319
1320 bool
1321 Perl_is_utf8_space(pTHX_ const U8 *p)
1322 {
1323     if (!is_utf8_char(p))
1324         return FALSE;
1325     if (!PL_utf8_space)
1326         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1327     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1328 }
1329
1330 bool
1331 Perl_is_utf8_digit(pTHX_ const U8 *p)
1332 {
1333     if (!is_utf8_char(p))
1334         return FALSE;
1335     if (!PL_utf8_digit)
1336         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1337     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1338 }
1339
1340 bool
1341 Perl_is_utf8_upper(pTHX_ const U8 *p)
1342 {
1343     if (!is_utf8_char(p))
1344         return FALSE;
1345     if (!PL_utf8_upper)
1346         PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1347     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1348 }
1349
1350 bool
1351 Perl_is_utf8_lower(pTHX_ const U8 *p)
1352 {
1353     if (!is_utf8_char(p))
1354         return FALSE;
1355     if (!PL_utf8_lower)
1356         PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1357     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1358 }
1359
1360 bool
1361 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1362 {
1363     if (!is_utf8_char(p))
1364         return FALSE;
1365     if (!PL_utf8_cntrl)
1366         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1367     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1368 }
1369
1370 bool
1371 Perl_is_utf8_graph(pTHX_ const U8 *p)
1372 {
1373     if (!is_utf8_char(p))
1374         return FALSE;
1375     if (!PL_utf8_graph)
1376         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1377     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1378 }
1379
1380 bool
1381 Perl_is_utf8_print(pTHX_ const U8 *p)
1382 {
1383     if (!is_utf8_char(p))
1384         return FALSE;
1385     if (!PL_utf8_print)
1386         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1387     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1388 }
1389
1390 bool
1391 Perl_is_utf8_punct(pTHX_ const U8 *p)
1392 {
1393     if (!is_utf8_char(p))
1394         return FALSE;
1395     if (!PL_utf8_punct)
1396         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1397     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1398 }
1399
1400 bool
1401 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1402 {
1403     if (!is_utf8_char(p))
1404         return FALSE;
1405     if (!PL_utf8_xdigit)
1406         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1407     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1408 }
1409
1410 bool
1411 Perl_is_utf8_mark(pTHX_ const U8 *p)
1412 {
1413     if (!is_utf8_char(p))
1414         return FALSE;
1415     if (!PL_utf8_mark)
1416         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1417     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1418 }
1419
1420 /*
1421 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1422
1423 The "p" contains the pointer to the UTF-8 string encoding
1424 the character that is being converted.
1425
1426 The "ustrp" is a pointer to the character buffer to put the
1427 conversion result to.  The "lenp" is a pointer to the length
1428 of the result.
1429
1430 The "swashp" is a pointer to the swash to use.
1431
Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special mapping
(usually, but not always, a multicharacter mapping) is tried first.
1435
1436 The "special" is a string like "utf8::ToSpecLower", which means the
1437 hash %utf8::ToSpecLower.  The access to the hash is through
1438 Perl_to_utf8_case().
1439
1440 The "normal" is a string like "ToLower" which means the swash
1441 %utf8::ToLower.
1442
1443 =cut */
1444
1445 UV
1446 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1447                         SV **swashp, const char *normal, const char *special)
1448 {
1449     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1450     STRLEN len = 0;
1451
1452     const UV uv0 = utf8_to_uvchr(p, NULL);
1453     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1454      * are necessary in EBCDIC, they are redundant no-ops
1455      * in ASCII-ish platforms, and hopefully optimized away. */
1456     const UV uv1 = NATIVE_TO_UNI(uv0);
1457     uvuni_to_utf8(tmpbuf, uv1);
1458
1459     if (!*swashp) /* load on-demand */
1460          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1461
1462     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1463     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1464          /* It might be "special" (sometimes, but not always,
1465           * a multicharacter mapping) */
1466          HV *hv;
1467          SV **svp;
1468
1469          if ((hv  = get_hv(special, FALSE)) &&
1470              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1471              (*svp)) {
1472              const char *s;
1473
1474               s = SvPV_const(*svp, len);
1475               if (len == 1)
1476                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1477               else {
1478 #ifdef EBCDIC
1479                    /* If we have EBCDIC we need to remap the characters
1480                     * since any characters in the low 256 are Unicode
1481                     * code points, not EBCDIC. */
1482                    U8 *t = (U8*)s, *tend = t + len, *d;
1483
1484                    d = tmpbuf;
1485                    if (SvUTF8(*svp)) {
1486                         STRLEN tlen = 0;
1487
1488                         while (t < tend) {
1489                              UV c = utf8_to_uvchr(t, &tlen);
1490                              if (tlen > 0) {
1491                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1492                                   t += tlen;
1493                              }
1494                              else
1495                                   break;
1496                         }
1497                    }
1498                    else {
1499                         while (t < tend) {
1500                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1501                              t++;
1502                         }
1503                    }
1504                    len = d - tmpbuf;
1505                    Copy(tmpbuf, ustrp, len, U8);
1506 #else
1507                    Copy(s, ustrp, len, U8);
1508 #endif
1509               }
1510          }
1511     }
1512
1513     if (!len && *swashp) {
1514          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1515
1516          if (uv2) {
1517               /* It was "normal" (a single character mapping). */
1518               UV uv3 = UNI_TO_NATIVE(uv2);
1519
1520               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1521          }
1522     }
1523
1524     if (!len) /* Neither: just copy. */
1525          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1526
1527     if (lenp)
1528          *lenp = len;
1529
1530     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1531 }
1532
1533 /*
1534 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1535
1536 Convert the UTF-8 encoded character at p to its uppercase version and
1537 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1538 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1539 the uppercase version may be longer than the original character.
1540
1541 The first character of the uppercased version is returned
1542 (but note, as explained above, that there may be more.)
1543
1544 =cut */
1545
1546 UV
1547 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1548 {
1549     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1550                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1551 }
1552
1553 /*
1554 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1555
1556 Convert the UTF-8 encoded character at p to its titlecase version and
1557 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1558 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1559 titlecase version may be longer than the original character.
1560
1561 The first character of the titlecased version is returned
1562 (but note, as explained above, that there may be more.)
1563
1564 =cut */
1565
1566 UV
1567 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1568 {
1569     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1570                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1571 }
1572
1573 /*
1574 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1575
1576 Convert the UTF-8 encoded character at p to its lowercase version and
1577 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1578 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1579 lowercase version may be longer than the original character.
1580
1581 The first character of the lowercased version is returned
1582 (but note, as explained above, that there may be more.)
1583
1584 =cut */
1585
1586 UV
1587 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1588 {
1589     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1590                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1591 }
1592
1593 /*
1594 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1595
1596 Convert the UTF-8 encoded character at p to its foldcase version and
1597 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1598 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1599 foldcase version may be longer than the original character (up to
1600 three characters).
1601
1602 The first character of the foldcased version is returned
1603 (but note, as explained above, that there may be more.)
1604
1605 =cut */
1606
1607 UV
1608 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1609 {
1610     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1611                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1612 }
1613
1614 /* a "swash" is a swatch hash */
1615
1616 SV*
1617 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1618 {
1619     dVAR;
1620     SV* retval;
1621     SV* const tokenbufsv = sv_newmortal();
1622     dSP;
1623     const size_t pkg_len = strlen(pkg);
1624     const size_t name_len = strlen(name);
1625     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1626     SV* errsv_save;
1627
1628     PUSHSTACKi(PERLSI_MAGIC);
1629     ENTER;
1630     SAVEI32(PL_hints);
1631     PL_hints = 0;
1632     save_re_context();
1633     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1634         ENTER;
1635         errsv_save = newSVsv(ERRSV);
1636         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1637                          Nullsv);
1638         if (!SvTRUE(ERRSV))
1639             sv_setsv(ERRSV, errsv_save);
1640         SvREFCNT_dec(errsv_save);
1641         LEAVE;
1642     }
1643     SPAGAIN;
1644     PUSHMARK(SP);
1645     EXTEND(SP,5);
1646     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1647     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1648     PUSHs(listsv);
1649     PUSHs(sv_2mortal(newSViv(minbits)));
1650     PUSHs(sv_2mortal(newSViv(none)));
1651     PUTBACK;
1652     if (IN_PERL_COMPILETIME) {
1653         /* XXX ought to be handled by lex_start */
1654         SAVEI32(PL_in_my);
1655         PL_in_my = 0;
1656         sv_setpv(tokenbufsv, PL_tokenbuf);
1657     }
1658     errsv_save = newSVsv(ERRSV);
1659     if (call_method("SWASHNEW", G_SCALAR))
1660         retval = newSVsv(*PL_stack_sp--);
1661     else
1662         retval = &PL_sv_undef;
1663     if (!SvTRUE(ERRSV))
1664         sv_setsv(ERRSV, errsv_save);
1665     SvREFCNT_dec(errsv_save);
1666     LEAVE;
1667     POPSTACK;
1668     if (IN_PERL_COMPILETIME) {
1669         STRLEN len;
1670         const char* const pv = SvPV_const(tokenbufsv, len);
1671
1672         Copy(pv, PL_tokenbuf, len+1, char);
1673         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1674     }
1675     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1676         if (SvPOK(retval))
1677             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1678                        retval);
1679         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1680     }
1681     return retval;
1682 }
1683
1684
1685 /* This API is wrong for special case conversions since we may need to
1686  * return several Unicode characters for a single Unicode character
1687  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1688  * the lower-level routine, and it is similarly broken for returning
1689  * multiple values.  --jhi */
1690 UV
1691 Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
1692 {
1693     dVAR;
1694     HV* const hv = (HV*)SvRV(sv);
1695     U32 klen;
1696     U32 off;
1697     STRLEN slen;
1698     STRLEN needents;
1699     const U8 *tmps = NULL;
1700     U32 bit;
1701     SV *retval;
1702     U8 tmputf8[2];
1703     UV c = NATIVE_TO_ASCII(*ptr);
1704
1705     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1706         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1707         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1708         ptr = tmputf8;
1709     }
1710     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1711      * then the "swatch" is a vec() for al the chars which start
1712      * with 0xAA..0xYY
1713      * So the key in the hash (klen) is length of encoded char -1
1714      */
1715     klen = UTF8SKIP(ptr) - 1;
1716     off  = ptr[klen];
1717
1718     if (klen == 0)
1719      {
1720       /* If char in invariant then swatch is for all the invariant chars
1721        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1722        */
1723       needents = UTF_CONTINUATION_MARK;
1724       off      = NATIVE_TO_UTF(ptr[klen]);
1725      }
1726     else
1727      {
1728       /* If char is encoded then swatch is for the prefix */
1729       needents = (1 << UTF_ACCUMULATION_SHIFT);
1730       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1731      }
1732
1733     /*
1734      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1735      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1736      * it's nothing to sniff at.)  Pity we usually come through at least
1737      * two function calls to get here...
1738      *
1739      * NB: this code assumes that swatches are never modified, once generated!
1740      */
1741
1742     if (hv   == PL_last_swash_hv &&
1743         klen == PL_last_swash_klen &&
1744         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1745     {
1746         tmps = PL_last_swash_tmps;
1747         slen = PL_last_swash_slen;
1748     }
1749     else {
1750         /* Try our second-level swatch cache, kept in a hash. */
1751         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1752
1753         /* If not cached, generate it via utf8::SWASHGET */
1754         if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1755             dSP;
1756             /* We use utf8n_to_uvuni() as we want an index into
1757                Unicode tables, not a native character number.
1758              */
1759             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1760                                            ckWARN(WARN_UTF8) ?
1761                                            0 : UTF8_ALLOW_ANY);
1762             SV *errsv_save;
1763             ENTER;
1764             SAVETMPS;
1765             save_re_context();
1766             PUSHSTACKi(PERLSI_MAGIC);
1767             PUSHMARK(SP);
1768             EXTEND(SP,3);
1769             PUSHs((SV*)sv);
1770             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1771             PUSHs(sv_2mortal(newSViv((klen) ?
1772                                      (code_point & ~(needents - 1)) : 0)));
1773             PUSHs(sv_2mortal(newSViv(needents)));
1774             PUTBACK;
1775             errsv_save = newSVsv(ERRSV);
1776             if (call_method("SWASHGET", G_SCALAR))
1777                 retval = newSVsv(*PL_stack_sp--);
1778             else
1779                 retval = &PL_sv_undef;
1780             if (!SvTRUE(ERRSV))
1781                 sv_setsv(ERRSV, errsv_save);
1782             SvREFCNT_dec(errsv_save);
1783             POPSTACK;
1784             FREETMPS;
1785             LEAVE;
1786             if (IN_PERL_COMPILETIME)
1787                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1788
1789             svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
1790
1791             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1792                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1793         }
1794
1795         PL_last_swash_hv = hv;
1796         PL_last_swash_klen = klen;
1797         /* FIXME change interpvar.h?  */
1798         PL_last_swash_tmps = (U8 *) tmps;
1799         PL_last_swash_slen = slen;
1800         if (klen)
1801             Copy(ptr, PL_last_swash_key, klen, U8);
1802     }
1803
1804     switch ((int)((slen << 3) / needents)) {
1805     case 1:
1806         bit = 1 << (off & 7);
1807         off >>= 3;
1808         return (tmps[off] & bit) != 0;
1809     case 8:
1810         return tmps[off];
1811     case 16:
1812         off <<= 1;
1813         return (tmps[off] << 8) + tmps[off + 1] ;
1814     case 32:
1815         off <<= 2;
1816         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1817     }
1818     Perl_croak(aTHX_ "panic: swash_fetch");
1819     return 0;
1820 }
1821
1822 U8 *
1823 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1824 {
1825     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1826 }
1827
1828 /*
1829 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1830
1831 Build to the scalar dsv a displayable version of the string spv,
1832 length len, the displayable version being at most pvlim bytes long
1833 (if longer, the rest is truncated and "..." will be appended).
1834
1835 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1836 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1837 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1838 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1839 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1840 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1841
1842 The pointer to the PV of the dsv is returned.
1843
1844 =cut */
1845 char *
1846 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1847 {
1848     int truncated = 0;
1849     const char *s, *e;
1850
1851     sv_setpvn(dsv, "", 0);
1852     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1853          UV u;
1854           /* This serves double duty as a flag and a character to print after
1855              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1856           */
1857          char ok = 0;
1858
1859          if (pvlim && SvCUR(dsv) >= pvlim) {
1860               truncated++;
1861               break;
1862          }
1863          u = utf8_to_uvchr((U8*)s, 0);
1864          if (u < 256) {
1865              const unsigned char c = (unsigned char)u & 0xFF;
1866              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1867                  switch (c) {
1868                  case '\n':
1869                      ok = 'n'; break;
1870                  case '\r':
1871                      ok = 'r'; break;
1872                  case '\t':
1873                      ok = 't'; break;
1874                  case '\f':
1875                      ok = 'f'; break;
1876                  case '\a':
1877                      ok = 'a'; break;
1878                  case '\\':
1879                      ok = '\\'; break;
1880                  default: break;
1881                  }
1882                  if (ok) {
1883                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1884                  }
1885              }
1886              /* isPRINT() is the locale-blind version. */
1887              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1888                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1889                  ok = 1;
1890              }
1891          }
1892          if (!ok)
1893              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1894     }
1895     if (truncated)
1896          sv_catpvn(dsv, "...", 3);
1897
1898     return SvPVX(dsv);
1899 }
1900
1901 /*
1902 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1903
Build to the scalar dsv a displayable version of the scalar ssv,
1905 the displayable version being at most pvlim bytes long
1906 (if longer, the rest is truncated and "..." will be appended).
1907
1908 The flags argument is as in pv_uni_display().
1909
1910 The pointer to the PV of the dsv is returned.
1911
1912 =cut */
1913 char *
1914 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1915 {
1916      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1917                                 SvCUR(ssv), pvlim, flags);
1918 }
1919
1920 /*
1921 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1922
1923 Return true if the strings s1 and s2 differ case-insensitively, false
1924 if not (if they are equal case-insensitively).  If u1 is true, the
1925 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1926 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1927 are false, the respective string is assumed to be in native 8-bit
1928 encoding.
1929
1930 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1931 in there (they will point at the beginning of the I<next> character).
1932 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1933 pointers beyond which scanning will not continue under any
1934 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1935 s2+l2 will be used as goal end pointers that will also stop the scan,
1936 and which qualify towards defining a successful match: all the scans
1937 that define an explicit length must reach their goal pointers for
1938 a match to succeed).
1939
1940 For case-insensitiveness, the "casefolding" of Unicode is used
1941 instead of upper/lowercasing both the characters, see
1942 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1943
1944 =cut */
1945 I32
1946 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1947 {
1948      register const U8 *p1  = (const U8*)s1;
1949      register const U8 *p2  = (const U8*)s2;
1950      register const U8 *f1 = 0, *f2 = 0;
1951      register U8 *e1 = 0, *q1 = 0;
1952      register U8 *e2 = 0, *q2 = 0;
1953      STRLEN n1 = 0, n2 = 0;
1954      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1955      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
1956      U8 natbuf[1+1];
1957      STRLEN foldlen1, foldlen2;
1958      bool match;
1959
1960      if (pe1)
1961           e1 = *(U8**)pe1;
1962      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1963           f1 = (const U8*)s1 + l1;
1964      if (pe2)
1965           e2 = *(U8**)pe2;
1966      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1967           f2 = (const U8*)s2 + l2;
1968
1969      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1970           return 1; /* mismatch; possible infinite loop or false positive */
1971
1972      if (!u1 || !u2)
1973           natbuf[1] = 0; /* Need to terminate the buffer. */
1974
1975      while ((e1 == 0 || p1 < e1) &&
1976             (f1 == 0 || p1 < f1) &&
1977             (e2 == 0 || p2 < e2) &&
1978             (f2 == 0 || p2 < f2)) {
1979           if (n1 == 0) {
1980                if (u1)
1981                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1982                else {
1983                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
1984                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1985                }
1986                q1 = foldbuf1;
1987                n1 = foldlen1;
1988           }
1989           if (n2 == 0) {
1990                if (u2)
1991                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1992                else {
1993                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
1994                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1995                }
1996                q2 = foldbuf2;
1997                n2 = foldlen2;
1998           }
1999           while (n1 && n2) {
2000                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2001                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2002                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2003                    return 1; /* mismatch */
2004                n1 -= UTF8SKIP(q1);
2005                q1 += UTF8SKIP(q1);
2006                n2 -= UTF8SKIP(q2);
2007                q2 += UTF8SKIP(q2);
2008           }
2009           if (n1 == 0)
2010                p1 += u1 ? UTF8SKIP(p1) : 1;
2011           if (n2 == 0)
2012                p2 += u2 ? UTF8SKIP(p2) : 1;
2013
2014      }
2015
2016      /* A match is defined by all the scans that specified
2017       * an explicit length reaching their final goals. */
2018      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2019
2020      if (match) {
2021           if (pe1)
2022                *pe1 = (char*)p1;
2023           if (pe2)
2024                *pe2 = (char*)p2;
2025      }
2026
2027      return match ? 0 : 1; /* 0 match, 1 mismatch */
2028 }
2029
2030 /*
2031  * Local variables:
2032  * c-indentation-style: bsd
2033  * c-basic-offset: 4
2034  * indent-tabs-mode: t
2035  * End:
2036  *
2037  * ex: set ts=8 sts=4 sw=4 noet:
2038  */