utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
   4  *    others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 U8 *
 171 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 172 {
 173     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 174 }
 175
 176 /*
 177
 178 Tests if some arbitrary number of bytes begins in a valid UTF-8
 179 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 180 UTF-8 character.  The actual number of bytes in the UTF-8 character
 181 will be returned if it is valid, otherwise 0.
 182
 183 This is the "slow" version as opposed to the "fast" version which is
 184 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 185 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 186 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 187 you should use the _slow().  In practice this means that the _slow()
 188 will be used very rarely, since the maximum Unicode code point (as of
 189 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 190 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 191 five bytes or more.
 192
 193 =cut */
 194 STATIC STRLEN
 195 S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
 196 {
 197     U8 u = *s;
 198     STRLEN slen;
 199     UV uv, ouv;
 200
 201     if (UTF8_IS_INVARIANT(u))
 202         return 1;
 203
 204     if (!UTF8_IS_START(u))
 205         return 0;
 206
 207     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 208         return 0;
 209
 210     slen = len - 1;
 211     s++;
 212 #ifdef EBCDIC
 213     u = NATIVE_TO_UTF(u);
 214 #endif
 215     u &= UTF_START_MASK(len);
 216     uv  = u;
 217     ouv = uv;
 218     while (slen--) {
 219         if (!UTF8_IS_CONTINUATION(*s))
 220             return 0;
 221         uv = UTF8_ACCUMULATE(uv, *s);
 222         if (uv < ouv)
 223             return 0;
 224         ouv = uv;
 225         s++;
 226     }
 227
 228     if ((STRLEN)UNISKIP(uv) < len)
 229         return 0;
 230
 231     return len;
 232 }
 233
 234 /*
 235 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 236
 237 Tests if some arbitrary number of bytes begins in a valid UTF-8
 238 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 239 UTF-8 character.  The actual number of bytes in the UTF-8 character
 240 will be returned if it is valid, otherwise 0.
 241
 242 =cut */
 243 STRLEN
 244 Perl_is_utf8_char(pTHX_ const U8 *s)
 245 {
 246     const STRLEN len = UTF8SKIP(s);
 247 #ifdef IS_UTF8_CHAR
 248     if (IS_UTF8_CHAR_FAST(len))
 249         return IS_UTF8_CHAR(s, len) ? len : 0;
 250 #endif /* #ifdef IS_UTF8_CHAR */
 251     return is_utf8_char_slow(s, len);
 252 }
 253
 254 /*
 255 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 256
 257 Returns true if first C<len> bytes of the given string form a valid
 258 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 259 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 260 because a valid ASCII string is a valid UTF-8 string.
 261
 262 See also is_utf8_string_loclen() and is_utf8_string_loc().
 263
 264 =cut
 265 */
 266
 267 bool
 268 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 269 {
 270     const U8* x = s;
 271     const U8* send;
 272
 273     if (!len)
 274         len = strlen((const char *)s);
 275     send = s + len;
 276
 277     while (x < send) {
 278         STRLEN c;
 279          /* Inline the easy bits of is_utf8_char() here for speed... */
 280          if (UTF8_IS_INVARIANT(*x))
 281               c = 1;
 282          else if (!UTF8_IS_START(*x))
 283              goto out;
 284          else {
 285               /* ... and call is_utf8_char() only if really needed. */
 286 #ifdef IS_UTF8_CHAR
 287              c = UTF8SKIP(x);
 288              if (IS_UTF8_CHAR_FAST(c)) {
 289                  if (!IS_UTF8_CHAR(x, c))
 290                      goto out;
 291              } else if (!is_utf8_char_slow(x, c))
 292                  goto out;
 293 #else
 294              c = is_utf8_char(x);
 295 #endif /* #ifdef IS_UTF8_CHAR */
 296               if (!c)
 297                   goto out;
 298          }
 299         x += c;
 300     }
 301
 302  out:
 303     if (x != send)
 304         return FALSE;
 305
 306     return TRUE;
 307 }
 308
 309 /*
 310 Implemented as a macro in utf8.h
 311
 312 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 313
 314 Like is_utf8_string() but stores the location of the failure (in the
 315 case of "utf8ness failure") or the location s+len (in the case of
 316 "utf8ness success") in the C<ep>.
 317
 318 See also is_utf8_string_loclen() and is_utf8_string().
 319
 320 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 321
 322 Like is_utf8_string() but stores the location of the failure (in the
 323 case of "utf8ness failure") or the location s+len (in the case of
 324 "utf8ness success") in the C<ep>, and the number of UTF-8
 325 encoded characters in the C<el>.
 326
 327 See also is_utf8_string_loc() and is_utf8_string().
 328
 329 =cut
 330 */
 331
 332 bool
 333 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 334 {
 335     const U8* x = s;
 336     const U8* send;
 337     STRLEN c;
 338
 339     if (!len)
 340         len = strlen((const char *)s);
 341     send = s + len;
 342     if (el)
 343         *el = 0;
 344
 345     while (x < send) {
 346          /* Inline the easy bits of is_utf8_char() here for speed... */
 347          if (UTF8_IS_INVARIANT(*x))
 348              c = 1;
 349          else if (!UTF8_IS_START(*x))
 350              goto out;
 351          else {
 352              /* ... and call is_utf8_char() only if really needed. */
 353 #ifdef IS_UTF8_CHAR
 354              c = UTF8SKIP(x);
 355              if (IS_UTF8_CHAR_FAST(c)) {
 356                  if (!IS_UTF8_CHAR(x, c))
 357                      c = 0;
 358              } else
 359                  c = is_utf8_char_slow(x, c);
 360 #else
 361              c = is_utf8_char(x);
 362 #endif /* #ifdef IS_UTF8_CHAR */
 363              if (!c)
 364                  goto out;
 365          }
 366          x += c;
 367          if (el)
 368              (*el)++;
 369     }
 370
 371  out:
 372     if (ep)
 373         *ep = x;
 374     if (x != send)
 375         return FALSE;
 376
 377     return TRUE;
 378 }
 379
 380 /*
 381
 382 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 383
 384 Bottom level UTF-8 decode routine.
 385 Returns the unicode code point value of the first character in the string C<s>
 386 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 387 C<retlen> will be set to the length, in bytes, of that character.
 388
 389 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 390 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 391 it is assumed that the caller will raise a warning, and this function
 392 will silently just set C<retlen> to C<-1> and return zero.  If the
 393 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 394 malformations will be given, C<retlen> will be set to the expected
 395 length of the UTF-8 character in bytes, and zero will be returned.
 396
 397 The C<flags> can also contain various flags to allow deviations from
 398 the strict UTF-8 encoding (see F<utf8.h>).
 399
 400 Most code should use utf8_to_uvchr() rather than call this directly.
 401
 402 =cut
 403 */
 404
 405 UV
 406 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 407 {
 408     const U8 *s0 = s;
 409     UV uv = *s, ouv = 0;
 410     STRLEN len = 1;
 411     const bool dowarn = ckWARN_d(WARN_UTF8);
 412     const UV startbyte = *s;
 413     STRLEN expectlen = 0;
 414     U32 warning = 0;
 415
 416 /* This list is a superset of the UTF8_ALLOW_XXX. */
 417
 418 #define UTF8_WARN_EMPTY                          1
 419 #define UTF8_WARN_CONTINUATION                   2
 420 #define UTF8_WARN_NON_CONTINUATION               3
 421 #define UTF8_WARN_FE_FF                          4
 422 #define UTF8_WARN_SHORT                          5
 423 #define UTF8_WARN_OVERFLOW                       6
 424 #define UTF8_WARN_SURROGATE                      7
 425 #define UTF8_WARN_LONG                           8
 426 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 427
 428     if (curlen == 0 &&
 429         !(flags & UTF8_ALLOW_EMPTY)) {
 430         warning = UTF8_WARN_EMPTY;
 431         goto malformed;
 432     }
 433
 434     if (UTF8_IS_INVARIANT(uv)) {
 435         if (retlen)
 436             *retlen = 1;
 437         return (UV) (NATIVE_TO_UTF(*s));
 438     }
 439
 440     if (UTF8_IS_CONTINUATION(uv) &&
 441         !(flags & UTF8_ALLOW_CONTINUATION)) {
 442         warning = UTF8_WARN_CONTINUATION;
 443         goto malformed;
 444     }
 445
 446     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 447         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 448         warning = UTF8_WARN_NON_CONTINUATION;
 449         goto malformed;
 450     }
 451
 452 #ifdef EBCDIC
 453     uv = NATIVE_TO_UTF(uv);
 454 #else
 455     if ((uv == 0xfe || uv == 0xff) &&
 456         !(flags & UTF8_ALLOW_FE_FF)) {
 457         warning = UTF8_WARN_FE_FF;
 458         goto malformed;
 459     }
 460 #endif
 461
 462     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 463     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 464     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 465     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 466 #ifdef EBCDIC
 467     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 468     else                        { len =  7; uv &= 0x01; }
 469 #else
 470     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 471     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 472     else                        { len = 13; uv = 0; } /* whoa! */
 473 #endif
 474
 475     if (retlen)
 476         *retlen = len;
 477
 478     expectlen = len;
 479
 480     if ((curlen < expectlen) &&
 481         !(flags & UTF8_ALLOW_SHORT)) {
 482         warning = UTF8_WARN_SHORT;
 483         goto malformed;
 484     }
 485
 486     len--;
 487     s++;
 488     ouv = uv;
 489
 490     while (len--) {
 491         if (!UTF8_IS_CONTINUATION(*s) &&
 492             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 493             s--;
 494             warning = UTF8_WARN_NON_CONTINUATION;
 495             goto malformed;
 496         }
 497         else
 498             uv = UTF8_ACCUMULATE(uv, *s);
 499         if (!(uv > ouv)) {
 500             /* These cannot be allowed. */
 501             if (uv == ouv) {
 502                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 503                     warning = UTF8_WARN_LONG;
 504                     goto malformed;
 505                 }
 506             }
 507             else { /* uv < ouv */
 508                 /* This cannot be allowed. */
 509                 warning = UTF8_WARN_OVERFLOW;
 510                 goto malformed;
 511             }
 512         }
 513         s++;
 514         ouv = uv;
 515     }
 516
 517     if (UNICODE_IS_SURROGATE(uv) &&
 518         !(flags & UTF8_ALLOW_SURROGATE)) {
 519         warning = UTF8_WARN_SURROGATE;
 520         goto malformed;
 521     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 522                !(flags & UTF8_ALLOW_LONG)) {
 523         warning = UTF8_WARN_LONG;
 524         goto malformed;
 525     } else if (UNICODE_IS_ILLEGAL(uv) &&
 526                !(flags & UTF8_ALLOW_FFFF)) {
 527         warning = UTF8_WARN_FFFF;
 528         goto malformed;
 529     }
 530
 531     return uv;
 532
 533 malformed:
 534
 535     if (flags & UTF8_CHECK_ONLY) {
 536         if (retlen)
 537             *retlen = -1;
 538         return 0;
 539     }
 540
 541     if (dowarn) {
 542         SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 543
 544         switch (warning) {
 545         case 0: /* Intentionally empty. */ break;
 546         case UTF8_WARN_EMPTY:
 547             Perl_sv_catpv(aTHX_ sv, "(empty string)");
 548             break;
 549         case UTF8_WARN_CONTINUATION:
 550             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 551             break;
 552         case UTF8_WARN_NON_CONTINUATION:
 553             if (s == s0)
 554                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 555                            (UV)s[1], startbyte);
 556             else {
 557                 const int len = (int)(s-s0);
 558                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 559                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 560             }
 561
 562             break;
 563         case UTF8_WARN_FE_FF:
 564             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 565             break;
 566         case UTF8_WARN_SHORT:
 567             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 568                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 569             expectlen = curlen;         /* distance for caller to skip */
 570             break;
 571         case UTF8_WARN_OVERFLOW:
 572             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 573                            ouv, *s, startbyte);
 574             break;
 575         case UTF8_WARN_SURROGATE:
 576             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 577             break;
 578         case UTF8_WARN_LONG:
 579             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 580                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 581             break;
 582         case UTF8_WARN_FFFF:
 583             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 584             break;
 585         default:
 586             Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
 587             break;
 588         }
 589
 590         if (warning) {
 591             const char * const s = SvPVX_const(sv);
 592
 593             if (PL_op)
 594                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 595                             "%s in %s", s,  OP_DESC(PL_op));
 596             else
 597                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 598         }
 599     }
 600
 601     if (retlen)
 602         *retlen = expectlen ? expectlen : len;
 603
 604     return 0;
 605 }
 606
 607 /*
 608 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 609
 610 Returns the native character value of the first character in the string C<s>
 611 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 612 length, in bytes, of that character.
 613
 614 If C<s> does not point to a well-formed UTF-8 character, zero is
 615 returned and retlen is set, if possible, to -1.
 616
 617 =cut
 618 */
 619
 620 UV
 621 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 622 {
 623     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
 624                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 625 }
 626
 627 /*
 628 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 629
 630 Returns the Unicode code point of the first character in the string C<s>
 631 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 632 length, in bytes, of that character.
 633
 634 This function should only be used when returned UV is considered
 635 an index into the Unicode semantic tables (e.g. swashes).
 636
 637 If C<s> does not point to a well-formed UTF-8 character, zero is
 638 returned and retlen is set, if possible, to -1.
 639
 640 =cut
 641 */
 642
 643 UV
 644 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 645 {
 646     /* Call the low level routine asking for checks */
 647     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 648                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 649 }
 650
 651 /*
 652 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 653
 654 Return the length of the UTF-8 char encoded string C<s> in characters.
 655 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 656 up past C<e>, croaks.
 657
 658 =cut
 659 */
 660
 661 STRLEN
 662 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 663 {
 664     STRLEN len = 0;
 665
 666     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 667      * the bitops (especially ~) can create illegal UTF-8.
 668      * In other words: in Perl UTF-8 is not just for Unicode. */
 669
 670     if (e < s)
 671         goto warn_and_return;
 672     while (s < e) {
 673         const U8 t = UTF8SKIP(s);
 674         if (e - s < t) {
 675             warn_and_return:
 676             if (ckWARN_d(WARN_UTF8)) {
 677                 if (PL_op)
 678                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 679                             "%s in %s", unees, OP_DESC(PL_op));
 680                 else
 681                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 682             }
 683             return len;
 684         }
 685         s += t;
 686         len++;
 687     }
 688
 689     return len;
 690 }
 691
 692 /*
 693 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 694
 695 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 696 and C<b>.
 697
 698 WARNING: use only if you *know* that the pointers point inside the
 699 same UTF-8 buffer.
 700
 701 =cut
 702 */
 703
 704 IV
 705 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 706 {
 707     IV off = 0;
 708
 709     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 710      * the bitops (especially ~) can create illegal UTF-8.
 711      * In other words: in Perl UTF-8 is not just for Unicode. */
 712
 713     if (a < b) {
 714         while (a < b) {
 715             const U8 c = UTF8SKIP(a);
 716             if (b - a < c)
 717                 goto warn_and_return;
 718             a += c;
 719             off--;
 720         }
 721     }
 722     else {
 723         while (b < a) {
 724             const U8 c = UTF8SKIP(b);
 725
 726             if (a - b < c) {
 727                 warn_and_return:
 728                 if (ckWARN_d(WARN_UTF8)) {
 729                     if (PL_op)
 730                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 731                                     "%s in %s", unees, OP_DESC(PL_op));
 732                     else
 733                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 734                 }
 735                 return off;
 736             }
 737             b += c;
 738             off++;
 739         }
 740     }
 741
 742     return off;
 743 }
 744
 745 /*
 746 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 747
 748 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 749 forward or backward.
 750
 751 WARNING: do not use the following unless you *know* C<off> is within
 752 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 753 on the first byte of character or just after the last byte of a character.
 754
 755 =cut
 756 */
 757
 758 U8 *
 759 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 760 {
 761     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 762      * the bitops (especially ~) can create illegal UTF-8.
 763      * In other words: in Perl UTF-8 is not just for Unicode. */
 764
 765     if (off >= 0) {
 766         while (off--)
 767             s += UTF8SKIP(s);
 768     }
 769     else {
 770         while (off++) {
 771             s--;
 772             while (UTF8_IS_CONTINUATION(*s))
 773                 s--;
 774         }
 775     }
 776     return (U8 *)s;
 777 }
 778
 779 /*
 780 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 781
 782 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 783 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 784 updates len to contain the new length.
 785 Returns zero on failure, setting C<len> to -1.
 786
 787 =cut
 788 */
 789
 790 U8 *
 791 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 792 {
 793     U8 *send;
 794     U8 *d;
 795     U8 *save = s;
 796
 797     /* ensure valid UTF-8 and chars < 256 before updating string */
 798     for (send = s + *len; s < send; ) {
 799         U8 c = *s++;
 800
 801         if (!UTF8_IS_INVARIANT(c) &&
 802             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 803              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 804             *len = -1;
 805             return 0;
 806         }
 807     }
 808
 809     d = s = save;
 810     while (s < send) {
 811         STRLEN ulen;
 812         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 813         s += ulen;
 814     }
 815     *d = '\0';
 816     *len = d - save;
 817     return save;
 818 }
 819
 820 /*
 821 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 822
 823 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 824 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 825 the newly-created string, and updates C<len> to contain the new
 826 length.  Returns the original string if no conversion occurs, C<len>
 827 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 828 0 if C<s> is converted or contains all 7bit characters.
 829
 830 =cut
 831 */
 832
 833 U8 *
 834 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 835 {
 836     U8 *d;
 837     const U8 *start = s;
 838     const U8 *send;
 839     I32 count = 0;
 840
 841     if (!*is_utf8)
 842         return (U8 *)start;
 843
 844     /* ensure valid UTF-8 and chars < 256 before converting string */
 845     for (send = s + *len; s < send;) {
 846         U8 c = *s++;
 847         if (!UTF8_IS_INVARIANT(c)) {
 848             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 849                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 850                 count++;
 851             else
 852                 return (U8 *)start;
 853         }
 854     }
 855
 856     *is_utf8 = 0;
 857
 858     Newxz(d, (*len) - count + 1, U8);
 859     s = start; start = d;
 860     while (s < send) {
 861         U8 c = *s++;
 862         if (!UTF8_IS_INVARIANT(c)) {
 863             /* Then it is two-byte encoded */
 864             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 865             c = ASCII_TO_NATIVE(c);
 866         }
 867         *d++ = c;
 868     }
 869     *d = '\0';
 870     *len = d - start;
 871     return (U8 *)start;
 872 }
 873
 874 /*
 875 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 876
 877 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 878 Returns a pointer to the newly-created string, and sets C<len> to
 879 reflect the new length.
 880
 881 If you want to convert to UTF-8 from other encodings than ASCII,
 882 see sv_recode_to_utf8().
 883
 884 =cut
 885 */
 886
 887 U8*
 888 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 889 {
 890     const U8 * const send = s + (*len);
 891     U8 *d;
 892     U8 *dst;
 893
 894     Newxz(d, (*len) * 2 + 1, U8);
 895     dst = d;
 896
 897     while (s < send) {
 898         const UV uv = NATIVE_TO_ASCII(*s++);
 899         if (UNI_IS_INVARIANT(uv))
 900             *d++ = (U8)UTF_TO_NATIVE(uv);
 901         else {
 902             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 903             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 904         }
 905     }
 906     *d = '\0';
 907     *len = d-dst;
 908     return dst;
 909 }
 910
 911 /*
 912  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 913  *
 914  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 915  * We optimize for native, for obvious reasons. */
 916
 917 U8*
 918 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 919 {
 920     U8* pend;
 921     U8* dstart = d;
 922
 923     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 924          d[0] = 0;
 925          *newlen = 1;
 926          return d;
 927     }
 928
 929     if (bytelen & 1)
 930         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 931
 932     pend = p + bytelen;
 933
 934     while (p < pend) {
 935         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 936         p += 2;
 937         if (uv < 0x80) {
 938             *d++ = (U8)uv;
 939             continue;
 940         }
 941         if (uv < 0x800) {
 942             *d++ = (U8)(( uv >>  6)         | 0xc0);
 943             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 944             continue;
 945         }
 946         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 947             UV low = (p[0] << 8) + p[1];
 948             p += 2;
 949             if (low < 0xdc00 || low >= 0xdfff)
 950                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 951             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 952         }
 953         if (uv < 0x10000) {
 954             *d++ = (U8)(( uv >> 12)         | 0xe0);
 955             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 956             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 957             continue;
 958         }
 959         else {
 960             *d++ = (U8)(( uv >> 18)         | 0xf0);
 961             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 962             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 963             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 964             continue;
 965         }
 966     }
 967     *newlen = d - dstart;
 968     return d;
 969 }
 970
 971 /* Note: this one is slightly destructive of the source. */
 972
 973 U8*
 974 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 975 {
 976     U8* s = (U8*)p;
 977     U8* send = s + bytelen;
 978     while (s < send) {
 979         U8 tmp = s[0];
 980         s[0] = s[1];
 981         s[1] = tmp;
 982         s += 2;
 983     }
 984     return utf16_to_utf8(p, d, bytelen, newlen);
 985 }
 986
 987 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 988
 989 bool
 990 Perl_is_uni_alnum(pTHX_ UV c)
 991 {
 992     U8 tmpbuf[UTF8_MAXBYTES+1];
 993     uvchr_to_utf8(tmpbuf, c);
 994     return is_utf8_alnum(tmpbuf);
 995 }
 996
 997 bool
 998 Perl_is_uni_alnumc(pTHX_ UV c)
 999 {
1000     U8 tmpbuf[UTF8_MAXBYTES+1];
1001     uvchr_to_utf8(tmpbuf, c);
1002     return is_utf8_alnumc(tmpbuf);
1003 }
1004
1005 bool
1006 Perl_is_uni_idfirst(pTHX_ UV c)
1007 {
1008     U8 tmpbuf[UTF8_MAXBYTES+1];
1009     uvchr_to_utf8(tmpbuf, c);
1010     return is_utf8_idfirst(tmpbuf);
1011 }
1012
1013 bool
1014 Perl_is_uni_alpha(pTHX_ UV c)
1015 {
1016     U8 tmpbuf[UTF8_MAXBYTES+1];
1017     uvchr_to_utf8(tmpbuf, c);
1018     return is_utf8_alpha(tmpbuf);
1019 }
1020
1021 bool
1022 Perl_is_uni_ascii(pTHX_ UV c)
1023 {
1024     U8 tmpbuf[UTF8_MAXBYTES+1];
1025     uvchr_to_utf8(tmpbuf, c);
1026     return is_utf8_ascii(tmpbuf);
1027 }
1028
1029 bool
1030 Perl_is_uni_space(pTHX_ UV c)
1031 {
1032     U8 tmpbuf[UTF8_MAXBYTES+1];
1033     uvchr_to_utf8(tmpbuf, c);
1034     return is_utf8_space(tmpbuf);
1035 }
1036
1037 bool
1038 Perl_is_uni_digit(pTHX_ UV c)
1039 {
1040     U8 tmpbuf[UTF8_MAXBYTES+1];
1041     uvchr_to_utf8(tmpbuf, c);
1042     return is_utf8_digit(tmpbuf);
1043 }
1044
1045 bool
1046 Perl_is_uni_upper(pTHX_ UV c)
1047 {
1048     U8 tmpbuf[UTF8_MAXBYTES+1];
1049     uvchr_to_utf8(tmpbuf, c);
1050     return is_utf8_upper(tmpbuf);
1051 }
1052
1053 bool
1054 Perl_is_uni_lower(pTHX_ UV c)
1055 {
1056     U8 tmpbuf[UTF8_MAXBYTES+1];
1057     uvchr_to_utf8(tmpbuf, c);
1058     return is_utf8_lower(tmpbuf);
1059 }
1060
1061 bool
1062 Perl_is_uni_cntrl(pTHX_ UV c)
1063 {
1064     U8 tmpbuf[UTF8_MAXBYTES+1];
1065     uvchr_to_utf8(tmpbuf, c);
1066     return is_utf8_cntrl(tmpbuf);
1067 }
1068
1069 bool
1070 Perl_is_uni_graph(pTHX_ UV c)
1071 {
1072     U8 tmpbuf[UTF8_MAXBYTES+1];
1073     uvchr_to_utf8(tmpbuf, c);
1074     return is_utf8_graph(tmpbuf);
1075 }
1076
1077 bool
1078 Perl_is_uni_print(pTHX_ UV c)
1079 {
1080     U8 tmpbuf[UTF8_MAXBYTES+1];
1081     uvchr_to_utf8(tmpbuf, c);
1082     return is_utf8_print(tmpbuf);
1083 }
1084
1085 bool
1086 Perl_is_uni_punct(pTHX_ UV c)
1087 {
1088     U8 tmpbuf[UTF8_MAXBYTES+1];
1089     uvchr_to_utf8(tmpbuf, c);
1090     return is_utf8_punct(tmpbuf);
1091 }
1092
1093 bool
1094 Perl_is_uni_xdigit(pTHX_ UV c)
1095 {
1096     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1097     uvchr_to_utf8(tmpbuf, c);
1098     return is_utf8_xdigit(tmpbuf);
1099 }
1100
1101 UV
1102 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1103 {
1104     uvchr_to_utf8(p, c);
1105     return to_utf8_upper(p, p, lenp);
1106 }
1107
1108 UV
1109 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1110 {
1111     uvchr_to_utf8(p, c);
1112     return to_utf8_title(p, p, lenp);
1113 }
1114
1115 UV
1116 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1117 {
1118     uvchr_to_utf8(p, c);
1119     return to_utf8_lower(p, p, lenp);
1120 }
1121
1122 UV
1123 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1124 {
1125     uvchr_to_utf8(p, c);
1126     return to_utf8_fold(p, p, lenp);
1127 }
1128
1129 /* for now these all assume no locale info available for Unicode > 255 */
1130
1131 bool
1132 Perl_is_uni_alnum_lc(pTHX_ UV c)
1133 {
1134     return is_uni_alnum(c);     /* XXX no locale support yet */
1135 }
1136
1137 bool
1138 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1139 {
1140     return is_uni_alnumc(c);    /* XXX no locale support yet */
1141 }
1142
1143 bool
1144 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1145 {
1146     return is_uni_idfirst(c);   /* XXX no locale support yet */
1147 }
1148
1149 bool
1150 Perl_is_uni_alpha_lc(pTHX_ UV c)
1151 {
1152     return is_uni_alpha(c);     /* XXX no locale support yet */
1153 }
1154
1155 bool
1156 Perl_is_uni_ascii_lc(pTHX_ UV c)
1157 {
1158     return is_uni_ascii(c);     /* XXX no locale support yet */
1159 }
1160
1161 bool
1162 Perl_is_uni_space_lc(pTHX_ UV c)
1163 {
1164     return is_uni_space(c);     /* XXX no locale support yet */
1165 }
1166
1167 bool
1168 Perl_is_uni_digit_lc(pTHX_ UV c)
1169 {
1170     return is_uni_digit(c);     /* XXX no locale support yet */
1171 }
1172
1173 bool
1174 Perl_is_uni_upper_lc(pTHX_ UV c)
1175 {
1176     return is_uni_upper(c);     /* XXX no locale support yet */
1177 }
1178
1179 bool
1180 Perl_is_uni_lower_lc(pTHX_ UV c)
1181 {
1182     return is_uni_lower(c);     /* XXX no locale support yet */
1183 }
1184
1185 bool
1186 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1187 {
1188     return is_uni_cntrl(c);     /* XXX no locale support yet */
1189 }
1190
1191 bool
1192 Perl_is_uni_graph_lc(pTHX_ UV c)
1193 {
1194     return is_uni_graph(c);     /* XXX no locale support yet */
1195 }
1196
1197 bool
1198 Perl_is_uni_print_lc(pTHX_ UV c)
1199 {
1200     return is_uni_print(c);     /* XXX no locale support yet */
1201 }
1202
1203 bool
1204 Perl_is_uni_punct_lc(pTHX_ UV c)
1205 {
1206     return is_uni_punct(c);     /* XXX no locale support yet */
1207 }
1208
1209 bool
1210 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1211 {
1212     return is_uni_xdigit(c);    /* XXX no locale support yet */
1213 }
1214
1215 U32
1216 Perl_to_uni_upper_lc(pTHX_ U32 c)
1217 {
1218     /* XXX returns only the first character -- do not use XXX */
1219     /* XXX no locale support yet */
1220     STRLEN len;
1221     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1222     return (U32)to_uni_upper(c, tmpbuf, &len);
1223 }
1224
1225 U32
1226 Perl_to_uni_title_lc(pTHX_ U32 c)
1227 {
1228     /* XXX returns only the first character XXX -- do not use XXX */
1229     /* XXX no locale support yet */
1230     STRLEN len;
1231     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1232     return (U32)to_uni_title(c, tmpbuf, &len);
1233 }
1234
1235 U32
1236 Perl_to_uni_lower_lc(pTHX_ U32 c)
1237 {
1238     /* XXX returns only the first character -- do not use XXX */
1239     /* XXX no locale support yet */
1240     STRLEN len;
1241     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1242     return (U32)to_uni_lower(c, tmpbuf, &len);
1243 }
1244
1245 bool
1246 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1247 {
1248     if (!is_utf8_char(p))
1249         return FALSE;
1250     if (!PL_utf8_alnum)
1251         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1252          * descendant of isalnum(3), in other words, it doesn't
1253          * contain the '_'. --jhi */
1254         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1255     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1256 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1257 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1258     if (!PL_utf8_alnum)
1259         PL_utf8_alnum = swash_init("utf8", "",
1260             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1261     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1262 #endif
1263 }
1264
1265 bool
1266 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1267 {
1268     if (!is_utf8_char(p))
1269         return FALSE;
1270     if (!PL_utf8_alnum)
1271         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1272     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1273 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1274 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1275     if (!PL_utf8_alnum)
1276         PL_utf8_alnum = swash_init("utf8", "",
1277             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1278     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1279 #endif
1280 }
1281
1282 bool
1283 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1284 {
1285     if (*p == '_')
1286         return TRUE;
1287     if (!is_utf8_char(p))
1288         return FALSE;
1289     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1290         PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1291     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1292 }
1293
1294 bool
1295 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1296 {
1297     if (*p == '_')
1298         return TRUE;
1299     if (!is_utf8_char(p))
1300         return FALSE;
1301     if (!PL_utf8_idcont)
1302         PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1303     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1304 }
1305
1306 bool
1307 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1308 {
1309     if (!is_utf8_char(p))
1310         return FALSE;
1311     if (!PL_utf8_alpha)
1312         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1313     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1314 }
1315
1316 bool
1317 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1318 {
1319     if (!is_utf8_char(p))
1320         return FALSE;
1321     if (!PL_utf8_ascii)
1322         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1323     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1324 }
1325
1326 bool
1327 Perl_is_utf8_space(pTHX_ const U8 *p)
1328 {
1329     if (!is_utf8_char(p))
1330         return FALSE;
1331     if (!PL_utf8_space)
1332         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1333     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1334 }
1335
1336 bool
1337 Perl_is_utf8_digit(pTHX_ const U8 *p)
1338 {
1339     if (!is_utf8_char(p))
1340         return FALSE;
1341     if (!PL_utf8_digit)
1342         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1343     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1344 }
1345
1346 bool
1347 Perl_is_utf8_upper(pTHX_ const U8 *p)
1348 {
1349     if (!is_utf8_char(p))
1350         return FALSE;
1351     if (!PL_utf8_upper)
1352         PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1353     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1354 }
1355
1356 bool
1357 Perl_is_utf8_lower(pTHX_ const U8 *p)
1358 {
1359     if (!is_utf8_char(p))
1360         return FALSE;
1361     if (!PL_utf8_lower)
1362         PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1363     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1364 }
1365
1366 bool
1367 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1368 {
1369     if (!is_utf8_char(p))
1370         return FALSE;
1371     if (!PL_utf8_cntrl)
1372         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1373     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1374 }
1375
1376 bool
1377 Perl_is_utf8_graph(pTHX_ const U8 *p)
1378 {
1379     if (!is_utf8_char(p))
1380         return FALSE;
1381     if (!PL_utf8_graph)
1382         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1383     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1384 }
1385
1386 bool
1387 Perl_is_utf8_print(pTHX_ const U8 *p)
1388 {
1389     if (!is_utf8_char(p))
1390         return FALSE;
1391     if (!PL_utf8_print)
1392         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1393     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1394 }
1395
1396 bool
1397 Perl_is_utf8_punct(pTHX_ const U8 *p)
1398 {
1399     if (!is_utf8_char(p))
1400         return FALSE;
1401     if (!PL_utf8_punct)
1402         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1403     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1404 }
1405
1406 bool
1407 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1408 {
1409     if (!is_utf8_char(p))
1410         return FALSE;
1411     if (!PL_utf8_xdigit)
1412         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1413     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1414 }
1415
1416 bool
1417 Perl_is_utf8_mark(pTHX_ const U8 *p)
1418 {
1419     if (!is_utf8_char(p))
1420         return FALSE;
1421     if (!PL_utf8_mark)
1422         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1423     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1424 }
1425
1426 /*
1427 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1428
1429 The "p" contains the pointer to the UTF-8 string encoding
1430 the character that is being converted.
1431
1432 The "ustrp" is a pointer to the character buffer to put the
1433 conversion result to.  The "lenp" is a pointer to the length
1434 of the result.
1435
1436 The "swashp" is a pointer to the swash to use.
1437
1438 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1439 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1440 but not always, a multicharacter mapping), is tried first.
1441
1442 The "special" is a string like "utf8::ToSpecLower", which means the
1443 hash %utf8::ToSpecLower.  The access to the hash is through
1444 Perl_to_utf8_case().
1445
1446 The "normal" is a string like "ToLower" which means the swash
1447 %utf8::ToLower.
1448
1449 =cut */
1450
1451 UV
1452 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1453                         SV **swashp, const char *normal, const char *special)
1454 {
1455     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1456     STRLEN len = 0;
1457
1458     const UV uv0 = utf8_to_uvchr(p, NULL);
1459     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1460      * are necessary in EBCDIC, they are redundant no-ops
1461      * in ASCII-ish platforms, and hopefully optimized away. */
1462     const UV uv1 = NATIVE_TO_UNI(uv0);
1463     uvuni_to_utf8(tmpbuf, uv1);
1464
1465     if (!*swashp) /* load on-demand */
1466          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1467
1468     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1469     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1470          /* It might be "special" (sometimes, but not always,
1471           * a multicharacter mapping) */
1472          HV *hv;
1473          SV **svp;
1474
1475          if ((hv  = get_hv(special, FALSE)) &&
1476              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1477              (*svp)) {
1478              const char *s;
1479
1480               s = SvPV_const(*svp, len);
1481               if (len == 1)
1482                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1483               else {
1484 #ifdef EBCDIC
1485                    /* If we have EBCDIC we need to remap the characters
1486                     * since any characters in the low 256 are Unicode
1487                     * code points, not EBCDIC. */
1488                    U8 *t = (U8*)s, *tend = t + len, *d;
1489
1490                    d = tmpbuf;
1491                    if (SvUTF8(*svp)) {
1492                         STRLEN tlen = 0;
1493
1494                         while (t < tend) {
1495                              UV c = utf8_to_uvchr(t, &tlen);
1496                              if (tlen > 0) {
1497                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1498                                   t += tlen;
1499                              }
1500                              else
1501                                   break;
1502                         }
1503                    }
1504                    else {
1505                         while (t < tend) {
1506                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1507                              t++;
1508                         }
1509                    }
1510                    len = d - tmpbuf;
1511                    Copy(tmpbuf, ustrp, len, U8);
1512 #else
1513                    Copy(s, ustrp, len, U8);
1514 #endif
1515               }
1516          }
1517     }
1518
1519     if (!len && *swashp) {
1520          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1521
1522          if (uv2) {
1523               /* It was "normal" (a single character mapping). */
1524               UV uv3 = UNI_TO_NATIVE(uv2);
1525
1526               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1527          }
1528     }
1529
1530     if (!len) /* Neither: just copy. */
1531          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1532
1533     if (lenp)
1534          *lenp = len;
1535
1536     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1537 }
1538
1539 /*
1540 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1541
1542 Convert the UTF-8 encoded character at p to its uppercase version and
1543 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1544 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1545 the uppercase version may be longer than the original character.
1546
1547 The first character of the uppercased version is returned
1548 (but note, as explained above, that there may be more.)
1549
1550 =cut */
1551
1552 UV
1553 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1554 {
1555     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1556                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1557 }
1558
1559 /*
1560 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1561
1562 Convert the UTF-8 encoded character at p to its titlecase version and
1563 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1564 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1565 titlecase version may be longer than the original character.
1566
1567 The first character of the titlecased version is returned
1568 (but note, as explained above, that there may be more.)
1569
1570 =cut */
1571
1572 UV
1573 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1574 {
1575     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1576                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1577 }
1578
1579 /*
1580 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1581
1582 Convert the UTF-8 encoded character at p to its lowercase version and
1583 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1584 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1585 lowercase version may be longer than the original character.
1586
1587 The first character of the lowercased version is returned
1588 (but note, as explained above, that there may be more.)
1589
1590 =cut */
1591
1592 UV
1593 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1594 {
1595     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1596                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1597 }
1598
1599 /*
1600 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1601
1602 Convert the UTF-8 encoded character at p to its foldcase version and
1603 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1604 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1605 foldcase version may be longer than the original character (up to
1606 three characters).
1607
1608 The first character of the foldcased version is returned
1609 (but note, as explained above, that there may be more.)
1610
1611 =cut */
1612
1613 UV
1614 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1615 {
1616     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1617                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1618 }
1619
1620 /* a "swash" is a swatch hash */
1621
1622 SV*
1623 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1624 {
1625     dVAR;
1626     SV* retval;
1627     SV* const tokenbufsv = sv_newmortal();
1628     dSP;
1629     const size_t pkg_len = strlen(pkg);
1630     const size_t name_len = strlen(name);
1631     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1632     SV* errsv_save;
1633
1634     PUSHSTACKi(PERLSI_MAGIC);
1635     ENTER;
1636     SAVEI32(PL_hints);
1637     PL_hints = 0;
1638     save_re_context();
1639     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1640         ENTER;
1641         errsv_save = newSVsv(ERRSV);
1642         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1643                          Nullsv);
1644         if (!SvTRUE(ERRSV))
1645             sv_setsv(ERRSV, errsv_save);
1646         SvREFCNT_dec(errsv_save);
1647         LEAVE;
1648     }
1649     SPAGAIN;
1650     PUSHMARK(SP);
1651     EXTEND(SP,5);
1652     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1653     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1654     PUSHs(listsv);
1655     PUSHs(sv_2mortal(newSViv(minbits)));
1656     PUSHs(sv_2mortal(newSViv(none)));
1657     PUTBACK;
1658     if (IN_PERL_COMPILETIME) {
1659         /* XXX ought to be handled by lex_start */
1660         SAVEI32(PL_in_my);
1661         PL_in_my = 0;
1662         sv_setpv(tokenbufsv, PL_tokenbuf);
1663     }
1664     errsv_save = newSVsv(ERRSV);
1665     if (call_method("SWASHNEW", G_SCALAR))
1666         retval = newSVsv(*PL_stack_sp--);
1667     else
1668         retval = &PL_sv_undef;
1669     if (!SvTRUE(ERRSV))
1670         sv_setsv(ERRSV, errsv_save);
1671     SvREFCNT_dec(errsv_save);
1672     LEAVE;
1673     POPSTACK;
1674     if (IN_PERL_COMPILETIME) {
1675         STRLEN len;
1676         const char* const pv = SvPV_const(tokenbufsv, len);
1677
1678         Copy(pv, PL_tokenbuf, len+1, char);
1679         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1680     }
1681     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1682         if (SvPOK(retval))
1683             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1684                        retval);
1685         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1686     }
1687     return retval;
1688 }
1689
1690
1691 /* This API is wrong for special case conversions since we may need to
1692  * return several Unicode characters for a single Unicode character
1693  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1694  * the lower-level routine, and it is similarly broken for returning
1695  * multiple values.  --jhi */
1696 UV
1697 Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
1698 {
1699     dVAR;
1700     HV* const hv = (HV*)SvRV(sv);
1701     U32 klen;
1702     U32 off;
1703     STRLEN slen;
1704     STRLEN needents;
1705     const U8 *tmps = NULL;
1706     U32 bit;
1707     SV *retval;
1708     U8 tmputf8[2];
1709     UV c = NATIVE_TO_ASCII(*ptr);
1710
1711     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1712         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1713         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1714         ptr = tmputf8;
1715     }
1716     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1717      * then the "swatch" is a vec() for al the chars which start
1718      * with 0xAA..0xYY
1719      * So the key in the hash (klen) is length of encoded char -1
1720      */
1721     klen = UTF8SKIP(ptr) - 1;
1722     off  = ptr[klen];
1723
1724     if (klen == 0)
1725      {
1726       /* If char in invariant then swatch is for all the invariant chars
1727        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1728        */
1729       needents = UTF_CONTINUATION_MARK;
1730       off      = NATIVE_TO_UTF(ptr[klen]);
1731      }
1732     else
1733      {
1734       /* If char is encoded then swatch is for the prefix */
1735       needents = (1 << UTF_ACCUMULATION_SHIFT);
1736       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1737      }
1738
1739     /*
1740      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1741      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1742      * it's nothing to sniff at.)  Pity we usually come through at least
1743      * two function calls to get here...
1744      *
1745      * NB: this code assumes that swatches are never modified, once generated!
1746      */
1747
1748     if (hv   == PL_last_swash_hv &&
1749         klen == PL_last_swash_klen &&
1750         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1751     {
1752         tmps = PL_last_swash_tmps;
1753         slen = PL_last_swash_slen;
1754     }
1755     else {
1756         /* Try our second-level swatch cache, kept in a hash. */
1757         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1758
1759         /* If not cached, generate it via utf8::SWASHGET */
1760         if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1761             dSP;
1762             /* We use utf8n_to_uvuni() as we want an index into
1763                Unicode tables, not a native character number.
1764              */
1765             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1766                                            ckWARN(WARN_UTF8) ?
1767                                            0 : UTF8_ALLOW_ANY);
1768             SV *errsv_save;
1769             ENTER;
1770             SAVETMPS;
1771             save_re_context();
1772             PUSHSTACKi(PERLSI_MAGIC);
1773             PUSHMARK(SP);
1774             EXTEND(SP,3);
1775             PUSHs((SV*)sv);
1776             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1777             PUSHs(sv_2mortal(newSViv((klen) ?
1778                                      (code_point & ~(needents - 1)) : 0)));
1779             PUSHs(sv_2mortal(newSViv(needents)));
1780             PUTBACK;
1781             errsv_save = newSVsv(ERRSV);
1782             if (call_method("SWASHGET", G_SCALAR))
1783                 retval = newSVsv(*PL_stack_sp--);
1784             else
1785                 retval = &PL_sv_undef;
1786             if (!SvTRUE(ERRSV))
1787                 sv_setsv(ERRSV, errsv_save);
1788             SvREFCNT_dec(errsv_save);
1789             POPSTACK;
1790             FREETMPS;
1791             LEAVE;
1792             if (IN_PERL_COMPILETIME)
1793                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1794
1795             svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
1796
1797             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1798                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1799         }
1800
1801         PL_last_swash_hv = hv;
1802         PL_last_swash_klen = klen;
1803         /* FIXME change interpvar.h?  */
1804         PL_last_swash_tmps = (U8 *) tmps;
1805         PL_last_swash_slen = slen;
1806         if (klen)
1807             Copy(ptr, PL_last_swash_key, klen, U8);
1808     }
1809
1810     switch ((int)((slen << 3) / needents)) {
1811     case 1:
1812         bit = 1 << (off & 7);
1813         off >>= 3;
1814         return (tmps[off] & bit) != 0;
1815     case 8:
1816         return tmps[off];
1817     case 16:
1818         off <<= 1;
1819         return (tmps[off] << 8) + tmps[off + 1] ;
1820     case 32:
1821         off <<= 2;
1822         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1823     }
1824     Perl_croak(aTHX_ "panic: swash_fetch");
1825     return 0;
1826 }
1827
1828 U8 *
1829 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1830 {
1831     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1832 }
1833
1834 /*
1835 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1836
1837 Build to the scalar dsv a displayable version of the string spv,
1838 length len, the displayable version being at most pvlim bytes long
1839 (if longer, the rest is truncated and "..." will be appended).
1840
1841 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1842 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1843 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1844 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1845 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1846 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1847
1848 The pointer to the PV of the dsv is returned.
1849
1850 =cut */
1851 char *
1852 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1853 {
1854     int truncated = 0;
1855     const char *s, *e;
1856
1857     sv_setpvn(dsv, "", 0);
1858     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1859          UV u;
1860           /* This serves double duty as a flag and a character to print after
1861              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1862           */
1863          char ok = 0;
1864
1865          if (pvlim && SvCUR(dsv) >= pvlim) {
1866               truncated++;
1867               break;
1868          }
1869          u = utf8_to_uvchr((U8*)s, 0);
1870          if (u < 256) {
1871              const unsigned char c = (unsigned char)u & 0xFF;
1872              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1873                  switch (c) {
1874                  case '\n':
1875                      ok = 'n'; break;
1876                  case '\r':
1877                      ok = 'r'; break;
1878                  case '\t':
1879                      ok = 't'; break;
1880                  case '\f':
1881                      ok = 'f'; break;
1882                  case '\a':
1883                      ok = 'a'; break;
1884                  case '\\':
1885                      ok = '\\'; break;
1886                  default: break;
1887                  }
1888                  if (ok) {
1889                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1890                  }
1891              }
1892              /* isPRINT() is the locale-blind version. */
1893              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1894                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1895                  ok = 1;
1896              }
1897          }
1898          if (!ok)
1899              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1900     }
1901     if (truncated)
1902          sv_catpvn(dsv, "...", 3);
1903
1904     return SvPVX(dsv);
1905 }
1906
1907 /*
1908 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1909
1910 Build to the scalar dsv a displayable version of the scalar sv,
1911 the displayable version being at most pvlim bytes long
1912 (if longer, the rest is truncated and "..." will be appended).
1913
1914 The flags argument is as in pv_uni_display().
1915
1916 The pointer to the PV of the dsv is returned.
1917
1918 =cut */
1919 char *
1920 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1921 {
1922      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1923                                 SvCUR(ssv), pvlim, flags);
1924 }
1925
1926 /*
1927 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1928
1929 Return true if the strings s1 and s2 differ case-insensitively, false
1930 if not (if they are equal case-insensitively).  If u1 is true, the
1931 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1932 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1933 are false, the respective string is assumed to be in native 8-bit
1934 encoding.
1935
1936 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1937 in there (they will point at the beginning of the I<next> character).
1938 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1939 pointers beyond which scanning will not continue under any
1940 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1941 s2+l2 will be used as goal end pointers that will also stop the scan,
1942 and which qualify towards defining a successful match: all the scans
1943 that define an explicit length must reach their goal pointers for
1944 a match to succeed).
1945
1946 For case-insensitiveness, the "casefolding" of Unicode is used
1947 instead of upper/lowercasing both the characters, see
1948 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1949
1950 =cut */
1951 I32
1952 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1953 {
1954      register const U8 *p1  = (const U8*)s1;
1955      register const U8 *p2  = (const U8*)s2;
1956      register const U8 *f1 = 0, *f2 = 0;
1957      register U8 *e1 = 0, *q1 = 0;
1958      register U8 *e2 = 0, *q2 = 0;
1959      STRLEN n1 = 0, n2 = 0;
1960      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1961      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
1962      U8 natbuf[1+1];
1963      STRLEN foldlen1, foldlen2;
1964      bool match;
1965
1966      if (pe1)
1967           e1 = *(U8**)pe1;
1968      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1969           f1 = (const U8*)s1 + l1;
1970      if (pe2)
1971           e2 = *(U8**)pe2;
1972      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1973           f2 = (const U8*)s2 + l2;
1974
1975      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1976           return 1; /* mismatch; possible infinite loop or false positive */
1977
1978      if (!u1 || !u2)
1979           natbuf[1] = 0; /* Need to terminate the buffer. */
1980
1981      while ((e1 == 0 || p1 < e1) &&
1982             (f1 == 0 || p1 < f1) &&
1983             (e2 == 0 || p2 < e2) &&
1984             (f2 == 0 || p2 < f2)) {
1985           if (n1 == 0) {
1986                if (u1)
1987                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1988                else {
1989                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
1990                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1991                }
1992                q1 = foldbuf1;
1993                n1 = foldlen1;
1994           }
1995           if (n2 == 0) {
1996                if (u2)
1997                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1998                else {
1999                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2000                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2001                }
2002                q2 = foldbuf2;
2003                n2 = foldlen2;
2004           }
2005           while (n1 && n2) {
2006                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2007                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2008                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2009                    return 1; /* mismatch */
2010                n1 -= UTF8SKIP(q1);
2011                q1 += UTF8SKIP(q1);
2012                n2 -= UTF8SKIP(q2);
2013                q2 += UTF8SKIP(q2);
2014           }
2015           if (n1 == 0)
2016                p1 += u1 ? UTF8SKIP(p1) : 1;
2017           if (n2 == 0)
2018                p2 += u2 ? UTF8SKIP(p2) : 1;
2019
2020      }
2021
2022      /* A match is defined by all the scans that specified
2023       * an explicit length reaching their final goals. */
2024      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2025
2026      if (match) {
2027           if (pe1)
2028                *pe1 = (char*)p1;
2029           if (pe2)
2030                *pe2 = (char*)p2;
2031      }
2032
2033      return match ? 0 : 1; /* 0 match, 1 mismatch */
2034 }
2035
2036 /*
2037  * Local variables:
2038  * c-indentation-style: bsd
2039  * c-basic-offset: 4
2040  * indent-tabs-mode: t
2041  * End:
2042  *
2043  * ex: set ts=8 sts=4 sw=4 noet:
2044  */