utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 #ifndef EBCDIC
  29 /* Separate prototypes needed because in ASCII systems these
  30  * usually macros but they still are compiled as code, too. */
  31 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  32 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  33 #endif
  34
  35 static const char unees[] =
  36     "Malformed UTF-8 character (unexpected end of string)";
  37
  38 /*
  39 =head1 Unicode Support
  40
  41 This file contains various utility functions for manipulating UTF8-encoded
  42 strings. For the uninitiated, this is a method of representing arbitrary
  43 Unicode characters as a variable number of bytes, in such a way that
  44 characters in the ASCII range are unmodified, and a zero byte never appears
  45 within non-zero characters.
  46
  47 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  48
  49 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  50 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  51 bytes available. The return value is the pointer to the byte after the
  52 end of the new character. In other words,
  53
  54     d = uvuni_to_utf8_flags(d, uv, flags);
  55
  56 or, in most cases,
  57
  58     d = uvuni_to_utf8(d, uv);
  59
  60 (which is equivalent to)
  61
  62     d = uvuni_to_utf8_flags(d, uv, 0);
  63
  64 is the recommended Unicode-aware way of saying
  65
  66     *(d++) = uv;
  67
  68 =cut
  69 */
  70
  71 U8 *
  72 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  73 {
  74     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
  75
  76     if (ckWARN(WARN_UTF8)) {
  77          if (UNICODE_IS_SURROGATE(uv) &&
  78              !(flags & UNICODE_ALLOW_SURROGATE))
  79               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  80          else if (
  81                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  82                     !(flags & UNICODE_ALLOW_FDD0))
  83                    ||
  84                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  85                     !(flags & UNICODE_ALLOW_FFFF))) &&
  86                   /* UNICODE_ALLOW_SUPER includes
  87                    * FFFEs and FFFFs beyond 0x10FFFF. */
  88                   ((uv <= PERL_UNICODE_MAX) ||
  89                    !(flags & UNICODE_ALLOW_SUPER))
  90                   )
  91               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  92                          "Unicode character 0x%04"UVxf" is illegal", uv);
  93     }
  94     if (UNI_IS_INVARIANT(uv)) {
  95         *d++ = (U8)UTF_TO_NATIVE(uv);
  96         return d;
  97     }
  98 #if defined(EBCDIC)
  99     else {
 100         STRLEN len  = UNISKIP(uv);
 101         U8 *p = d+len-1;
 102         while (p > d) {
 103             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 104             uv >>= UTF_ACCUMULATION_SHIFT;
 105         }
 106         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 107         return d+len;
 108     }
 109 #else /* Non loop style */
 110     if (uv < 0x800) {
 111         *d++ = (U8)(( uv >>  6)         | 0xc0);
 112         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 113         return d;
 114     }
 115     if (uv < 0x10000) {
 116         *d++ = (U8)(( uv >> 12)         | 0xe0);
 117         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 118         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 119         return d;
 120     }
 121     if (uv < 0x200000) {
 122         *d++ = (U8)(( uv >> 18)         | 0xf0);
 123         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 124         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 125         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 126         return d;
 127     }
 128     if (uv < 0x4000000) {
 129         *d++ = (U8)(( uv >> 24)         | 0xf8);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136     if (uv < 0x80000000) {
 137         *d++ = (U8)(( uv >> 30)         | 0xfc);
 138         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 139         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 140         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 141         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 142         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 143         return d;
 144     }
 145 #ifdef HAS_QUAD
 146     if (uv < UTF8_QUAD_MAX)
 147 #endif
 148     {
 149         *d++ =                            0xfe; /* Can't match U+FEFF! */
 150         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 151         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 152         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 153         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 154         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 155         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 156         return d;
 157     }
 158 #ifdef HAS_QUAD
 159     {
 160         *d++ =                            0xff;         /* Can't match U+FFFE! */
 161         *d++ =                            0x80;         /* 6 Reserved bits */
 162         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 163         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 164         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 165         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 166         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 167         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 168         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 169         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 170         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 171         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 172         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 173         return d;
 174     }
 175 #endif
 176 #endif /* Loop style */
 177 }
 178
 179 /*
 180
 181 Tests if some arbitrary number of bytes begins in a valid UTF-8
 182 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 183 UTF-8 character.  The actual number of bytes in the UTF-8 character
 184 will be returned if it is valid, otherwise 0.
 185
 186 This is the "slow" version as opposed to the "fast" version which is
 187 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 188 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 189 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 190 you should use the _slow().  In practice this means that the _slow()
 191 will be used very rarely, since the maximum Unicode code point (as of
 192 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 193 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 194 five bytes or more.
 195
 196 =cut */
 197 STATIC STRLEN
 198 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 199 {
 200     U8 u = *s;
 201     STRLEN slen;
 202     UV uv, ouv;
 203
 204     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 205
 206     if (UTF8_IS_INVARIANT(u))
 207         return 1;
 208
 209     if (!UTF8_IS_START(u))
 210         return 0;
 211
 212     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 213         return 0;
 214
 215     slen = len - 1;
 216     s++;
 217 #ifdef EBCDIC
 218     u = NATIVE_TO_UTF(u);
 219 #endif
 220     u &= UTF_START_MASK(len);
 221     uv  = u;
 222     ouv = uv;
 223     while (slen--) {
 224         if (!UTF8_IS_CONTINUATION(*s))
 225             return 0;
 226         uv = UTF8_ACCUMULATE(uv, *s);
 227         if (uv < ouv)
 228             return 0;
 229         ouv = uv;
 230         s++;
 231     }
 232
 233     if ((STRLEN)UNISKIP(uv) < len)
 234         return 0;
 235
 236     return len;
 237 }
 238
 239 /*
 240 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 241
 242 Tests if some arbitrary number of bytes begins in a valid UTF-8
 243 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 244 UTF-8 character.  The actual number of bytes in the UTF-8 character
 245 will be returned if it is valid, otherwise 0.
 246
 247 =cut */
 248 STRLEN
 249 Perl_is_utf8_char(pTHX_ const U8 *s)
 250 {
 251     const STRLEN len = UTF8SKIP(s);
 252
 253     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 254     PERL_UNUSED_CONTEXT;
 255 #ifdef IS_UTF8_CHAR
 256     if (IS_UTF8_CHAR_FAST(len))
 257         return IS_UTF8_CHAR(s, len) ? len : 0;
 258 #endif /* #ifdef IS_UTF8_CHAR */
 259     return is_utf8_char_slow(s, len);
 260 }
 261
 262 /*
 263 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 264
 265 Returns true if first C<len> bytes of the given string form a valid
 266 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 267 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 268 because a valid ASCII string is a valid UTF-8 string.
 269
 270 See also is_utf8_string_loclen() and is_utf8_string_loc().
 271
 272 =cut
 273 */
 274
 275 bool
 276 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 277 {
 278     const U8* const send = s + (len ? len : strlen((const char *)s));
 279     const U8* x = s;
 280
 281     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 282     PERL_UNUSED_CONTEXT;
 283
 284     while (x < send) {
 285         STRLEN c;
 286          /* Inline the easy bits of is_utf8_char() here for speed... */
 287          if (UTF8_IS_INVARIANT(*x))
 288               c = 1;
 289          else if (!UTF8_IS_START(*x))
 290              goto out;
 291          else {
 292               /* ... and call is_utf8_char() only if really needed. */
 293 #ifdef IS_UTF8_CHAR
 294              c = UTF8SKIP(x);
 295              if (IS_UTF8_CHAR_FAST(c)) {
 296                  if (!IS_UTF8_CHAR(x, c))
 297                      c = 0;
 298              }
 299              else
 300                 c = is_utf8_char_slow(x, c);
 301 #else
 302              c = is_utf8_char(x);
 303 #endif /* #ifdef IS_UTF8_CHAR */
 304               if (!c)
 305                   goto out;
 306          }
 307         x += c;
 308     }
 309
 310  out:
 311     if (x != send)
 312         return FALSE;
 313
 314     return TRUE;
 315 }
 316
 317 /*
 318 Implemented as a macro in utf8.h
 319
 320 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 321
 322 Like is_utf8_string() but stores the location of the failure (in the
 323 case of "utf8ness failure") or the location s+len (in the case of
 324 "utf8ness success") in the C<ep>.
 325
 326 See also is_utf8_string_loclen() and is_utf8_string().
 327
 328 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 329
 330 Like is_utf8_string() but stores the location of the failure (in the
 331 case of "utf8ness failure") or the location s+len (in the case of
 332 "utf8ness success") in the C<ep>, and the number of UTF-8
 333 encoded characters in the C<el>.
 334
 335 See also is_utf8_string_loc() and is_utf8_string().
 336
 337 =cut
 338 */
 339
 340 bool
 341 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 342 {
 343     const U8* const send = s + (len ? len : strlen((const char *)s));
 344     const U8* x = s;
 345     STRLEN c;
 346     STRLEN outlen = 0;
 347
 348     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 349     PERL_UNUSED_CONTEXT;
 350
 351     while (x < send) {
 352          /* Inline the easy bits of is_utf8_char() here for speed... */
 353          if (UTF8_IS_INVARIANT(*x))
 354              c = 1;
 355          else if (!UTF8_IS_START(*x))
 356              goto out;
 357          else {
 358              /* ... and call is_utf8_char() only if really needed. */
 359 #ifdef IS_UTF8_CHAR
 360              c = UTF8SKIP(x);
 361              if (IS_UTF8_CHAR_FAST(c)) {
 362                  if (!IS_UTF8_CHAR(x, c))
 363                      c = 0;
 364              } else
 365                  c = is_utf8_char_slow(x, c);
 366 #else
 367              c = is_utf8_char(x);
 368 #endif /* #ifdef IS_UTF8_CHAR */
 369              if (!c)
 370                  goto out;
 371          }
 372          x += c;
 373          outlen++;
 374     }
 375
 376  out:
 377     if (el)
 378         *el = outlen;
 379
 380     if (ep)
 381         *ep = x;
 382     return (x == send);
 383 }
 384
 385 /*
 386
 387 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 388
 389 Bottom level UTF-8 decode routine.
 390 Returns the Unicode code point value of the first character in the string C<s>
 391 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 392 C<retlen> will be set to the length, in bytes, of that character.
 393
 394 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 395 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 396 it is assumed that the caller will raise a warning, and this function
 397 will silently just set C<retlen> to C<-1> and return zero.  If the
 398 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 399 malformations will be given, C<retlen> will be set to the expected
 400 length of the UTF-8 character in bytes, and zero will be returned.
 401
 402 The C<flags> can also contain various flags to allow deviations from
 403 the strict UTF-8 encoding (see F<utf8.h>).
 404
 405 Most code should use utf8_to_uvchr() rather than call this directly.
 406
 407 =cut
 408 */
 409
 410 UV
 411 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 412 {
 413     dVAR;
 414     const U8 * const s0 = s;
 415     UV uv = *s, ouv = 0;
 416     STRLEN len = 1;
 417     const bool dowarn = ckWARN_d(WARN_UTF8);
 418     const UV startbyte = *s;
 419     STRLEN expectlen = 0;
 420     U32 warning = 0;
 421
 422     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 423
 424 /* This list is a superset of the UTF8_ALLOW_XXX. */
 425
 426 #define UTF8_WARN_EMPTY                          1
 427 #define UTF8_WARN_CONTINUATION                   2
 428 #define UTF8_WARN_NON_CONTINUATION               3
 429 #define UTF8_WARN_FE_FF                          4
 430 #define UTF8_WARN_SHORT                          5
 431 #define UTF8_WARN_OVERFLOW                       6
 432 #define UTF8_WARN_SURROGATE                      7
 433 #define UTF8_WARN_LONG                           8
 434 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 435
 436     if (curlen == 0 &&
 437         !(flags & UTF8_ALLOW_EMPTY)) {
 438         warning = UTF8_WARN_EMPTY;
 439         goto malformed;
 440     }
 441
 442     if (UTF8_IS_INVARIANT(uv)) {
 443         if (retlen)
 444             *retlen = 1;
 445         return (UV) (NATIVE_TO_UTF(*s));
 446     }
 447
 448     if (UTF8_IS_CONTINUATION(uv) &&
 449         !(flags & UTF8_ALLOW_CONTINUATION)) {
 450         warning = UTF8_WARN_CONTINUATION;
 451         goto malformed;
 452     }
 453
 454     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 455         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 456         warning = UTF8_WARN_NON_CONTINUATION;
 457         goto malformed;
 458     }
 459
 460 #ifdef EBCDIC
 461     uv = NATIVE_TO_UTF(uv);
 462 #else
 463     if ((uv == 0xfe || uv == 0xff) &&
 464         !(flags & UTF8_ALLOW_FE_FF)) {
 465         warning = UTF8_WARN_FE_FF;
 466         goto malformed;
 467     }
 468 #endif
 469
 470     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 471     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 472     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 473     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 474 #ifdef EBCDIC
 475     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 476     else                        { len =  7; uv &= 0x01; }
 477 #else
 478     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 479     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 480     else                        { len = 13; uv = 0; } /* whoa! */
 481 #endif
 482
 483     if (retlen)
 484         *retlen = len;
 485
 486     expectlen = len;
 487
 488     if ((curlen < expectlen) &&
 489         !(flags & UTF8_ALLOW_SHORT)) {
 490         warning = UTF8_WARN_SHORT;
 491         goto malformed;
 492     }
 493
 494     len--;
 495     s++;
 496     ouv = uv;
 497
 498     while (len--) {
 499         if (!UTF8_IS_CONTINUATION(*s) &&
 500             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 501             s--;
 502             warning = UTF8_WARN_NON_CONTINUATION;
 503             goto malformed;
 504         }
 505         else
 506             uv = UTF8_ACCUMULATE(uv, *s);
 507         if (!(uv > ouv)) {
 508             /* These cannot be allowed. */
 509             if (uv == ouv) {
 510                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 511                     warning = UTF8_WARN_LONG;
 512                     goto malformed;
 513                 }
 514             }
 515             else { /* uv < ouv */
 516                 /* This cannot be allowed. */
 517                 warning = UTF8_WARN_OVERFLOW;
 518                 goto malformed;
 519             }
 520         }
 521         s++;
 522         ouv = uv;
 523     }
 524
 525     if (UNICODE_IS_SURROGATE(uv) &&
 526         !(flags & UTF8_ALLOW_SURROGATE)) {
 527         warning = UTF8_WARN_SURROGATE;
 528         goto malformed;
 529     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 530                !(flags & UTF8_ALLOW_LONG)) {
 531         warning = UTF8_WARN_LONG;
 532         goto malformed;
 533     } else if (UNICODE_IS_ILLEGAL(uv) &&
 534                !(flags & UTF8_ALLOW_FFFF)) {
 535         warning = UTF8_WARN_FFFF;
 536         goto malformed;
 537     }
 538
 539     return uv;
 540
 541 malformed:
 542
 543     if (flags & UTF8_CHECK_ONLY) {
 544         if (retlen)
 545             *retlen = ((STRLEN) -1);
 546         return 0;
 547     }
 548
 549     if (dowarn) {
 550         SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
 551
 552         switch (warning) {
 553         case 0: /* Intentionally empty. */ break;
 554         case UTF8_WARN_EMPTY:
 555             sv_catpvs(sv, "(empty string)");
 556             break;
 557         case UTF8_WARN_CONTINUATION:
 558             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 559             break;
 560         case UTF8_WARN_NON_CONTINUATION:
 561             if (s == s0)
 562                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 563                            (UV)s[1], startbyte);
 564             else {
 565                 const int len = (int)(s-s0);
 566                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 567                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 568             }
 569
 570             break;
 571         case UTF8_WARN_FE_FF:
 572             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 573             break;
 574         case UTF8_WARN_SHORT:
 575             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 576                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 577             expectlen = curlen;         /* distance for caller to skip */
 578             break;
 579         case UTF8_WARN_OVERFLOW:
 580             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 581                            ouv, *s, startbyte);
 582             break;
 583         case UTF8_WARN_SURROGATE:
 584             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 585             break;
 586         case UTF8_WARN_LONG:
 587             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 588                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 589             break;
 590         case UTF8_WARN_FFFF:
 591             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 592             break;
 593         default:
 594             sv_catpvs(sv, "(unknown reason)");
 595             break;
 596         }
 597
 598         if (warning) {
 599             const char * const s = SvPVX_const(sv);
 600
 601             if (PL_op)
 602                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 603                             "%s in %s", s,  OP_DESC(PL_op));
 604             else
 605                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 606         }
 607     }
 608
 609     if (retlen)
 610         *retlen = expectlen ? expectlen : len;
 611
 612     return 0;
 613 }
 614
 615 /*
 616 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 617
 618 Returns the native character value of the first character in the string C<s>
 619 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 620 length, in bytes, of that character.
 621
 622 If C<s> does not point to a well-formed UTF-8 character, zero is
 623 returned and retlen is set, if possible, to -1.
 624
 625 =cut
 626 */
 627
 628 UV
 629 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 630 {
 631     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 632
 633     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 634                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 635 }
 636
 637 /*
 638 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 639
 640 Returns the Unicode code point of the first character in the string C<s>
 641 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 642 length, in bytes, of that character.
 643
 644 This function should only be used when returned UV is considered
 645 an index into the Unicode semantic tables (e.g. swashes).
 646
 647 If C<s> does not point to a well-formed UTF-8 character, zero is
 648 returned and retlen is set, if possible, to -1.
 649
 650 =cut
 651 */
 652
 653 UV
 654 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 655 {
 656     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 657
 658     /* Call the low level routine asking for checks */
 659     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 660                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 661 }
 662
 663 /*
 664 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 665
 666 Return the length of the UTF-8 char encoded string C<s> in characters.
 667 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 668 up past C<e>, croaks.
 669
 670 =cut
 671 */
 672
 673 STRLEN
 674 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 675 {
 676     dVAR;
 677     STRLEN len = 0;
 678     U8 t = 0;
 679
 680     PERL_ARGS_ASSERT_UTF8_LENGTH;
 681
 682     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 683      * the bitops (especially ~) can create illegal UTF-8.
 684      * In other words: in Perl UTF-8 is not just for Unicode. */
 685
 686     if (e < s)
 687         goto warn_and_return;
 688     while (s < e) {
 689         t = UTF8SKIP(s);
 690         if (e - s < t) {
 691             warn_and_return:
 692             if (ckWARN_d(WARN_UTF8)) {
 693                 if (PL_op)
 694                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 695                             "%s in %s", unees, OP_DESC(PL_op));
 696                 else
 697                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 698             }
 699             return len;
 700         }
 701         s += t;
 702         len++;
 703     }
 704
 705     return len;
 706 }
 707
 708 /*
 709 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 710
 711 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 712 and C<b>.
 713
 714 WARNING: use only if you *know* that the pointers point inside the
 715 same UTF-8 buffer.
 716
 717 =cut
 718 */
 719
 720 IV
 721 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 722 {
 723     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 724
 725     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 726 }
 727
 728 /*
 729 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 730
 731 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 732 forward or backward.
 733
 734 WARNING: do not use the following unless you *know* C<off> is within
 735 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 736 on the first byte of character or just after the last byte of a character.
 737
 738 =cut
 739 */
 740
 741 U8 *
 742 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 743 {
 744     PERL_ARGS_ASSERT_UTF8_HOP;
 745
 746     PERL_UNUSED_CONTEXT;
 747     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 748      * the bitops (especially ~) can create illegal UTF-8.
 749      * In other words: in Perl UTF-8 is not just for Unicode. */
 750
 751     if (off >= 0) {
 752         while (off--)
 753             s += UTF8SKIP(s);
 754     }
 755     else {
 756         while (off++) {
 757             s--;
 758             while (UTF8_IS_CONTINUATION(*s))
 759                 s--;
 760         }
 761     }
 762     return (U8 *)s;
 763 }
 764
 765 /*
 766 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 767
 768 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 769 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 770 updates len to contain the new length.
 771 Returns zero on failure, setting C<len> to -1.
 772
 773 If you need a copy of the string, see C<bytes_from_utf8>.
 774
 775 =cut
 776 */
 777
 778 U8 *
 779 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 780 {
 781     U8 * const save = s;
 782     U8 * const send = s + *len;
 783     U8 *d;
 784
 785     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
 786
 787     /* ensure valid UTF-8 and chars < 256 before updating string */
 788     while (s < send) {
 789         U8 c = *s++;
 790
 791         if (!UTF8_IS_INVARIANT(c) &&
 792             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 793              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 794             *len = ((STRLEN) -1);
 795             return 0;
 796         }
 797     }
 798
 799     d = s = save;
 800     while (s < send) {
 801         STRLEN ulen;
 802         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 803         s += ulen;
 804     }
 805     *d = '\0';
 806     *len = d - save;
 807     return save;
 808 }
 809
 810 /*
 811 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 812
 813 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 814 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 815 the newly-created string, and updates C<len> to contain the new
 816 length.  Returns the original string if no conversion occurs, C<len>
 817 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 818 0 if C<s> is converted or contains all 7bit characters.
 819
 820 =cut
 821 */
 822
 823 U8 *
 824 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 825 {
 826     U8 *d;
 827     const U8 *start = s;
 828     const U8 *send;
 829     I32 count = 0;
 830
 831     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
 832
 833     PERL_UNUSED_CONTEXT;
 834     if (!*is_utf8)
 835         return (U8 *)start;
 836
 837     /* ensure valid UTF-8 and chars < 256 before converting string */
 838     for (send = s + *len; s < send;) {
 839         U8 c = *s++;
 840         if (!UTF8_IS_INVARIANT(c)) {
 841             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 842                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 843                 count++;
 844             else
 845                 return (U8 *)start;
 846         }
 847     }
 848
 849     *is_utf8 = FALSE;
 850
 851     Newx(d, (*len) - count + 1, U8);
 852     s = start; start = d;
 853     while (s < send) {
 854         U8 c = *s++;
 855         if (!UTF8_IS_INVARIANT(c)) {
 856             /* Then it is two-byte encoded */
 857             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 858             c = ASCII_TO_NATIVE(c);
 859         }
 860         *d++ = c;
 861     }
 862     *d = '\0';
 863     *len = d - start;
 864     return (U8 *)start;
 865 }
 866
 867 /*
 868 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 869
 870 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 871 Returns a pointer to the newly-created string, and sets C<len> to
 872 reflect the new length.
 873
 874 If you want to convert to UTF-8 from other encodings than ASCII,
 875 see sv_recode_to_utf8().
 876
 877 =cut
 878 */
 879
 880 U8*
 881 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 882 {
 883     const U8 * const send = s + (*len);
 884     U8 *d;
 885     U8 *dst;
 886
 887     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
 888     PERL_UNUSED_CONTEXT;
 889
 890     Newx(d, (*len) * 2 + 1, U8);
 891     dst = d;
 892
 893     while (s < send) {
 894         const UV uv = NATIVE_TO_ASCII(*s++);
 895         if (UNI_IS_INVARIANT(uv))
 896             *d++ = (U8)UTF_TO_NATIVE(uv);
 897         else {
 898             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 899             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 900         }
 901     }
 902     *d = '\0';
 903     *len = d-dst;
 904     return dst;
 905 }
 906
 907 /*
 908  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 909  *
 910  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 911  * We optimize for native, for obvious reasons. */
 912
 913 U8*
 914 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 915 {
 916     U8* pend;
 917     U8* dstart = d;
 918
 919     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
 920
 921     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 922          d[0] = 0;
 923          *newlen = 1;
 924          return d;
 925     }
 926
 927     if (bytelen & 1)
 928         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
 929
 930     pend = p + bytelen;
 931
 932     while (p < pend) {
 933         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 934         p += 2;
 935         if (uv < 0x80) {
 936 #ifdef EBCDIC
 937             *d++ = UNI_TO_NATIVE(uv);
 938 #else
 939             *d++ = (U8)uv;
 940 #endif
 941             continue;
 942         }
 943         if (uv < 0x800) {
 944             *d++ = (U8)(( uv >>  6)         | 0xc0);
 945             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 946             continue;
 947         }
 948         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 949             UV low = (p[0] << 8) + p[1];
 950             p += 2;
 951             if (low < 0xdc00 || low >= 0xdfff)
 952                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 953             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 954         }
 955         if (uv < 0x10000) {
 956             *d++ = (U8)(( uv >> 12)         | 0xe0);
 957             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 958             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 959             continue;
 960         }
 961         else {
 962             *d++ = (U8)(( uv >> 18)         | 0xf0);
 963             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 964             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 965             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 966             continue;
 967         }
 968     }
 969     *newlen = d - dstart;
 970     return d;
 971 }
 972
 973 /* Note: this one is slightly destructive of the source. */
 974
 975 U8*
 976 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 977 {
 978     U8* s = (U8*)p;
 979     U8* const send = s + bytelen;
 980
 981     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
 982
 983     while (s < send) {
 984         const U8 tmp = s[0];
 985         s[0] = s[1];
 986         s[1] = tmp;
 987         s += 2;
 988     }
 989     return utf16_to_utf8(p, d, bytelen, newlen);
 990 }
 991
 992 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 993
 994 bool
 995 Perl_is_uni_alnum(pTHX_ UV c)
 996 {
 997     U8 tmpbuf[UTF8_MAXBYTES+1];
 998     uvchr_to_utf8(tmpbuf, c);
 999     return is_utf8_alnum(tmpbuf);
1000 }
1001
1002 bool
1003 Perl_is_uni_alnumc(pTHX_ UV c)
1004 {
1005     U8 tmpbuf[UTF8_MAXBYTES+1];
1006     uvchr_to_utf8(tmpbuf, c);
1007     return is_utf8_alnumc(tmpbuf);
1008 }
1009
1010 bool
1011 Perl_is_uni_idfirst(pTHX_ UV c)
1012 {
1013     U8 tmpbuf[UTF8_MAXBYTES+1];
1014     uvchr_to_utf8(tmpbuf, c);
1015     return is_utf8_idfirst(tmpbuf);
1016 }
1017
1018 bool
1019 Perl_is_uni_alpha(pTHX_ UV c)
1020 {
1021     U8 tmpbuf[UTF8_MAXBYTES+1];
1022     uvchr_to_utf8(tmpbuf, c);
1023     return is_utf8_alpha(tmpbuf);
1024 }
1025
1026 bool
1027 Perl_is_uni_ascii(pTHX_ UV c)
1028 {
1029     U8 tmpbuf[UTF8_MAXBYTES+1];
1030     uvchr_to_utf8(tmpbuf, c);
1031     return is_utf8_ascii(tmpbuf);
1032 }
1033
1034 bool
1035 Perl_is_uni_space(pTHX_ UV c)
1036 {
1037     U8 tmpbuf[UTF8_MAXBYTES+1];
1038     uvchr_to_utf8(tmpbuf, c);
1039     return is_utf8_space(tmpbuf);
1040 }
1041
1042 bool
1043 Perl_is_uni_digit(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_digit(tmpbuf);
1048 }
1049
1050 bool
1051 Perl_is_uni_upper(pTHX_ UV c)
1052 {
1053     U8 tmpbuf[UTF8_MAXBYTES+1];
1054     uvchr_to_utf8(tmpbuf, c);
1055     return is_utf8_upper(tmpbuf);
1056 }
1057
1058 bool
1059 Perl_is_uni_lower(pTHX_ UV c)
1060 {
1061     U8 tmpbuf[UTF8_MAXBYTES+1];
1062     uvchr_to_utf8(tmpbuf, c);
1063     return is_utf8_lower(tmpbuf);
1064 }
1065
1066 bool
1067 Perl_is_uni_cntrl(pTHX_ UV c)
1068 {
1069     U8 tmpbuf[UTF8_MAXBYTES+1];
1070     uvchr_to_utf8(tmpbuf, c);
1071     return is_utf8_cntrl(tmpbuf);
1072 }
1073
1074 bool
1075 Perl_is_uni_graph(pTHX_ UV c)
1076 {
1077     U8 tmpbuf[UTF8_MAXBYTES+1];
1078     uvchr_to_utf8(tmpbuf, c);
1079     return is_utf8_graph(tmpbuf);
1080 }
1081
1082 bool
1083 Perl_is_uni_print(pTHX_ UV c)
1084 {
1085     U8 tmpbuf[UTF8_MAXBYTES+1];
1086     uvchr_to_utf8(tmpbuf, c);
1087     return is_utf8_print(tmpbuf);
1088 }
1089
1090 bool
1091 Perl_is_uni_punct(pTHX_ UV c)
1092 {
1093     U8 tmpbuf[UTF8_MAXBYTES+1];
1094     uvchr_to_utf8(tmpbuf, c);
1095     return is_utf8_punct(tmpbuf);
1096 }
1097
1098 bool
1099 Perl_is_uni_xdigit(pTHX_ UV c)
1100 {
1101     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1102     uvchr_to_utf8(tmpbuf, c);
1103     return is_utf8_xdigit(tmpbuf);
1104 }
1105
1106 UV
1107 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1108 {
1109     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1110
1111     uvchr_to_utf8(p, c);
1112     return to_utf8_upper(p, p, lenp);
1113 }
1114
1115 UV
1116 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1117 {
1118     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1119
1120     uvchr_to_utf8(p, c);
1121     return to_utf8_title(p, p, lenp);
1122 }
1123
1124 UV
1125 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1126 {
1127     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1128
1129     uvchr_to_utf8(p, c);
1130     return to_utf8_lower(p, p, lenp);
1131 }
1132
1133 UV
1134 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1135 {
1136     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1137
1138     uvchr_to_utf8(p, c);
1139     return to_utf8_fold(p, p, lenp);
1140 }
1141
1142 /* for now these all assume no locale info available for Unicode > 255 */
1143
1144 bool
1145 Perl_is_uni_alnum_lc(pTHX_ UV c)
1146 {
1147     return is_uni_alnum(c);     /* XXX no locale support yet */
1148 }
1149
1150 bool
1151 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1152 {
1153     return is_uni_alnumc(c);    /* XXX no locale support yet */
1154 }
1155
1156 bool
1157 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1158 {
1159     return is_uni_idfirst(c);   /* XXX no locale support yet */
1160 }
1161
1162 bool
1163 Perl_is_uni_alpha_lc(pTHX_ UV c)
1164 {
1165     return is_uni_alpha(c);     /* XXX no locale support yet */
1166 }
1167
1168 bool
1169 Perl_is_uni_ascii_lc(pTHX_ UV c)
1170 {
1171     return is_uni_ascii(c);     /* XXX no locale support yet */
1172 }
1173
1174 bool
1175 Perl_is_uni_space_lc(pTHX_ UV c)
1176 {
1177     return is_uni_space(c);     /* XXX no locale support yet */
1178 }
1179
1180 bool
1181 Perl_is_uni_digit_lc(pTHX_ UV c)
1182 {
1183     return is_uni_digit(c);     /* XXX no locale support yet */
1184 }
1185
1186 bool
1187 Perl_is_uni_upper_lc(pTHX_ UV c)
1188 {
1189     return is_uni_upper(c);     /* XXX no locale support yet */
1190 }
1191
1192 bool
1193 Perl_is_uni_lower_lc(pTHX_ UV c)
1194 {
1195     return is_uni_lower(c);     /* XXX no locale support yet */
1196 }
1197
1198 bool
1199 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1200 {
1201     return is_uni_cntrl(c);     /* XXX no locale support yet */
1202 }
1203
1204 bool
1205 Perl_is_uni_graph_lc(pTHX_ UV c)
1206 {
1207     return is_uni_graph(c);     /* XXX no locale support yet */
1208 }
1209
1210 bool
1211 Perl_is_uni_print_lc(pTHX_ UV c)
1212 {
1213     return is_uni_print(c);     /* XXX no locale support yet */
1214 }
1215
1216 bool
1217 Perl_is_uni_punct_lc(pTHX_ UV c)
1218 {
1219     return is_uni_punct(c);     /* XXX no locale support yet */
1220 }
1221
1222 bool
1223 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1224 {
1225     return is_uni_xdigit(c);    /* XXX no locale support yet */
1226 }
1227
1228 U32
1229 Perl_to_uni_upper_lc(pTHX_ U32 c)
1230 {
1231     /* XXX returns only the first character -- do not use XXX */
1232     /* XXX no locale support yet */
1233     STRLEN len;
1234     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1235     return (U32)to_uni_upper(c, tmpbuf, &len);
1236 }
1237
1238 U32
1239 Perl_to_uni_title_lc(pTHX_ U32 c)
1240 {
1241     /* XXX returns only the first character XXX -- do not use XXX */
1242     /* XXX no locale support yet */
1243     STRLEN len;
1244     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1245     return (U32)to_uni_title(c, tmpbuf, &len);
1246 }
1247
1248 U32
1249 Perl_to_uni_lower_lc(pTHX_ U32 c)
1250 {
1251     /* XXX returns only the first character -- do not use XXX */
1252     /* XXX no locale support yet */
1253     STRLEN len;
1254     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1255     return (U32)to_uni_lower(c, tmpbuf, &len);
1256 }
1257
1258 static bool
1259 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1260                  const char *const swashname)
1261 {
1262     dVAR;
1263
1264     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1265
1266     if (!is_utf8_char(p))
1267         return FALSE;
1268     if (!*swash)
1269         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1270     return swash_fetch(*swash, p, TRUE) != 0;
1271 }
1272
1273 bool
1274 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1275 {
1276     dVAR;
1277
1278     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1279
1280     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1281      * descendant of isalnum(3), in other words, it doesn't
1282      * contain the '_'. --jhi */
1283     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1284 }
1285
1286 bool
1287 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1288 {
1289     dVAR;
1290
1291     PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1292
1293     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1294 }
1295
1296 bool
1297 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1298 {
1299     dVAR;
1300
1301     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1302
1303     if (*p == '_')
1304         return TRUE;
1305     /* is_utf8_idstart would be more logical. */
1306     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1307 }
1308
1309 bool
1310 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1311 {
1312     dVAR;
1313
1314     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1315
1316     if (*p == '_')
1317         return TRUE;
1318     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1319 }
1320
1321 bool
1322 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1323 {
1324     dVAR;
1325
1326     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1327
1328     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1329 }
1330
1331 bool
1332 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1333 {
1334     dVAR;
1335
1336     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1337
1338     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1339 }
1340
1341 bool
1342 Perl_is_utf8_space(pTHX_ const U8 *p)
1343 {
1344     dVAR;
1345
1346     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1347
1348     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1349 }
1350
1351 bool
1352 Perl_is_utf8_digit(pTHX_ const U8 *p)
1353 {
1354     dVAR;
1355
1356     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1357
1358     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1359 }
1360
1361 bool
1362 Perl_is_utf8_upper(pTHX_ const U8 *p)
1363 {
1364     dVAR;
1365
1366     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1367
1368     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1369 }
1370
1371 bool
1372 Perl_is_utf8_lower(pTHX_ const U8 *p)
1373 {
1374     dVAR;
1375
1376     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1377
1378     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1379 }
1380
1381 bool
1382 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1383 {
1384     dVAR;
1385
1386     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1387
1388     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1389 }
1390
1391 bool
1392 Perl_is_utf8_graph(pTHX_ const U8 *p)
1393 {
1394     dVAR;
1395
1396     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1397
1398     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1399 }
1400
1401 bool
1402 Perl_is_utf8_print(pTHX_ const U8 *p)
1403 {
1404     dVAR;
1405
1406     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1407
1408     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1409 }
1410
1411 bool
1412 Perl_is_utf8_punct(pTHX_ const U8 *p)
1413 {
1414     dVAR;
1415
1416     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1417
1418     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1419 }
1420
1421 bool
1422 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1423 {
1424     dVAR;
1425
1426     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1427
1428     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1429 }
1430
1431 bool
1432 Perl_is_utf8_mark(pTHX_ const U8 *p)
1433 {
1434     dVAR;
1435
1436     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1437
1438     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1439 }
1440
1441 /*
1442 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1443
1444 The "p" contains the pointer to the UTF-8 string encoding
1445 the character that is being converted.
1446
1447 The "ustrp" is a pointer to the character buffer to put the
1448 conversion result to.  The "lenp" is a pointer to the length
1449 of the result.
1450
1451 The "swashp" is a pointer to the swash to use.
1452
1453 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1454 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1455 but not always, a multicharacter mapping), is tried first.
1456
1457 The "special" is a string like "utf8::ToSpecLower", which means the
1458 hash %utf8::ToSpecLower.  The access to the hash is through
1459 Perl_to_utf8_case().
1460
1461 The "normal" is a string like "ToLower" which means the swash
1462 %utf8::ToLower.
1463
1464 =cut */
1465
1466 UV
1467 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1468                         SV **swashp, const char *normal, const char *special)
1469 {
1470     dVAR;
1471     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1472     STRLEN len = 0;
1473     const UV uv0 = utf8_to_uvchr(p, NULL);
1474     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1475      * are necessary in EBCDIC, they are redundant no-ops
1476      * in ASCII-ish platforms, and hopefully optimized away. */
1477     const UV uv1 = NATIVE_TO_UNI(uv0);
1478
1479     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1480
1481     uvuni_to_utf8(tmpbuf, uv1);
1482
1483     if (!*swashp) /* load on-demand */
1484          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1485
1486     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1487     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1488          /* It might be "special" (sometimes, but not always,
1489           * a multicharacter mapping) */
1490          HV * const hv = get_hv(special, FALSE);
1491          SV **svp;
1492
1493          if (hv &&
1494              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1495              (*svp)) {
1496              const char *s;
1497
1498               s = SvPV_const(*svp, len);
1499               if (len == 1)
1500                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1501               else {
1502 #ifdef EBCDIC
1503                    /* If we have EBCDIC we need to remap the characters
1504                     * since any characters in the low 256 are Unicode
1505                     * code points, not EBCDIC. */
1506                    U8 *t = (U8*)s, *tend = t + len, *d;
1507
1508                    d = tmpbuf;
1509                    if (SvUTF8(*svp)) {
1510                         STRLEN tlen = 0;
1511
1512                         while (t < tend) {
1513                              const UV c = utf8_to_uvchr(t, &tlen);
1514                              if (tlen > 0) {
1515                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1516                                   t += tlen;
1517                              }
1518                              else
1519                                   break;
1520                         }
1521                    }
1522                    else {
1523                         while (t < tend) {
1524                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1525                              t++;
1526                         }
1527                    }
1528                    len = d - tmpbuf;
1529                    Copy(tmpbuf, ustrp, len, U8);
1530 #else
1531                    Copy(s, ustrp, len, U8);
1532 #endif
1533               }
1534          }
1535     }
1536
1537     if (!len && *swashp) {
1538         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1539
1540          if (uv2) {
1541               /* It was "normal" (a single character mapping). */
1542               const UV uv3 = UNI_TO_NATIVE(uv2);
1543               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1544          }
1545     }
1546
1547     if (!len) /* Neither: just copy. */
1548          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1549
1550     if (lenp)
1551          *lenp = len;
1552
1553     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1554 }
1555
1556 /*
1557 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1558
1559 Convert the UTF-8 encoded character at p to its uppercase version and
1560 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1561 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1562 the uppercase version may be longer than the original character.
1563
1564 The first character of the uppercased version is returned
1565 (but note, as explained above, that there may be more.)
1566
1567 =cut */
1568
1569 UV
1570 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1571 {
1572     dVAR;
1573
1574     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1575
1576     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1577                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1578 }
1579
1580 /*
1581 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1582
1583 Convert the UTF-8 encoded character at p to its titlecase version and
1584 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1585 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1586 titlecase version may be longer than the original character.
1587
1588 The first character of the titlecased version is returned
1589 (but note, as explained above, that there may be more.)
1590
1591 =cut */
1592
1593 UV
1594 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1595 {
1596     dVAR;
1597
1598     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1599
1600     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1601                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1602 }
1603
1604 /*
1605 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1606
1607 Convert the UTF-8 encoded character at p to its lowercase version and
1608 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1609 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1610 lowercase version may be longer than the original character.
1611
1612 The first character of the lowercased version is returned
1613 (but note, as explained above, that there may be more.)
1614
1615 =cut */
1616
1617 UV
1618 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1619 {
1620     dVAR;
1621
1622     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1623
1624     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1625                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1626 }
1627
1628 /*
1629 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1630
1631 Convert the UTF-8 encoded character at p to its foldcase version and
1632 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1633 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1634 foldcase version may be longer than the original character (up to
1635 three characters).
1636
1637 The first character of the foldcased version is returned
1638 (but note, as explained above, that there may be more.)
1639
1640 =cut */
1641
1642 UV
1643 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1644 {
1645     dVAR;
1646
1647     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1648
1649     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1650                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1651 }
1652
1653 /* Note:
1654  * A "swash" is a swatch hash.
1655  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1656  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1657  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1658  */
1659 SV*
1660 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1661 {
1662     dVAR;
1663     SV* retval;
1664     dSP;
1665     const size_t pkg_len = strlen(pkg);
1666     const size_t name_len = strlen(name);
1667     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1668     SV* errsv_save;
1669
1670     PERL_ARGS_ASSERT_SWASH_INIT;
1671
1672     PUSHSTACKi(PERLSI_MAGIC);
1673     ENTER;
1674     SAVEI32(PL_hints);
1675     PL_hints = 0;
1676     save_re_context();
1677     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1678         ENTER;
1679         errsv_save = newSVsv(ERRSV);
1680         /* It is assumed that callers of this routine are not passing in any
1681            user derived data.  */
1682         /* Need to do this after save_re_context() as it will set PL_tainted to
1683            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1684            Even line to create errsv_save can turn on PL_tainted.  */
1685         SAVEBOOL(PL_tainted);
1686         PL_tainted = 0;
1687         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1688                          NULL);
1689         if (!SvTRUE(ERRSV))
1690             sv_setsv(ERRSV, errsv_save);
1691         SvREFCNT_dec(errsv_save);
1692         LEAVE;
1693     }
1694     SPAGAIN;
1695     PUSHMARK(SP);
1696     EXTEND(SP,5);
1697     mPUSHp(pkg, pkg_len);
1698     mPUSHp(name, name_len);
1699     PUSHs(listsv);
1700     mPUSHi(minbits);
1701     mPUSHi(none);
1702     PUTBACK;
1703     errsv_save = newSVsv(ERRSV);
1704     if (call_method("SWASHNEW", G_SCALAR))
1705         retval = newSVsv(*PL_stack_sp--);
1706     else
1707         retval = &PL_sv_undef;
1708     if (!SvTRUE(ERRSV))
1709         sv_setsv(ERRSV, errsv_save);
1710     SvREFCNT_dec(errsv_save);
1711     LEAVE;
1712     POPSTACK;
1713     if (IN_PERL_COMPILETIME) {
1714         CopHINTS_set(PL_curcop, PL_hints);
1715     }
1716     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1717         if (SvPOK(retval))
1718             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1719                        SVfARG(retval));
1720         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1721     }
1722     return retval;
1723 }
1724
1725
1726 /* This API is wrong for special case conversions since we may need to
1727  * return several Unicode characters for a single Unicode character
1728  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1729  * the lower-level routine, and it is similarly broken for returning
1730  * multiple values.  --jhi */
1731 /* Now SWASHGET is recasted into S_swash_get in this file. */
1732
1733 /* Note:
1734  * Returns the value of property/mapping C<swash> for the first character
1735  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1736  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1737  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1738  */
1739 UV
1740 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1741 {
1742     dVAR;
1743     HV* const hv = (HV*)SvRV(swash);
1744     U32 klen;
1745     U32 off;
1746     STRLEN slen;
1747     STRLEN needents;
1748     const U8 *tmps = NULL;
1749     U32 bit;
1750     SV *swatch;
1751     U8 tmputf8[2];
1752     const UV c = NATIVE_TO_ASCII(*ptr);
1753
1754     PERL_ARGS_ASSERT_SWASH_FETCH;
1755
1756     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1757         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1758         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1759         ptr = tmputf8;
1760     }
1761     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1762      * then the "swatch" is a vec() for al the chars which start
1763      * with 0xAA..0xYY
1764      * So the key in the hash (klen) is length of encoded char -1
1765      */
1766     klen = UTF8SKIP(ptr) - 1;
1767     off  = ptr[klen];
1768
1769     if (klen == 0) {
1770       /* If char in invariant then swatch is for all the invariant chars
1771        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1772        */
1773         needents = UTF_CONTINUATION_MARK;
1774         off      = NATIVE_TO_UTF(ptr[klen]);
1775     }
1776     else {
1777       /* If char is encoded then swatch is for the prefix */
1778         needents = (1 << UTF_ACCUMULATION_SHIFT);
1779         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1780     }
1781
1782     /*
1783      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1784      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1785      * it's nothing to sniff at.)  Pity we usually come through at least
1786      * two function calls to get here...
1787      *
1788      * NB: this code assumes that swatches are never modified, once generated!
1789      */
1790
1791     if (hv   == PL_last_swash_hv &&
1792         klen == PL_last_swash_klen &&
1793         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1794     {
1795         tmps = PL_last_swash_tmps;
1796         slen = PL_last_swash_slen;
1797     }
1798     else {
1799         /* Try our second-level swatch cache, kept in a hash. */
1800         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1801
1802         /* If not cached, generate it via swash_get */
1803         if (!svp || !SvPOK(*svp)
1804                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1805             /* We use utf8n_to_uvuni() as we want an index into
1806                Unicode tables, not a native character number.
1807              */
1808             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1809                                            ckWARN(WARN_UTF8) ?
1810                                            0 : UTF8_ALLOW_ANY);
1811             swatch = swash_get(swash,
1812                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1813                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1814                                 needents);
1815
1816             if (IN_PERL_COMPILETIME)
1817                 CopHINTS_set(PL_curcop, PL_hints);
1818
1819             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1820
1821             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1822                      || (slen << 3) < needents)
1823                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1824         }
1825
1826         PL_last_swash_hv = hv;
1827         assert(klen <= sizeof(PL_last_swash_key));
1828         PL_last_swash_klen = (U8)klen;
1829         /* FIXME change interpvar.h?  */
1830         PL_last_swash_tmps = (U8 *) tmps;
1831         PL_last_swash_slen = slen;
1832         if (klen)
1833             Copy(ptr, PL_last_swash_key, klen, U8);
1834     }
1835
1836     switch ((int)((slen << 3) / needents)) {
1837     case 1:
1838         bit = 1 << (off & 7);
1839         off >>= 3;
1840         return (tmps[off] & bit) != 0;
1841     case 8:
1842         return tmps[off];
1843     case 16:
1844         off <<= 1;
1845         return (tmps[off] << 8) + tmps[off + 1] ;
1846     case 32:
1847         off <<= 2;
1848         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1849     }
1850     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1851     NORETURN_FUNCTION_END;
1852 }
1853
1854 /* Note:
1855  * Returns a swatch (a bit vector string) for a code point sequence
1856  * that starts from the value C<start> and comprises the number C<span>.
1857  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1858  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1859  */
1860 STATIC SV*
1861 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1862 {
1863     SV *swatch;
1864     U8 *l, *lend, *x, *xend, *s;
1865     STRLEN lcur, xcur, scur;
1866     HV* const hv = (HV*)SvRV(swash);
1867     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1868     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1869     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1870     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1871     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1872     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1873     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1874     const STRLEN bits  = SvUV(*bitssvp);
1875     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1876     const UV     none  = SvUV(*nonesvp);
1877     const UV     end   = start + span;
1878
1879     PERL_ARGS_ASSERT_SWASH_GET;
1880
1881     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1882         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1883                                                  (UV)bits);
1884     }
1885
1886     /* create and initialize $swatch */
1887     scur   = octets ? (span * octets) : (span + 7) / 8;
1888     swatch = newSV(scur);
1889     SvPOK_on(swatch);
1890     s = (U8*)SvPVX(swatch);
1891     if (octets && none) {
1892         const U8* const e = s + scur;
1893         while (s < e) {
1894             if (bits == 8)
1895                 *s++ = (U8)(none & 0xff);
1896             else if (bits == 16) {
1897                 *s++ = (U8)((none >>  8) & 0xff);
1898                 *s++ = (U8)( none        & 0xff);
1899             }
1900             else if (bits == 32) {
1901                 *s++ = (U8)((none >> 24) & 0xff);
1902                 *s++ = (U8)((none >> 16) & 0xff);
1903                 *s++ = (U8)((none >>  8) & 0xff);
1904                 *s++ = (U8)( none        & 0xff);
1905             }
1906         }
1907         *s = '\0';
1908     }
1909     else {
1910         (void)memzero((U8*)s, scur + 1);
1911     }
1912     SvCUR_set(swatch, scur);
1913     s = (U8*)SvPVX(swatch);
1914
1915     /* read $swash->{LIST} */
1916     l = (U8*)SvPV(*listsvp, lcur);
1917     lend = l + lcur;
1918     while (l < lend) {
1919         UV min, max, val;
1920         STRLEN numlen;
1921         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1922
1923         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1924
1925         numlen = lend - l;
1926         min = grok_hex((char *)l, &numlen, &flags, NULL);
1927         if (numlen)
1928             l += numlen;
1929         else if (nl) {
1930             l = nl + 1; /* 1 is length of "\n" */
1931             continue;
1932         }
1933         else {
1934             l = lend; /* to LIST's end at which \n is not found */
1935             break;
1936         }
1937
1938         if (isBLANK(*l)) {
1939             ++l;
1940             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1941             numlen = lend - l;
1942             max = grok_hex((char *)l, &numlen, &flags, NULL);
1943             if (numlen)
1944                 l += numlen;
1945             else
1946                 max = min;
1947
1948             if (octets) {
1949                 if (isBLANK(*l)) {
1950                     ++l;
1951                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1952                             PERL_SCAN_DISALLOW_PREFIX;
1953                     numlen = lend - l;
1954                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1955                     if (numlen)
1956                         l += numlen;
1957                     else
1958                         val = 0;
1959                 }
1960                 else {
1961                     val = 0;
1962                     if (typeto) {
1963                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1964                                          typestr, l);
1965                     }
1966                 }
1967             }
1968             else
1969                 val = 0; /* bits == 1, then val should be ignored */
1970         }
1971         else {
1972             max = min;
1973             if (octets) {
1974                 val = 0;
1975                 if (typeto) {
1976                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1977                 }
1978             }
1979             else
1980                 val = 0; /* bits == 1, then val should be ignored */
1981         }
1982
1983         if (nl)
1984             l = nl + 1;
1985         else
1986             l = lend;
1987
1988         if (max < start)
1989             continue;
1990
1991         if (octets) {
1992             UV key;
1993             if (min < start) {
1994                 if (!none || val < none) {
1995                     val += start - min;
1996                 }
1997                 min = start;
1998             }
1999             for (key = min; key <= max; key++) {
2000                 STRLEN offset;
2001                 if (key >= end)
2002                     goto go_out_list;
2003                 /* offset must be non-negative (start <= min <= key < end) */
2004                 offset = octets * (key - start);
2005                 if (bits == 8)
2006                     s[offset] = (U8)(val & 0xff);
2007                 else if (bits == 16) {
2008                     s[offset    ] = (U8)((val >>  8) & 0xff);
2009                     s[offset + 1] = (U8)( val        & 0xff);
2010                 }
2011                 else if (bits == 32) {
2012                     s[offset    ] = (U8)((val >> 24) & 0xff);
2013                     s[offset + 1] = (U8)((val >> 16) & 0xff);
2014                     s[offset + 2] = (U8)((val >>  8) & 0xff);
2015                     s[offset + 3] = (U8)( val        & 0xff);
2016                 }
2017
2018                 if (!none || val < none)
2019                     ++val;
2020             }
2021         }
2022         else { /* bits == 1, then val should be ignored */
2023             UV key;
2024             if (min < start)
2025                 min = start;
2026             for (key = min; key <= max; key++) {
2027                 const STRLEN offset = (STRLEN)(key - start);
2028                 if (key >= end)
2029                     goto go_out_list;
2030                 s[offset >> 3] |= 1 << (offset & 7);
2031             }
2032         }
2033     } /* while */
2034   go_out_list:
2035
2036     /* read $swash->{EXTRAS} */
2037     x = (U8*)SvPV(*extssvp, xcur);
2038     xend = x + xcur;
2039     while (x < xend) {
2040         STRLEN namelen;
2041         U8 *namestr;
2042         SV** othersvp;
2043         HV* otherhv;
2044         STRLEN otherbits;
2045         SV **otherbitssvp, *other;
2046         U8 *s, *o, *nl;
2047         STRLEN slen, olen;
2048
2049         const U8 opc = *x++;
2050         if (opc == '\n')
2051             continue;
2052
2053         nl = (U8*)memchr(x, '\n', xend - x);
2054
2055         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2056             if (nl) {
2057                 x = nl + 1; /* 1 is length of "\n" */
2058                 continue;
2059             }
2060             else {
2061                 x = xend; /* to EXTRAS' end at which \n is not found */
2062                 break;
2063             }
2064         }
2065
2066         namestr = x;
2067         if (nl) {
2068             namelen = nl - namestr;
2069             x = nl + 1;
2070         }
2071         else {
2072             namelen = xend - namestr;
2073             x = xend;
2074         }
2075
2076         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2077         otherhv = (HV*)SvRV(*othersvp);
2078         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2079         otherbits = (STRLEN)SvUV(*otherbitssvp);
2080         if (bits < otherbits)
2081             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2082
2083         /* The "other" swatch must be destroyed after. */
2084         other = swash_get(*othersvp, start, span);
2085         o = (U8*)SvPV(other, olen);
2086
2087         if (!olen)
2088             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2089
2090         s = (U8*)SvPV(swatch, slen);
2091         if (bits == 1 && otherbits == 1) {
2092             if (slen != olen)
2093                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2094
2095             switch (opc) {
2096             case '+':
2097                 while (slen--)
2098                     *s++ |= *o++;
2099                 break;
2100             case '!':
2101                 while (slen--)
2102                     *s++ |= ~*o++;
2103                 break;
2104             case '-':
2105                 while (slen--)
2106                     *s++ &= ~*o++;
2107                 break;
2108             case '&':
2109                 while (slen--)
2110                     *s++ &= *o++;
2111                 break;
2112             default:
2113                 break;
2114             }
2115         }
2116         else {
2117             STRLEN otheroctets = otherbits >> 3;
2118             STRLEN offset = 0;
2119             U8* const send = s + slen;
2120
2121             while (s < send) {
2122                 UV otherval = 0;
2123
2124                 if (otherbits == 1) {
2125                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2126                     ++offset;
2127                 }
2128                 else {
2129                     STRLEN vlen = otheroctets;
2130                     otherval = *o++;
2131                     while (--vlen) {
2132                         otherval <<= 8;
2133                         otherval |= *o++;
2134                     }
2135                 }
2136
2137                 if (opc == '+' && otherval)
2138                     NOOP;   /* replace with otherval */
2139                 else if (opc == '!' && !otherval)
2140                     otherval = 1;
2141                 else if (opc == '-' && otherval)
2142                     otherval = 0;
2143                 else if (opc == '&' && !otherval)
2144                     otherval = 0;
2145                 else {
2146                     s += octets; /* no replacement */
2147                     continue;
2148                 }
2149
2150                 if (bits == 8)
2151                     *s++ = (U8)( otherval & 0xff);
2152                 else if (bits == 16) {
2153                     *s++ = (U8)((otherval >>  8) & 0xff);
2154                     *s++ = (U8)( otherval        & 0xff);
2155                 }
2156                 else if (bits == 32) {
2157                     *s++ = (U8)((otherval >> 24) & 0xff);
2158                     *s++ = (U8)((otherval >> 16) & 0xff);
2159                     *s++ = (U8)((otherval >>  8) & 0xff);
2160                     *s++ = (U8)( otherval        & 0xff);
2161                 }
2162             }
2163         }
2164         sv_free(other); /* through with it! */
2165     } /* while */
2166     return swatch;
2167 }
2168
2169 /*
2170 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2171
2172 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2173 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2174 bytes available. The return value is the pointer to the byte after the
2175 end of the new character. In other words,
2176
2177     d = uvchr_to_utf8(d, uv);
2178
2179 is the recommended wide native character-aware way of saying
2180
2181     *(d++) = uv;
2182
2183 =cut
2184 */
2185
2186 /* On ASCII machines this is normally a macro but we want a
2187    real function in case XS code wants it
2188 */
2189 U8 *
2190 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2191 {
2192     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2193
2194     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2195 }
2196
2197 U8 *
2198 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2199 {
2200     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2201
2202     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2203 }
2204
2205 /*
2206 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2207 flags
2208
2209 Returns the native character value of the first character in the string
2210 C<s>
2211 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2212 length, in bytes, of that character.
2213
2214 Allows length and flags to be passed to low level routine.
2215
2216 =cut
2217 */
2218 /* On ASCII machines this is normally a macro but we want
2219    a real function in case XS code wants it
2220 */
2221 UV
2222 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2223 U32 flags)
2224 {
2225     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2226
2227     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2228
2229     return UNI_TO_NATIVE(uv);
2230 }
2231
2232 /*
2233 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2234
2235 Build to the scalar dsv a displayable version of the string spv,
2236 length len, the displayable version being at most pvlim bytes long
2237 (if longer, the rest is truncated and "..." will be appended).
2238
2239 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2240 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2241 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2242 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2243 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2244 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2245
2246 The pointer to the PV of the dsv is returned.
2247
2248 =cut */
2249 char *
2250 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2251 {
2252     int truncated = 0;
2253     const char *s, *e;
2254
2255     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2256
2257     sv_setpvn(dsv, "", 0);
2258     SvUTF8_off(dsv);
2259     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2260          UV u;
2261           /* This serves double duty as a flag and a character to print after
2262              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2263           */
2264          char ok = 0;
2265
2266          if (pvlim && SvCUR(dsv) >= pvlim) {
2267               truncated++;
2268               break;
2269          }
2270          u = utf8_to_uvchr((U8*)s, 0);
2271          if (u < 256) {
2272              const unsigned char c = (unsigned char)u & 0xFF;
2273              if (flags & UNI_DISPLAY_BACKSLASH) {
2274                  switch (c) {
2275                  case '\n':
2276                      ok = 'n'; break;
2277                  case '\r':
2278                      ok = 'r'; break;
2279                  case '\t':
2280                      ok = 't'; break;
2281                  case '\f':
2282                      ok = 'f'; break;
2283                  case '\a':
2284                      ok = 'a'; break;
2285                  case '\\':
2286                      ok = '\\'; break;
2287                  default: break;
2288                  }
2289                  if (ok) {
2290                      const char string = ok;
2291                      sv_catpvn(dsv, &string, 1);
2292                  }
2293              }
2294              /* isPRINT() is the locale-blind version. */
2295              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2296                  const char string = c;
2297                  sv_catpvn(dsv, &string, 1);
2298                  ok = 1;
2299              }
2300          }
2301          if (!ok)
2302              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2303     }
2304     if (truncated)
2305          sv_catpvs(dsv, "...");
2306
2307     return SvPVX(dsv);
2308 }
2309
2310 /*
2311 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2312
2313 Build to the scalar dsv a displayable version of the scalar sv,
2314 the displayable version being at most pvlim bytes long
2315 (if longer, the rest is truncated and "..." will be appended).
2316
2317 The flags argument is as in pv_uni_display().
2318
2319 The pointer to the PV of the dsv is returned.
2320
2321 =cut
2322 */
2323 char *
2324 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2325 {
2326     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2327
2328      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2329                                 SvCUR(ssv), pvlim, flags);
2330 }
2331
2332 /*
2333 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2334
2335 Return true if the strings s1 and s2 differ case-insensitively, false
2336 if not (if they are equal case-insensitively).  If u1 is true, the
2337 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2338 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2339 are false, the respective string is assumed to be in native 8-bit
2340 encoding.
2341
2342 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2343 in there (they will point at the beginning of the I<next> character).
2344 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2345 pointers beyond which scanning will not continue under any
2346 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2347 s2+l2 will be used as goal end pointers that will also stop the scan,
2348 and which qualify towards defining a successful match: all the scans
2349 that define an explicit length must reach their goal pointers for
2350 a match to succeed).
2351
2352 For case-insensitiveness, the "casefolding" of Unicode is used
2353 instead of upper/lowercasing both the characters, see
2354 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2355
2356 =cut */
2357 I32
2358 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2359 {
2360      dVAR;
2361      register const U8 *p1  = (const U8*)s1;
2362      register const U8 *p2  = (const U8*)s2;
2363      register const U8 *f1 = NULL;
2364      register const U8 *f2 = NULL;
2365      register U8 *e1 = NULL;
2366      register U8 *q1 = NULL;
2367      register U8 *e2 = NULL;
2368      register U8 *q2 = NULL;
2369      STRLEN n1 = 0, n2 = 0;
2370      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2371      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2372      U8 natbuf[1+1];
2373      STRLEN foldlen1, foldlen2;
2374      bool match;
2375
2376      PERL_ARGS_ASSERT_IBCMP_UTF8;
2377
2378      if (pe1)
2379           e1 = *(U8**)pe1;
2380      /* assert(e1 || l1); */
2381      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2382           f1 = (const U8*)s1 + l1;
2383      if (pe2)
2384           e2 = *(U8**)pe2;
2385      /* assert(e2 || l2); */
2386      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2387           f2 = (const U8*)s2 + l2;
2388
2389      /* This shouldn't happen. However, putting an assert() there makes some
2390       * tests fail. */
2391      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2392      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2393           return 1; /* mismatch; possible infinite loop or false positive */
2394
2395      if (!u1 || !u2)
2396           natbuf[1] = 0; /* Need to terminate the buffer. */
2397
2398      while ((e1 == 0 || p1 < e1) &&
2399             (f1 == 0 || p1 < f1) &&
2400             (e2 == 0 || p2 < e2) &&
2401             (f2 == 0 || p2 < f2)) {
2402           if (n1 == 0) {
2403                if (u1)
2404                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2405                else {
2406                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2407                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2408                }
2409                q1 = foldbuf1;
2410                n1 = foldlen1;
2411           }
2412           if (n2 == 0) {
2413                if (u2)
2414                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2415                else {
2416                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2417                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2418                }
2419                q2 = foldbuf2;
2420                n2 = foldlen2;
2421           }
2422           while (n1 && n2) {
2423                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2424                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2425                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2426                    return 1; /* mismatch */
2427                n1 -= UTF8SKIP(q1);
2428                q1 += UTF8SKIP(q1);
2429                n2 -= UTF8SKIP(q2);
2430                q2 += UTF8SKIP(q2);
2431           }
2432           if (n1 == 0)
2433                p1 += u1 ? UTF8SKIP(p1) : 1;
2434           if (n2 == 0)
2435                p2 += u2 ? UTF8SKIP(p2) : 1;
2436
2437      }
2438
2439      /* A match is defined by all the scans that specified
2440       * an explicit length reaching their final goals. */
2441      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2442
2443      if (match) {
2444           if (pe1)
2445                *pe1 = (char*)p1;
2446           if (pe2)
2447                *pe2 = (char*)p2;
2448      }
2449
2450      return match ? 0 : 1; /* 0 match, 1 mismatch */
2451 }
2452
2453 /*
2454  * Local variables:
2455  * c-indentation-style: bsd
2456  * c-basic-offset: 4
2457  * indent-tabs-mode: t
2458  * End:
2459  *
2460  * ex: set ts=8 sts=4 sw=4 noet:
2461  */