utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34
  35 #ifndef EBCDIC
  36 /* Separate prototypes needed because in ASCII systems these
  37  * usually macros but they still are compiled as code, too. */
  38 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  39 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  40 #endif
  41
  42 static const char unees[] =
  43     "Malformed UTF-8 character (unexpected end of string)";
  44
  45 /*
  46 =head1 Unicode Support
  47
  48 This file contains various utility functions for manipulating UTF8-encoded
  49 strings. For the uninitiated, this is a method of representing arbitrary
  50 Unicode characters as a variable number of bytes, in such a way that
  51 characters in the ASCII range are unmodified, and a zero byte never appears
  52 within non-zero characters.
  53
  54 =for apidoc uvuni_to_utf8_flags
  55
  56 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  57 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  58 bytes available. The return value is the pointer to the byte after the
  59 end of the new character. In other words,
  60
  61     d = uvuni_to_utf8_flags(d, uv, flags);
  62
  63 or, in most cases,
  64
  65     d = uvuni_to_utf8(d, uv);
  66
  67 (which is equivalent to)
  68
  69     d = uvuni_to_utf8_flags(d, uv, 0);
  70
  71 is the recommended Unicode-aware way of saying
  72
  73     *(d++) = uv;
  74
  75 =cut
  76 */
  77
  78 U8 *
  79 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  80 {
  81     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
  82
  83     if (ckWARN(WARN_UTF8)) {
  84          if (UNICODE_IS_SURROGATE(uv) &&
  85              !(flags & UNICODE_ALLOW_SURROGATE))
  86               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  87          else if (
  88                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  89                     !(flags & UNICODE_ALLOW_FDD0))
  90                    ||
  91                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  92                     !(flags & UNICODE_ALLOW_FFFF))) &&
  93                   /* UNICODE_ALLOW_SUPER includes
  94                    * FFFEs and FFFFs beyond 0x10FFFF. */
  95                   ((uv <= PERL_UNICODE_MAX) ||
  96                    !(flags & UNICODE_ALLOW_SUPER))
  97                   )
  98               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  99                          "Unicode character 0x%04"UVxf" is illegal", uv);
 100     }
 101     if (UNI_IS_INVARIANT(uv)) {
 102         *d++ = (U8)UTF_TO_NATIVE(uv);
 103         return d;
 104     }
 105 #if defined(EBCDIC)
 106     else {
 107         STRLEN len  = UNISKIP(uv);
 108         U8 *p = d+len-1;
 109         while (p > d) {
 110             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 111             uv >>= UTF_ACCUMULATION_SHIFT;
 112         }
 113         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 114         return d+len;
 115     }
 116 #else /* Non loop style */
 117     if (uv < 0x800) {
 118         *d++ = (U8)(( uv >>  6)         | 0xc0);
 119         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 120         return d;
 121     }
 122     if (uv < 0x10000) {
 123         *d++ = (U8)(( uv >> 12)         | 0xe0);
 124         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 125         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 126         return d;
 127     }
 128     if (uv < 0x200000) {
 129         *d++ = (U8)(( uv >> 18)         | 0xf0);
 130         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 132         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 133         return d;
 134     }
 135     if (uv < 0x4000000) {
 136         *d++ = (U8)(( uv >> 24)         | 0xf8);
 137         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 138         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 139         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 140         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 141         return d;
 142     }
 143     if (uv < 0x80000000) {
 144         *d++ = (U8)(( uv >> 30)         | 0xfc);
 145         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 146         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 147         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 148         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 149         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 150         return d;
 151     }
 152 #ifdef HAS_QUAD
 153     if (uv < UTF8_QUAD_MAX)
 154 #endif
 155     {
 156         *d++ =                            0xfe; /* Can't match U+FEFF! */
 157         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 162         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 163         return d;
 164     }
 165 #ifdef HAS_QUAD
 166     {
 167         *d++ =                            0xff;         /* Can't match U+FFFE! */
 168         *d++ =                            0x80;         /* 6 Reserved bits */
 169         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 170         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 171         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 172         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 173         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 174         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 175         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 176         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 177         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 178         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 179         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 180         return d;
 181     }
 182 #endif
 183 #endif /* Loop style */
 184 }
 185
 186 /*
 187
 188 Tests if some arbitrary number of bytes begins in a valid UTF-8
 189 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 190 UTF-8 character.  The actual number of bytes in the UTF-8 character
 191 will be returned if it is valid, otherwise 0.
 192
 193 This is the "slow" version as opposed to the "fast" version which is
 194 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 195 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 196 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 197 you should use the _slow().  In practice this means that the _slow()
 198 will be used very rarely, since the maximum Unicode code point (as of
 199 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 200 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 201 five bytes or more.
 202
 203 =cut */
 204 STATIC STRLEN
 205 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 206 {
 207     U8 u = *s;
 208     STRLEN slen;
 209     UV uv, ouv;
 210
 211     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 212
 213     if (UTF8_IS_INVARIANT(u))
 214         return 1;
 215
 216     if (!UTF8_IS_START(u))
 217         return 0;
 218
 219     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 220         return 0;
 221
 222     slen = len - 1;
 223     s++;
 224 #ifdef EBCDIC
 225     u = NATIVE_TO_UTF(u);
 226 #endif
 227     u &= UTF_START_MASK(len);
 228     uv  = u;
 229     ouv = uv;
 230     while (slen--) {
 231         if (!UTF8_IS_CONTINUATION(*s))
 232             return 0;
 233         uv = UTF8_ACCUMULATE(uv, *s);
 234         if (uv < ouv)
 235             return 0;
 236         ouv = uv;
 237         s++;
 238     }
 239
 240     if ((STRLEN)UNISKIP(uv) < len)
 241         return 0;
 242
 243     return len;
 244 }
 245
 246 /*
 247 =for apidoc is_utf8_char
 248
 249 Tests if some arbitrary number of bytes begins in a valid UTF-8
 250 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 251 UTF-8 character.  The actual number of bytes in the UTF-8 character
 252 will be returned if it is valid, otherwise 0.
 253
 254 =cut */
 255 STRLEN
 256 Perl_is_utf8_char(pTHX_ const U8 *s)
 257 {
 258     const STRLEN len = UTF8SKIP(s);
 259
 260     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 261     PERL_UNUSED_CONTEXT;
 262 #ifdef IS_UTF8_CHAR
 263     if (IS_UTF8_CHAR_FAST(len))
 264         return IS_UTF8_CHAR(s, len) ? len : 0;
 265 #endif /* #ifdef IS_UTF8_CHAR */
 266     return is_utf8_char_slow(s, len);
 267 }
 268
 269 /*
 270 =for apidoc is_utf8_string
 271
 272 Returns true if first C<len> bytes of the given string form a valid
 273 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 274 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 275 because a valid ASCII string is a valid UTF-8 string.
 276
 277 See also is_utf8_string_loclen() and is_utf8_string_loc().
 278
 279 =cut
 280 */
 281
 282 bool
 283 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 284 {
 285     const U8* const send = s + (len ? len : strlen((const char *)s));
 286     const U8* x = s;
 287
 288     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 289     PERL_UNUSED_CONTEXT;
 290
 291     while (x < send) {
 292         STRLEN c;
 293          /* Inline the easy bits of is_utf8_char() here for speed... */
 294          if (UTF8_IS_INVARIANT(*x))
 295               c = 1;
 296          else if (!UTF8_IS_START(*x))
 297              goto out;
 298          else {
 299               /* ... and call is_utf8_char() only if really needed. */
 300 #ifdef IS_UTF8_CHAR
 301              c = UTF8SKIP(x);
 302              if (IS_UTF8_CHAR_FAST(c)) {
 303                  if (!IS_UTF8_CHAR(x, c))
 304                      c = 0;
 305              }
 306              else
 307                 c = is_utf8_char_slow(x, c);
 308 #else
 309              c = is_utf8_char(x);
 310 #endif /* #ifdef IS_UTF8_CHAR */
 311               if (!c)
 312                   goto out;
 313          }
 314         x += c;
 315     }
 316
 317  out:
 318     if (x != send)
 319         return FALSE;
 320
 321     return TRUE;
 322 }
 323
 324 /*
 325 Implemented as a macro in utf8.h
 326
 327 =for apidoc is_utf8_string_loc
 328
 329 Like is_utf8_string() but stores the location of the failure (in the
 330 case of "utf8ness failure") or the location s+len (in the case of
 331 "utf8ness success") in the C<ep>.
 332
 333 See also is_utf8_string_loclen() and is_utf8_string().
 334
 335 =for apidoc is_utf8_string_loclen
 336
 337 Like is_utf8_string() but stores the location of the failure (in the
 338 case of "utf8ness failure") or the location s+len (in the case of
 339 "utf8ness success") in the C<ep>, and the number of UTF-8
 340 encoded characters in the C<el>.
 341
 342 See also is_utf8_string_loc() and is_utf8_string().
 343
 344 =cut
 345 */
 346
 347 bool
 348 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 349 {
 350     const U8* const send = s + (len ? len : strlen((const char *)s));
 351     const U8* x = s;
 352     STRLEN c;
 353     STRLEN outlen = 0;
 354
 355     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 356     PERL_UNUSED_CONTEXT;
 357
 358     while (x < send) {
 359          /* Inline the easy bits of is_utf8_char() here for speed... */
 360          if (UTF8_IS_INVARIANT(*x))
 361              c = 1;
 362          else if (!UTF8_IS_START(*x))
 363              goto out;
 364          else {
 365              /* ... and call is_utf8_char() only if really needed. */
 366 #ifdef IS_UTF8_CHAR
 367              c = UTF8SKIP(x);
 368              if (IS_UTF8_CHAR_FAST(c)) {
 369                  if (!IS_UTF8_CHAR(x, c))
 370                      c = 0;
 371              } else
 372                  c = is_utf8_char_slow(x, c);
 373 #else
 374              c = is_utf8_char(x);
 375 #endif /* #ifdef IS_UTF8_CHAR */
 376              if (!c)
 377                  goto out;
 378          }
 379          x += c;
 380          outlen++;
 381     }
 382
 383  out:
 384     if (el)
 385         *el = outlen;
 386
 387     if (ep)
 388         *ep = x;
 389     return (x == send);
 390 }
 391
 392 /*
 393
 394 =for apidoc utf8n_to_uvuni
 395
 396 Bottom level UTF-8 decode routine.
 397 Returns the Unicode code point value of the first character in the string C<s>
 398 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 399 C<retlen> will be set to the length, in bytes, of that character.
 400
 401 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 402 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 403 it is assumed that the caller will raise a warning, and this function
 404 will silently just set C<retlen> to C<-1> and return zero.  If the
 405 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 406 malformations will be given, C<retlen> will be set to the expected
 407 length of the UTF-8 character in bytes, and zero will be returned.
 408
 409 The C<flags> can also contain various flags to allow deviations from
 410 the strict UTF-8 encoding (see F<utf8.h>).
 411
 412 Most code should use utf8_to_uvchr() rather than call this directly.
 413
 414 =cut
 415 */
 416
 417 UV
 418 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 419 {
 420     dVAR;
 421     const U8 * const s0 = s;
 422     UV uv = *s, ouv = 0;
 423     STRLEN len = 1;
 424     const bool dowarn = ckWARN_d(WARN_UTF8);
 425     const UV startbyte = *s;
 426     STRLEN expectlen = 0;
 427     U32 warning = 0;
 428
 429     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 430
 431 /* This list is a superset of the UTF8_ALLOW_XXX. */
 432
 433 #define UTF8_WARN_EMPTY                          1
 434 #define UTF8_WARN_CONTINUATION                   2
 435 #define UTF8_WARN_NON_CONTINUATION               3
 436 #define UTF8_WARN_FE_FF                          4
 437 #define UTF8_WARN_SHORT                          5
 438 #define UTF8_WARN_OVERFLOW                       6
 439 #define UTF8_WARN_SURROGATE                      7
 440 #define UTF8_WARN_LONG                           8
 441 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 442
 443     if (curlen == 0 &&
 444         !(flags & UTF8_ALLOW_EMPTY)) {
 445         warning = UTF8_WARN_EMPTY;
 446         goto malformed;
 447     }
 448
 449     if (UTF8_IS_INVARIANT(uv)) {
 450         if (retlen)
 451             *retlen = 1;
 452         return (UV) (NATIVE_TO_UTF(*s));
 453     }
 454
 455     if (UTF8_IS_CONTINUATION(uv) &&
 456         !(flags & UTF8_ALLOW_CONTINUATION)) {
 457         warning = UTF8_WARN_CONTINUATION;
 458         goto malformed;
 459     }
 460
 461     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 462         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 463         warning = UTF8_WARN_NON_CONTINUATION;
 464         goto malformed;
 465     }
 466
 467 #ifdef EBCDIC
 468     uv = NATIVE_TO_UTF(uv);
 469 #else
 470     if ((uv == 0xfe || uv == 0xff) &&
 471         !(flags & UTF8_ALLOW_FE_FF)) {
 472         warning = UTF8_WARN_FE_FF;
 473         goto malformed;
 474     }
 475 #endif
 476
 477     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 478     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 479     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 480     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 481 #ifdef EBCDIC
 482     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 483     else                        { len =  7; uv &= 0x01; }
 484 #else
 485     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 486     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 487     else                        { len = 13; uv = 0; } /* whoa! */
 488 #endif
 489
 490     if (retlen)
 491         *retlen = len;
 492
 493     expectlen = len;
 494
 495     if ((curlen < expectlen) &&
 496         !(flags & UTF8_ALLOW_SHORT)) {
 497         warning = UTF8_WARN_SHORT;
 498         goto malformed;
 499     }
 500
 501     len--;
 502     s++;
 503     ouv = uv;
 504
 505     while (len--) {
 506         if (!UTF8_IS_CONTINUATION(*s) &&
 507             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 508             s--;
 509             warning = UTF8_WARN_NON_CONTINUATION;
 510             goto malformed;
 511         }
 512         else
 513             uv = UTF8_ACCUMULATE(uv, *s);
 514         if (!(uv > ouv)) {
 515             /* These cannot be allowed. */
 516             if (uv == ouv) {
 517                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 518                     warning = UTF8_WARN_LONG;
 519                     goto malformed;
 520                 }
 521             }
 522             else { /* uv < ouv */
 523                 /* This cannot be allowed. */
 524                 warning = UTF8_WARN_OVERFLOW;
 525                 goto malformed;
 526             }
 527         }
 528         s++;
 529         ouv = uv;
 530     }
 531
 532     if (UNICODE_IS_SURROGATE(uv) &&
 533         !(flags & UTF8_ALLOW_SURROGATE)) {
 534         warning = UTF8_WARN_SURROGATE;
 535         goto malformed;
 536     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 537                !(flags & UTF8_ALLOW_LONG)) {
 538         warning = UTF8_WARN_LONG;
 539         goto malformed;
 540     } else if (UNICODE_IS_ILLEGAL(uv) &&
 541                !(flags & UTF8_ALLOW_FFFF)) {
 542         warning = UTF8_WARN_FFFF;
 543         goto malformed;
 544     }
 545
 546     return uv;
 547
 548 malformed:
 549
 550     if (flags & UTF8_CHECK_ONLY) {
 551         if (retlen)
 552             *retlen = ((STRLEN) -1);
 553         return 0;
 554     }
 555
 556     if (dowarn) {
 557         SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
 558
 559         switch (warning) {
 560         case 0: /* Intentionally empty. */ break;
 561         case UTF8_WARN_EMPTY:
 562             sv_catpvs(sv, "(empty string)");
 563             break;
 564         case UTF8_WARN_CONTINUATION:
 565             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 566             break;
 567         case UTF8_WARN_NON_CONTINUATION:
 568             if (s == s0)
 569                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 570                            (UV)s[1], startbyte);
 571             else {
 572                 const int len = (int)(s-s0);
 573                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 574                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 575             }
 576
 577             break;
 578         case UTF8_WARN_FE_FF:
 579             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 580             break;
 581         case UTF8_WARN_SHORT:
 582             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 583                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 584             expectlen = curlen;         /* distance for caller to skip */
 585             break;
 586         case UTF8_WARN_OVERFLOW:
 587             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 588                            ouv, *s, startbyte);
 589             break;
 590         case UTF8_WARN_SURROGATE:
 591             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 592             break;
 593         case UTF8_WARN_LONG:
 594             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 595                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 596             break;
 597         case UTF8_WARN_FFFF:
 598             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 599             break;
 600         default:
 601             sv_catpvs(sv, "(unknown reason)");
 602             break;
 603         }
 604
 605         if (warning) {
 606             const char * const s = SvPVX_const(sv);
 607
 608             if (PL_op)
 609                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 610                             "%s in %s", s,  OP_DESC(PL_op));
 611             else
 612                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 613         }
 614     }
 615
 616     if (retlen)
 617         *retlen = expectlen ? expectlen : len;
 618
 619     return 0;
 620 }
 621
 622 /*
 623 =for apidoc utf8_to_uvchr
 624
 625 Returns the native character value of the first character in the string C<s>
 626 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 627 length, in bytes, of that character.
 628
 629 If C<s> does not point to a well-formed UTF-8 character, zero is
 630 returned and retlen is set, if possible, to -1.
 631
 632 =cut
 633 */
 634
 635 UV
 636 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 637 {
 638     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 639
 640     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 641                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 642 }
 643
 644 /*
 645 =for apidoc utf8_to_uvuni
 646
 647 Returns the Unicode code point of the first character in the string C<s>
 648 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 649 length, in bytes, of that character.
 650
 651 This function should only be used when returned UV is considered
 652 an index into the Unicode semantic tables (e.g. swashes).
 653
 654 If C<s> does not point to a well-formed UTF-8 character, zero is
 655 returned and retlen is set, if possible, to -1.
 656
 657 =cut
 658 */
 659
 660 UV
 661 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 662 {
 663     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 664
 665     /* Call the low level routine asking for checks */
 666     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 667                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 668 }
 669
 670 /*
 671 =for apidoc utf8_length
 672
 673 Return the length of the UTF-8 char encoded string C<s> in characters.
 674 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 675 up past C<e>, croaks.
 676
 677 =cut
 678 */
 679
 680 STRLEN
 681 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 682 {
 683     dVAR;
 684     STRLEN len = 0;
 685     U8 t = 0;
 686
 687     PERL_ARGS_ASSERT_UTF8_LENGTH;
 688
 689     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 690      * the bitops (especially ~) can create illegal UTF-8.
 691      * In other words: in Perl UTF-8 is not just for Unicode. */
 692
 693     if (e < s)
 694         goto warn_and_return;
 695     while (s < e) {
 696         t = UTF8SKIP(s);
 697         if (e - s < t) {
 698             warn_and_return:
 699             if (ckWARN_d(WARN_UTF8)) {
 700                 if (PL_op)
 701                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 702                             "%s in %s", unees, OP_DESC(PL_op));
 703                 else
 704                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 705             }
 706             return len;
 707         }
 708         s += t;
 709         len++;
 710     }
 711
 712     return len;
 713 }
 714
 715 /*
 716 =for apidoc utf8_distance
 717
 718 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 719 and C<b>.
 720
 721 WARNING: use only if you *know* that the pointers point inside the
 722 same UTF-8 buffer.
 723
 724 =cut
 725 */
 726
 727 IV
 728 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 729 {
 730     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 731
 732     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 733 }
 734
 735 /*
 736 =for apidoc utf8_hop
 737
 738 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 739 forward or backward.
 740
 741 WARNING: do not use the following unless you *know* C<off> is within
 742 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 743 on the first byte of character or just after the last byte of a character.
 744
 745 =cut
 746 */
 747
 748 U8 *
 749 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 750 {
 751     PERL_ARGS_ASSERT_UTF8_HOP;
 752
 753     PERL_UNUSED_CONTEXT;
 754     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 755      * the bitops (especially ~) can create illegal UTF-8.
 756      * In other words: in Perl UTF-8 is not just for Unicode. */
 757
 758     if (off >= 0) {
 759         while (off--)
 760             s += UTF8SKIP(s);
 761     }
 762     else {
 763         while (off++) {
 764             s--;
 765             while (UTF8_IS_CONTINUATION(*s))
 766                 s--;
 767         }
 768     }
 769     return (U8 *)s;
 770 }
 771
 772 /*
 773 =for apidoc utf8_to_bytes
 774
 775 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 776 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 777 updates len to contain the new length.
 778 Returns zero on failure, setting C<len> to -1.
 779
 780 If you need a copy of the string, see C<bytes_from_utf8>.
 781
 782 =cut
 783 */
 784
 785 U8 *
 786 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 787 {
 788     U8 * const save = s;
 789     U8 * const send = s + *len;
 790     U8 *d;
 791
 792     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
 793
 794     /* ensure valid UTF-8 and chars < 256 before updating string */
 795     while (s < send) {
 796         U8 c = *s++;
 797
 798         if (!UTF8_IS_INVARIANT(c) &&
 799             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 800              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 801             *len = ((STRLEN) -1);
 802             return 0;
 803         }
 804     }
 805
 806     d = s = save;
 807     while (s < send) {
 808         STRLEN ulen;
 809         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 810         s += ulen;
 811     }
 812     *d = '\0';
 813     *len = d - save;
 814     return save;
 815 }
 816
 817 /*
 818 =for apidoc bytes_from_utf8
 819
 820 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 821 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 822 the newly-created string, and updates C<len> to contain the new
 823 length.  Returns the original string if no conversion occurs, C<len>
 824 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 825 0 if C<s> is converted or contains all 7bit characters.
 826
 827 =cut
 828 */
 829
 830 U8 *
 831 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 832 {
 833     U8 *d;
 834     const U8 *start = s;
 835     const U8 *send;
 836     I32 count = 0;
 837
 838     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
 839
 840     PERL_UNUSED_CONTEXT;
 841     if (!*is_utf8)
 842         return (U8 *)start;
 843
 844     /* ensure valid UTF-8 and chars < 256 before converting string */
 845     for (send = s + *len; s < send;) {
 846         U8 c = *s++;
 847         if (!UTF8_IS_INVARIANT(c)) {
 848             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 849                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 850                 count++;
 851             else
 852                 return (U8 *)start;
 853         }
 854     }
 855
 856     *is_utf8 = FALSE;
 857
 858     Newx(d, (*len) - count + 1, U8);
 859     s = start; start = d;
 860     while (s < send) {
 861         U8 c = *s++;
 862         if (!UTF8_IS_INVARIANT(c)) {
 863             /* Then it is two-byte encoded */
 864             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 865             c = ASCII_TO_NATIVE(c);
 866         }
 867         *d++ = c;
 868     }
 869     *d = '\0';
 870     *len = d - start;
 871     return (U8 *)start;
 872 }
 873
 874 /*
 875 =for apidoc bytes_to_utf8
 876
 877 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 878 Returns a pointer to the newly-created string, and sets C<len> to
 879 reflect the new length.
 880
 881 If you want to convert to UTF-8 from other encodings than ASCII,
 882 see sv_recode_to_utf8().
 883
 884 =cut
 885 */
 886
 887 U8*
 888 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 889 {
 890     const U8 * const send = s + (*len);
 891     U8 *d;
 892     U8 *dst;
 893
 894     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
 895     PERL_UNUSED_CONTEXT;
 896
 897     Newx(d, (*len) * 2 + 1, U8);
 898     dst = d;
 899
 900     while (s < send) {
 901         const UV uv = NATIVE_TO_ASCII(*s++);
 902         if (UNI_IS_INVARIANT(uv))
 903             *d++ = (U8)UTF_TO_NATIVE(uv);
 904         else {
 905             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 906             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 907         }
 908     }
 909     *d = '\0';
 910     *len = d-dst;
 911     return dst;
 912 }
 913
 914 /*
 915  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 916  *
 917  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 918  * We optimize for native, for obvious reasons. */
 919
 920 U8*
 921 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 922 {
 923     U8* pend;
 924     U8* dstart = d;
 925
 926     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
 927
 928     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 929          d[0] = 0;
 930          *newlen = 1;
 931          return d;
 932     }
 933
 934     if (bytelen & 1)
 935         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
 936
 937     pend = p + bytelen;
 938
 939     while (p < pend) {
 940         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 941         p += 2;
 942         if (uv < 0x80) {
 943 #ifdef EBCDIC
 944             *d++ = UNI_TO_NATIVE(uv);
 945 #else
 946             *d++ = (U8)uv;
 947 #endif
 948             continue;
 949         }
 950         if (uv < 0x800) {
 951             *d++ = (U8)(( uv >>  6)         | 0xc0);
 952             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 953             continue;
 954         }
 955         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 956             UV low = (p[0] << 8) + p[1];
 957             p += 2;
 958             if (low < 0xdc00 || low >= 0xdfff)
 959                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 960             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 961         }
 962         if (uv < 0x10000) {
 963             *d++ = (U8)(( uv >> 12)         | 0xe0);
 964             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 965             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 966             continue;
 967         }
 968         else {
 969             *d++ = (U8)(( uv >> 18)         | 0xf0);
 970             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 971             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 972             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 973             continue;
 974         }
 975     }
 976     *newlen = d - dstart;
 977     return d;
 978 }
 979
 980 /* Note: this one is slightly destructive of the source. */
 981
 982 U8*
 983 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 984 {
 985     U8* s = (U8*)p;
 986     U8* const send = s + bytelen;
 987
 988     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
 989
 990     while (s < send) {
 991         const U8 tmp = s[0];
 992         s[0] = s[1];
 993         s[1] = tmp;
 994         s += 2;
 995     }
 996     return utf16_to_utf8(p, d, bytelen, newlen);
 997 }
 998
 999 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1000
1001 bool
1002 Perl_is_uni_alnum(pTHX_ UV c)
1003 {
1004     U8 tmpbuf[UTF8_MAXBYTES+1];
1005     uvchr_to_utf8(tmpbuf, c);
1006     return is_utf8_alnum(tmpbuf);
1007 }
1008
1009 bool
1010 Perl_is_uni_alnumc(pTHX_ UV c)
1011 {
1012     U8 tmpbuf[UTF8_MAXBYTES+1];
1013     uvchr_to_utf8(tmpbuf, c);
1014     return is_utf8_alnumc(tmpbuf);
1015 }
1016
1017 bool
1018 Perl_is_uni_idfirst(pTHX_ UV c)
1019 {
1020     U8 tmpbuf[UTF8_MAXBYTES+1];
1021     uvchr_to_utf8(tmpbuf, c);
1022     return is_utf8_idfirst(tmpbuf);
1023 }
1024
1025 bool
1026 Perl_is_uni_alpha(pTHX_ UV c)
1027 {
1028     U8 tmpbuf[UTF8_MAXBYTES+1];
1029     uvchr_to_utf8(tmpbuf, c);
1030     return is_utf8_alpha(tmpbuf);
1031 }
1032
1033 bool
1034 Perl_is_uni_ascii(pTHX_ UV c)
1035 {
1036     U8 tmpbuf[UTF8_MAXBYTES+1];
1037     uvchr_to_utf8(tmpbuf, c);
1038     return is_utf8_ascii(tmpbuf);
1039 }
1040
1041 bool
1042 Perl_is_uni_space(pTHX_ UV c)
1043 {
1044     U8 tmpbuf[UTF8_MAXBYTES+1];
1045     uvchr_to_utf8(tmpbuf, c);
1046     return is_utf8_space(tmpbuf);
1047 }
1048
1049 bool
1050 Perl_is_uni_digit(pTHX_ UV c)
1051 {
1052     U8 tmpbuf[UTF8_MAXBYTES+1];
1053     uvchr_to_utf8(tmpbuf, c);
1054     return is_utf8_digit(tmpbuf);
1055 }
1056
1057 bool
1058 Perl_is_uni_upper(pTHX_ UV c)
1059 {
1060     U8 tmpbuf[UTF8_MAXBYTES+1];
1061     uvchr_to_utf8(tmpbuf, c);
1062     return is_utf8_upper(tmpbuf);
1063 }
1064
1065 bool
1066 Perl_is_uni_lower(pTHX_ UV c)
1067 {
1068     U8 tmpbuf[UTF8_MAXBYTES+1];
1069     uvchr_to_utf8(tmpbuf, c);
1070     return is_utf8_lower(tmpbuf);
1071 }
1072
1073 bool
1074 Perl_is_uni_cntrl(pTHX_ UV c)
1075 {
1076     U8 tmpbuf[UTF8_MAXBYTES+1];
1077     uvchr_to_utf8(tmpbuf, c);
1078     return is_utf8_cntrl(tmpbuf);
1079 }
1080
1081 bool
1082 Perl_is_uni_graph(pTHX_ UV c)
1083 {
1084     U8 tmpbuf[UTF8_MAXBYTES+1];
1085     uvchr_to_utf8(tmpbuf, c);
1086     return is_utf8_graph(tmpbuf);
1087 }
1088
1089 bool
1090 Perl_is_uni_print(pTHX_ UV c)
1091 {
1092     U8 tmpbuf[UTF8_MAXBYTES+1];
1093     uvchr_to_utf8(tmpbuf, c);
1094     return is_utf8_print(tmpbuf);
1095 }
1096
1097 bool
1098 Perl_is_uni_punct(pTHX_ UV c)
1099 {
1100     U8 tmpbuf[UTF8_MAXBYTES+1];
1101     uvchr_to_utf8(tmpbuf, c);
1102     return is_utf8_punct(tmpbuf);
1103 }
1104
1105 bool
1106 Perl_is_uni_xdigit(pTHX_ UV c)
1107 {
1108     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1109     uvchr_to_utf8(tmpbuf, c);
1110     return is_utf8_xdigit(tmpbuf);
1111 }
1112
1113 UV
1114 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1115 {
1116     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1117
1118     uvchr_to_utf8(p, c);
1119     return to_utf8_upper(p, p, lenp);
1120 }
1121
1122 UV
1123 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1124 {
1125     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1126
1127     uvchr_to_utf8(p, c);
1128     return to_utf8_title(p, p, lenp);
1129 }
1130
1131 UV
1132 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1133 {
1134     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1135
1136     uvchr_to_utf8(p, c);
1137     return to_utf8_lower(p, p, lenp);
1138 }
1139
1140 UV
1141 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1142 {
1143     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1144
1145     uvchr_to_utf8(p, c);
1146     return to_utf8_fold(p, p, lenp);
1147 }
1148
1149 /* for now these all assume no locale info available for Unicode > 255 */
1150
1151 bool
1152 Perl_is_uni_alnum_lc(pTHX_ UV c)
1153 {
1154     return is_uni_alnum(c);     /* XXX no locale support yet */
1155 }
1156
1157 bool
1158 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1159 {
1160     return is_uni_alnumc(c);    /* XXX no locale support yet */
1161 }
1162
1163 bool
1164 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1165 {
1166     return is_uni_idfirst(c);   /* XXX no locale support yet */
1167 }
1168
1169 bool
1170 Perl_is_uni_alpha_lc(pTHX_ UV c)
1171 {
1172     return is_uni_alpha(c);     /* XXX no locale support yet */
1173 }
1174
1175 bool
1176 Perl_is_uni_ascii_lc(pTHX_ UV c)
1177 {
1178     return is_uni_ascii(c);     /* XXX no locale support yet */
1179 }
1180
1181 bool
1182 Perl_is_uni_space_lc(pTHX_ UV c)
1183 {
1184     return is_uni_space(c);     /* XXX no locale support yet */
1185 }
1186
1187 bool
1188 Perl_is_uni_digit_lc(pTHX_ UV c)
1189 {
1190     return is_uni_digit(c);     /* XXX no locale support yet */
1191 }
1192
1193 bool
1194 Perl_is_uni_upper_lc(pTHX_ UV c)
1195 {
1196     return is_uni_upper(c);     /* XXX no locale support yet */
1197 }
1198
1199 bool
1200 Perl_is_uni_lower_lc(pTHX_ UV c)
1201 {
1202     return is_uni_lower(c);     /* XXX no locale support yet */
1203 }
1204
1205 bool
1206 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1207 {
1208     return is_uni_cntrl(c);     /* XXX no locale support yet */
1209 }
1210
1211 bool
1212 Perl_is_uni_graph_lc(pTHX_ UV c)
1213 {
1214     return is_uni_graph(c);     /* XXX no locale support yet */
1215 }
1216
1217 bool
1218 Perl_is_uni_print_lc(pTHX_ UV c)
1219 {
1220     return is_uni_print(c);     /* XXX no locale support yet */
1221 }
1222
1223 bool
1224 Perl_is_uni_punct_lc(pTHX_ UV c)
1225 {
1226     return is_uni_punct(c);     /* XXX no locale support yet */
1227 }
1228
1229 bool
1230 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1231 {
1232     return is_uni_xdigit(c);    /* XXX no locale support yet */
1233 }
1234
1235 U32
1236 Perl_to_uni_upper_lc(pTHX_ U32 c)
1237 {
1238     /* XXX returns only the first character -- do not use XXX */
1239     /* XXX no locale support yet */
1240     STRLEN len;
1241     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1242     return (U32)to_uni_upper(c, tmpbuf, &len);
1243 }
1244
1245 U32
1246 Perl_to_uni_title_lc(pTHX_ U32 c)
1247 {
1248     /* XXX returns only the first character XXX -- do not use XXX */
1249     /* XXX no locale support yet */
1250     STRLEN len;
1251     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1252     return (U32)to_uni_title(c, tmpbuf, &len);
1253 }
1254
1255 U32
1256 Perl_to_uni_lower_lc(pTHX_ U32 c)
1257 {
1258     /* XXX returns only the first character -- do not use XXX */
1259     /* XXX no locale support yet */
1260     STRLEN len;
1261     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1262     return (U32)to_uni_lower(c, tmpbuf, &len);
1263 }
1264
1265 static bool
1266 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1267                  const char *const swashname)
1268 {
1269     dVAR;
1270
1271     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1272
1273     if (!is_utf8_char(p))
1274         return FALSE;
1275     if (!*swash)
1276         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1277     return swash_fetch(*swash, p, TRUE) != 0;
1278 }
1279
1280 bool
1281 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1282 {
1283     dVAR;
1284
1285     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1286
1287     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1288      * descendant of isalnum(3), in other words, it doesn't
1289      * contain the '_'. --jhi */
1290     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1291 }
1292
1293 bool
1294 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1295 {
1296     dVAR;
1297
1298     PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1299
1300     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1301 }
1302
1303 bool
1304 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1305 {
1306     dVAR;
1307
1308     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1309
1310     if (*p == '_')
1311         return TRUE;
1312     /* is_utf8_idstart would be more logical. */
1313     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1314 }
1315
1316 bool
1317 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1318 {
1319     dVAR;
1320
1321     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1322
1323     if (*p == '_')
1324         return TRUE;
1325     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1326 }
1327
1328 bool
1329 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1330 {
1331     dVAR;
1332
1333     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1334
1335     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1336 }
1337
1338 bool
1339 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1340 {
1341     dVAR;
1342
1343     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1344
1345     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1346 }
1347
1348 bool
1349 Perl_is_utf8_space(pTHX_ const U8 *p)
1350 {
1351     dVAR;
1352
1353     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1354
1355     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1356 }
1357
1358 bool
1359 Perl_is_utf8_digit(pTHX_ const U8 *p)
1360 {
1361     dVAR;
1362
1363     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1364
1365     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1366 }
1367
1368 bool
1369 Perl_is_utf8_upper(pTHX_ const U8 *p)
1370 {
1371     dVAR;
1372
1373     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1374
1375     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1376 }
1377
1378 bool
1379 Perl_is_utf8_lower(pTHX_ const U8 *p)
1380 {
1381     dVAR;
1382
1383     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1384
1385     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1386 }
1387
1388 bool
1389 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1390 {
1391     dVAR;
1392
1393     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1394
1395     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1396 }
1397
1398 bool
1399 Perl_is_utf8_graph(pTHX_ const U8 *p)
1400 {
1401     dVAR;
1402
1403     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1404
1405     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1406 }
1407
1408 bool
1409 Perl_is_utf8_print(pTHX_ const U8 *p)
1410 {
1411     dVAR;
1412
1413     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1414
1415     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1416 }
1417
1418 bool
1419 Perl_is_utf8_punct(pTHX_ const U8 *p)
1420 {
1421     dVAR;
1422
1423     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1424
1425     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1426 }
1427
1428 bool
1429 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1430 {
1431     dVAR;
1432
1433     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1434
1435     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1436 }
1437
1438 bool
1439 Perl_is_utf8_mark(pTHX_ const U8 *p)
1440 {
1441     dVAR;
1442
1443     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1444
1445     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1446 }
1447
1448 /*
1449 =for apidoc to_utf8_case
1450
1451 The "p" contains the pointer to the UTF-8 string encoding
1452 the character that is being converted.
1453
1454 The "ustrp" is a pointer to the character buffer to put the
1455 conversion result to.  The "lenp" is a pointer to the length
1456 of the result.
1457
1458 The "swashp" is a pointer to the swash to use.
1459
1460 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1461 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1462 but not always, a multicharacter mapping), is tried first.
1463
1464 The "special" is a string like "utf8::ToSpecLower", which means the
1465 hash %utf8::ToSpecLower.  The access to the hash is through
1466 Perl_to_utf8_case().
1467
1468 The "normal" is a string like "ToLower" which means the swash
1469 %utf8::ToLower.
1470
1471 =cut */
1472
1473 UV
1474 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1475                         SV **swashp, const char *normal, const char *special)
1476 {
1477     dVAR;
1478     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1479     STRLEN len = 0;
1480     const UV uv0 = utf8_to_uvchr(p, NULL);
1481     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1482      * are necessary in EBCDIC, they are redundant no-ops
1483      * in ASCII-ish platforms, and hopefully optimized away. */
1484     const UV uv1 = NATIVE_TO_UNI(uv0);
1485
1486     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1487
1488     uvuni_to_utf8(tmpbuf, uv1);
1489
1490     if (!*swashp) /* load on-demand */
1491          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1492
1493     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1494     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1495          /* It might be "special" (sometimes, but not always,
1496           * a multicharacter mapping) */
1497          HV * const hv = get_hv(special, FALSE);
1498          SV **svp;
1499
1500          if (hv &&
1501              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1502              (*svp)) {
1503              const char *s;
1504
1505               s = SvPV_const(*svp, len);
1506               if (len == 1)
1507                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1508               else {
1509 #ifdef EBCDIC
1510                    /* If we have EBCDIC we need to remap the characters
1511                     * since any characters in the low 256 are Unicode
1512                     * code points, not EBCDIC. */
1513                    U8 *t = (U8*)s, *tend = t + len, *d;
1514
1515                    d = tmpbuf;
1516                    if (SvUTF8(*svp)) {
1517                         STRLEN tlen = 0;
1518
1519                         while (t < tend) {
1520                              const UV c = utf8_to_uvchr(t, &tlen);
1521                              if (tlen > 0) {
1522                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1523                                   t += tlen;
1524                              }
1525                              else
1526                                   break;
1527                         }
1528                    }
1529                    else {
1530                         while (t < tend) {
1531                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1532                              t++;
1533                         }
1534                    }
1535                    len = d - tmpbuf;
1536                    Copy(tmpbuf, ustrp, len, U8);
1537 #else
1538                    Copy(s, ustrp, len, U8);
1539 #endif
1540               }
1541          }
1542     }
1543
1544     if (!len && *swashp) {
1545         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1546
1547          if (uv2) {
1548               /* It was "normal" (a single character mapping). */
1549               const UV uv3 = UNI_TO_NATIVE(uv2);
1550               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1551          }
1552     }
1553
1554     if (!len) /* Neither: just copy. */
1555          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1556
1557     if (lenp)
1558          *lenp = len;
1559
1560     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1561 }
1562
1563 /*
1564 =for apidoc to_utf8_upper
1565
1566 Convert the UTF-8 encoded character at p to its uppercase version and
1567 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1568 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1569 the uppercase version may be longer than the original character.
1570
1571 The first character of the uppercased version is returned
1572 (but note, as explained above, that there may be more.)
1573
1574 =cut */
1575
1576 UV
1577 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1578 {
1579     dVAR;
1580
1581     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1582
1583     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1584                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1585 }
1586
1587 /*
1588 =for apidoc to_utf8_title
1589
1590 Convert the UTF-8 encoded character at p to its titlecase version and
1591 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1592 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1593 titlecase version may be longer than the original character.
1594
1595 The first character of the titlecased version is returned
1596 (but note, as explained above, that there may be more.)
1597
1598 =cut */
1599
1600 UV
1601 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1602 {
1603     dVAR;
1604
1605     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1606
1607     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1608                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1609 }
1610
1611 /*
1612 =for apidoc to_utf8_lower
1613
1614 Convert the UTF-8 encoded character at p to its lowercase version and
1615 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1616 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1617 lowercase version may be longer than the original character.
1618
1619 The first character of the lowercased version is returned
1620 (but note, as explained above, that there may be more.)
1621
1622 =cut */
1623
1624 UV
1625 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1626 {
1627     dVAR;
1628
1629     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1630
1631     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1632                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1633 }
1634
1635 /*
1636 =for apidoc to_utf8_fold
1637
1638 Convert the UTF-8 encoded character at p to its foldcase version and
1639 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1640 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1641 foldcase version may be longer than the original character (up to
1642 three characters).
1643
1644 The first character of the foldcased version is returned
1645 (but note, as explained above, that there may be more.)
1646
1647 =cut */
1648
1649 UV
1650 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1651 {
1652     dVAR;
1653
1654     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1655
1656     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1657                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1658 }
1659
1660 /* Note:
1661  * A "swash" is a swatch hash.
1662  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1663  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1664  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1665  */
1666 SV*
1667 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1668 {
1669     dVAR;
1670     SV* retval;
1671     dSP;
1672     const size_t pkg_len = strlen(pkg);
1673     const size_t name_len = strlen(name);
1674     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1675     SV* errsv_save;
1676
1677     PERL_ARGS_ASSERT_SWASH_INIT;
1678
1679     PUSHSTACKi(PERLSI_MAGIC);
1680     ENTER;
1681     SAVEI32(PL_hints);
1682     PL_hints = 0;
1683     save_re_context();
1684     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1685         ENTER;
1686         errsv_save = newSVsv(ERRSV);
1687         /* It is assumed that callers of this routine are not passing in any
1688            user derived data.  */
1689         /* Need to do this after save_re_context() as it will set PL_tainted to
1690            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1691            Even line to create errsv_save can turn on PL_tainted.  */
1692         SAVEBOOL(PL_tainted);
1693         PL_tainted = 0;
1694         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1695                          NULL);
1696         if (!SvTRUE(ERRSV))
1697             sv_setsv(ERRSV, errsv_save);
1698         SvREFCNT_dec(errsv_save);
1699         LEAVE;
1700     }
1701     SPAGAIN;
1702     PUSHMARK(SP);
1703     EXTEND(SP,5);
1704     mPUSHp(pkg, pkg_len);
1705     mPUSHp(name, name_len);
1706     PUSHs(listsv);
1707     mPUSHi(minbits);
1708     mPUSHi(none);
1709     PUTBACK;
1710     errsv_save = newSVsv(ERRSV);
1711     if (call_method("SWASHNEW", G_SCALAR))
1712         retval = newSVsv(*PL_stack_sp--);
1713     else
1714         retval = &PL_sv_undef;
1715     if (!SvTRUE(ERRSV))
1716         sv_setsv(ERRSV, errsv_save);
1717     SvREFCNT_dec(errsv_save);
1718     LEAVE;
1719     POPSTACK;
1720     if (IN_PERL_COMPILETIME) {
1721         CopHINTS_set(PL_curcop, PL_hints);
1722     }
1723     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1724         if (SvPOK(retval))
1725             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1726                        SVfARG(retval));
1727         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1728     }
1729     return retval;
1730 }
1731
1732
1733 /* This API is wrong for special case conversions since we may need to
1734  * return several Unicode characters for a single Unicode character
1735  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1736  * the lower-level routine, and it is similarly broken for returning
1737  * multiple values.  --jhi */
1738 /* Now SWASHGET is recasted into S_swash_get in this file. */
1739
1740 /* Note:
1741  * Returns the value of property/mapping C<swash> for the first character
1742  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1743  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1744  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1745  */
1746 UV
1747 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1748 {
1749     dVAR;
1750     HV *const hv = MUTABLE_HV(SvRV(swash));
1751     U32 klen;
1752     U32 off;
1753     STRLEN slen;
1754     STRLEN needents;
1755     const U8 *tmps = NULL;
1756     U32 bit;
1757     SV *swatch;
1758     U8 tmputf8[2];
1759     const UV c = NATIVE_TO_ASCII(*ptr);
1760
1761     PERL_ARGS_ASSERT_SWASH_FETCH;
1762
1763     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1764         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1765         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1766         ptr = tmputf8;
1767     }
1768     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1769      * then the "swatch" is a vec() for al the chars which start
1770      * with 0xAA..0xYY
1771      * So the key in the hash (klen) is length of encoded char -1
1772      */
1773     klen = UTF8SKIP(ptr) - 1;
1774     off  = ptr[klen];
1775
1776     if (klen == 0) {
1777       /* If char in invariant then swatch is for all the invariant chars
1778        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1779        */
1780         needents = UTF_CONTINUATION_MARK;
1781         off      = NATIVE_TO_UTF(ptr[klen]);
1782     }
1783     else {
1784       /* If char is encoded then swatch is for the prefix */
1785         needents = (1 << UTF_ACCUMULATION_SHIFT);
1786         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1787     }
1788
1789     /*
1790      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1791      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1792      * it's nothing to sniff at.)  Pity we usually come through at least
1793      * two function calls to get here...
1794      *
1795      * NB: this code assumes that swatches are never modified, once generated!
1796      */
1797
1798     if (hv   == PL_last_swash_hv &&
1799         klen == PL_last_swash_klen &&
1800         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1801     {
1802         tmps = PL_last_swash_tmps;
1803         slen = PL_last_swash_slen;
1804     }
1805     else {
1806         /* Try our second-level swatch cache, kept in a hash. */
1807         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1808
1809         /* If not cached, generate it via swash_get */
1810         if (!svp || !SvPOK(*svp)
1811                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1812             /* We use utf8n_to_uvuni() as we want an index into
1813                Unicode tables, not a native character number.
1814              */
1815             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1816                                            ckWARN(WARN_UTF8) ?
1817                                            0 : UTF8_ALLOW_ANY);
1818             swatch = swash_get(swash,
1819                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1820                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1821                                 needents);
1822
1823             if (IN_PERL_COMPILETIME)
1824                 CopHINTS_set(PL_curcop, PL_hints);
1825
1826             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1827
1828             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1829                      || (slen << 3) < needents)
1830                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1831         }
1832
1833         PL_last_swash_hv = hv;
1834         assert(klen <= sizeof(PL_last_swash_key));
1835         PL_last_swash_klen = (U8)klen;
1836         /* FIXME change interpvar.h?  */
1837         PL_last_swash_tmps = (U8 *) tmps;
1838         PL_last_swash_slen = slen;
1839         if (klen)
1840             Copy(ptr, PL_last_swash_key, klen, U8);
1841     }
1842
1843     switch ((int)((slen << 3) / needents)) {
1844     case 1:
1845         bit = 1 << (off & 7);
1846         off >>= 3;
1847         return (tmps[off] & bit) != 0;
1848     case 8:
1849         return tmps[off];
1850     case 16:
1851         off <<= 1;
1852         return (tmps[off] << 8) + tmps[off + 1] ;
1853     case 32:
1854         off <<= 2;
1855         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1856     }
1857     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1858     NORETURN_FUNCTION_END;
1859 }
1860
1861 /* Note:
1862  * Returns a swatch (a bit vector string) for a code point sequence
1863  * that starts from the value C<start> and comprises the number C<span>.
1864  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1865  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1866  */
1867 STATIC SV*
1868 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1869 {
1870     SV *swatch;
1871     U8 *l, *lend, *x, *xend, *s;
1872     STRLEN lcur, xcur, scur;
1873     HV *const hv = MUTABLE_HV(SvRV(swash));
1874     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1875     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1876     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1877     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1878     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1879     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1880     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1881     const STRLEN bits  = SvUV(*bitssvp);
1882     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1883     const UV     none  = SvUV(*nonesvp);
1884     const UV     end   = start + span;
1885
1886     PERL_ARGS_ASSERT_SWASH_GET;
1887
1888     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1889         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1890                                                  (UV)bits);
1891     }
1892
1893     /* create and initialize $swatch */
1894     scur   = octets ? (span * octets) : (span + 7) / 8;
1895     swatch = newSV(scur);
1896     SvPOK_on(swatch);
1897     s = (U8*)SvPVX(swatch);
1898     if (octets && none) {
1899         const U8* const e = s + scur;
1900         while (s < e) {
1901             if (bits == 8)
1902                 *s++ = (U8)(none & 0xff);
1903             else if (bits == 16) {
1904                 *s++ = (U8)((none >>  8) & 0xff);
1905                 *s++ = (U8)( none        & 0xff);
1906             }
1907             else if (bits == 32) {
1908                 *s++ = (U8)((none >> 24) & 0xff);
1909                 *s++ = (U8)((none >> 16) & 0xff);
1910                 *s++ = (U8)((none >>  8) & 0xff);
1911                 *s++ = (U8)( none        & 0xff);
1912             }
1913         }
1914         *s = '\0';
1915     }
1916     else {
1917         (void)memzero((U8*)s, scur + 1);
1918     }
1919     SvCUR_set(swatch, scur);
1920     s = (U8*)SvPVX(swatch);
1921
1922     /* read $swash->{LIST} */
1923     l = (U8*)SvPV(*listsvp, lcur);
1924     lend = l + lcur;
1925     while (l < lend) {
1926         UV min, max, val;
1927         STRLEN numlen;
1928         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1929
1930         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1931
1932         numlen = lend - l;
1933         min = grok_hex((char *)l, &numlen, &flags, NULL);
1934         if (numlen)
1935             l += numlen;
1936         else if (nl) {
1937             l = nl + 1; /* 1 is length of "\n" */
1938             continue;
1939         }
1940         else {
1941             l = lend; /* to LIST's end at which \n is not found */
1942             break;
1943         }
1944
1945         if (isBLANK(*l)) {
1946             ++l;
1947             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1948             numlen = lend - l;
1949             max = grok_hex((char *)l, &numlen, &flags, NULL);
1950             if (numlen)
1951                 l += numlen;
1952             else
1953                 max = min;
1954
1955             if (octets) {
1956                 if (isBLANK(*l)) {
1957                     ++l;
1958                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1959                             PERL_SCAN_DISALLOW_PREFIX;
1960                     numlen = lend - l;
1961                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1962                     if (numlen)
1963                         l += numlen;
1964                     else
1965                         val = 0;
1966                 }
1967                 else {
1968                     val = 0;
1969                     if (typeto) {
1970                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1971                                          typestr, l);
1972                     }
1973                 }
1974             }
1975             else
1976                 val = 0; /* bits == 1, then val should be ignored */
1977         }
1978         else {
1979             max = min;
1980             if (octets) {
1981                 val = 0;
1982                 if (typeto) {
1983                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1984                 }
1985             }
1986             else
1987                 val = 0; /* bits == 1, then val should be ignored */
1988         }
1989
1990         if (nl)
1991             l = nl + 1;
1992         else
1993             l = lend;
1994
1995         if (max < start)
1996             continue;
1997
1998         if (octets) {
1999             UV key;
2000             if (min < start) {
2001                 if (!none || val < none) {
2002                     val += start - min;
2003                 }
2004                 min = start;
2005             }
2006             for (key = min; key <= max; key++) {
2007                 STRLEN offset;
2008                 if (key >= end)
2009                     goto go_out_list;
2010                 /* offset must be non-negative (start <= min <= key < end) */
2011                 offset = octets * (key - start);
2012                 if (bits == 8)
2013                     s[offset] = (U8)(val & 0xff);
2014                 else if (bits == 16) {
2015                     s[offset    ] = (U8)((val >>  8) & 0xff);
2016                     s[offset + 1] = (U8)( val        & 0xff);
2017                 }
2018                 else if (bits == 32) {
2019                     s[offset    ] = (U8)((val >> 24) & 0xff);
2020                     s[offset + 1] = (U8)((val >> 16) & 0xff);
2021                     s[offset + 2] = (U8)((val >>  8) & 0xff);
2022                     s[offset + 3] = (U8)( val        & 0xff);
2023                 }
2024
2025                 if (!none || val < none)
2026                     ++val;
2027             }
2028         }
2029         else { /* bits == 1, then val should be ignored */
2030             UV key;
2031             if (min < start)
2032                 min = start;
2033             for (key = min; key <= max; key++) {
2034                 const STRLEN offset = (STRLEN)(key - start);
2035                 if (key >= end)
2036                     goto go_out_list;
2037                 s[offset >> 3] |= 1 << (offset & 7);
2038             }
2039         }
2040     } /* while */
2041   go_out_list:
2042
2043     /* read $swash->{EXTRAS} */
2044     x = (U8*)SvPV(*extssvp, xcur);
2045     xend = x + xcur;
2046     while (x < xend) {
2047         STRLEN namelen;
2048         U8 *namestr;
2049         SV** othersvp;
2050         HV* otherhv;
2051         STRLEN otherbits;
2052         SV **otherbitssvp, *other;
2053         U8 *s, *o, *nl;
2054         STRLEN slen, olen;
2055
2056         const U8 opc = *x++;
2057         if (opc == '\n')
2058             continue;
2059
2060         nl = (U8*)memchr(x, '\n', xend - x);
2061
2062         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2063             if (nl) {
2064                 x = nl + 1; /* 1 is length of "\n" */
2065                 continue;
2066             }
2067             else {
2068                 x = xend; /* to EXTRAS' end at which \n is not found */
2069                 break;
2070             }
2071         }
2072
2073         namestr = x;
2074         if (nl) {
2075             namelen = nl - namestr;
2076             x = nl + 1;
2077         }
2078         else {
2079             namelen = xend - namestr;
2080             x = xend;
2081         }
2082
2083         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2084         otherhv = MUTABLE_HV(SvRV(*othersvp));
2085         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2086         otherbits = (STRLEN)SvUV(*otherbitssvp);
2087         if (bits < otherbits)
2088             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2089
2090         /* The "other" swatch must be destroyed after. */
2091         other = swash_get(*othersvp, start, span);
2092         o = (U8*)SvPV(other, olen);
2093
2094         if (!olen)
2095             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2096
2097         s = (U8*)SvPV(swatch, slen);
2098         if (bits == 1 && otherbits == 1) {
2099             if (slen != olen)
2100                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2101
2102             switch (opc) {
2103             case '+':
2104                 while (slen--)
2105                     *s++ |= *o++;
2106                 break;
2107             case '!':
2108                 while (slen--)
2109                     *s++ |= ~*o++;
2110                 break;
2111             case '-':
2112                 while (slen--)
2113                     *s++ &= ~*o++;
2114                 break;
2115             case '&':
2116                 while (slen--)
2117                     *s++ &= *o++;
2118                 break;
2119             default:
2120                 break;
2121             }
2122         }
2123         else {
2124             STRLEN otheroctets = otherbits >> 3;
2125             STRLEN offset = 0;
2126             U8* const send = s + slen;
2127
2128             while (s < send) {
2129                 UV otherval = 0;
2130
2131                 if (otherbits == 1) {
2132                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2133                     ++offset;
2134                 }
2135                 else {
2136                     STRLEN vlen = otheroctets;
2137                     otherval = *o++;
2138                     while (--vlen) {
2139                         otherval <<= 8;
2140                         otherval |= *o++;
2141                     }
2142                 }
2143
2144                 if (opc == '+' && otherval)
2145                     NOOP;   /* replace with otherval */
2146                 else if (opc == '!' && !otherval)
2147                     otherval = 1;
2148                 else if (opc == '-' && otherval)
2149                     otherval = 0;
2150                 else if (opc == '&' && !otherval)
2151                     otherval = 0;
2152                 else {
2153                     s += octets; /* no replacement */
2154                     continue;
2155                 }
2156
2157                 if (bits == 8)
2158                     *s++ = (U8)( otherval & 0xff);
2159                 else if (bits == 16) {
2160                     *s++ = (U8)((otherval >>  8) & 0xff);
2161                     *s++ = (U8)( otherval        & 0xff);
2162                 }
2163                 else if (bits == 32) {
2164                     *s++ = (U8)((otherval >> 24) & 0xff);
2165                     *s++ = (U8)((otherval >> 16) & 0xff);
2166                     *s++ = (U8)((otherval >>  8) & 0xff);
2167                     *s++ = (U8)( otherval        & 0xff);
2168                 }
2169             }
2170         }
2171         sv_free(other); /* through with it! */
2172     } /* while */
2173     return swatch;
2174 }
2175
2176 /*
2177 =for apidoc uvchr_to_utf8
2178
2179 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2180 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2181 bytes available. The return value is the pointer to the byte after the
2182 end of the new character. In other words,
2183
2184     d = uvchr_to_utf8(d, uv);
2185
2186 is the recommended wide native character-aware way of saying
2187
2188     *(d++) = uv;
2189
2190 =cut
2191 */
2192
2193 /* On ASCII machines this is normally a macro but we want a
2194    real function in case XS code wants it
2195 */
2196 U8 *
2197 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2198 {
2199     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2200
2201     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2202 }
2203
2204 U8 *
2205 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2206 {
2207     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2208
2209     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2210 }
2211
2212 /*
2213 =for apidoc utf8n_to_uvchr
2214 flags
2215
2216 Returns the native character value of the first character in the string
2217 C<s>
2218 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2219 length, in bytes, of that character.
2220
2221 Allows length and flags to be passed to low level routine.
2222
2223 =cut
2224 */
2225 /* On ASCII machines this is normally a macro but we want
2226    a real function in case XS code wants it
2227 */
2228 UV
2229 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2230 U32 flags)
2231 {
2232     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2233
2234     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2235
2236     return UNI_TO_NATIVE(uv);
2237 }
2238
2239 /*
2240 =for apidoc pv_uni_display
2241
2242 Build to the scalar dsv a displayable version of the string spv,
2243 length len, the displayable version being at most pvlim bytes long
2244 (if longer, the rest is truncated and "..." will be appended).
2245
2246 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2247 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2248 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2249 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2250 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2251 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2252
2253 The pointer to the PV of the dsv is returned.
2254
2255 =cut */
2256 char *
2257 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2258 {
2259     int truncated = 0;
2260     const char *s, *e;
2261
2262     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2263
2264     sv_setpvs(dsv, "");
2265     SvUTF8_off(dsv);
2266     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2267          UV u;
2268           /* This serves double duty as a flag and a character to print after
2269              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2270           */
2271          char ok = 0;
2272
2273          if (pvlim && SvCUR(dsv) >= pvlim) {
2274               truncated++;
2275               break;
2276          }
2277          u = utf8_to_uvchr((U8*)s, 0);
2278          if (u < 256) {
2279              const unsigned char c = (unsigned char)u & 0xFF;
2280              if (flags & UNI_DISPLAY_BACKSLASH) {
2281                  switch (c) {
2282                  case '\n':
2283                      ok = 'n'; break;
2284                  case '\r':
2285                      ok = 'r'; break;
2286                  case '\t':
2287                      ok = 't'; break;
2288                  case '\f':
2289                      ok = 'f'; break;
2290                  case '\a':
2291                      ok = 'a'; break;
2292                  case '\\':
2293                      ok = '\\'; break;
2294                  default: break;
2295                  }
2296                  if (ok) {
2297                      const char string = ok;
2298                      sv_catpvs(dsv, "\\");
2299                      sv_catpvn(dsv, &string, 1);
2300                  }
2301              }
2302              /* isPRINT() is the locale-blind version. */
2303              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2304                  const char string = c;
2305                  sv_catpvn(dsv, &string, 1);
2306                  ok = 1;
2307              }
2308          }
2309          if (!ok)
2310              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2311     }
2312     if (truncated)
2313          sv_catpvs(dsv, "...");
2314
2315     return SvPVX(dsv);
2316 }
2317
2318 /*
2319 =for apidoc sv_uni_display
2320
2321 Build to the scalar dsv a displayable version of the scalar sv,
2322 the displayable version being at most pvlim bytes long
2323 (if longer, the rest is truncated and "..." will be appended).
2324
2325 The flags argument is as in pv_uni_display().
2326
2327 The pointer to the PV of the dsv is returned.
2328
2329 =cut
2330 */
2331 char *
2332 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2333 {
2334     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2335
2336      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2337                                 SvCUR(ssv), pvlim, flags);
2338 }
2339
2340 /*
2341 =for apidoc ibcmp_utf8
2342
2343 Return true if the strings s1 and s2 differ case-insensitively, false
2344 if not (if they are equal case-insensitively).  If u1 is true, the
2345 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2346 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2347 are false, the respective string is assumed to be in native 8-bit
2348 encoding.
2349
2350 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2351 in there (they will point at the beginning of the I<next> character).
2352 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2353 pointers beyond which scanning will not continue under any
2354 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2355 s2+l2 will be used as goal end pointers that will also stop the scan,
2356 and which qualify towards defining a successful match: all the scans
2357 that define an explicit length must reach their goal pointers for
2358 a match to succeed).
2359
2360 For case-insensitiveness, the "casefolding" of Unicode is used
2361 instead of upper/lowercasing both the characters, see
2362 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2363
2364 =cut */
2365 I32
2366 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2367 {
2368      dVAR;
2369      register const U8 *p1  = (const U8*)s1;
2370      register const U8 *p2  = (const U8*)s2;
2371      register const U8 *f1 = NULL;
2372      register const U8 *f2 = NULL;
2373      register U8 *e1 = NULL;
2374      register U8 *q1 = NULL;
2375      register U8 *e2 = NULL;
2376      register U8 *q2 = NULL;
2377      STRLEN n1 = 0, n2 = 0;
2378      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2379      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2380      U8 natbuf[1+1];
2381      STRLEN foldlen1, foldlen2;
2382      bool match;
2383
2384      PERL_ARGS_ASSERT_IBCMP_UTF8;
2385
2386      if (pe1)
2387           e1 = *(U8**)pe1;
2388      /* assert(e1 || l1); */
2389      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2390           f1 = (const U8*)s1 + l1;
2391      if (pe2)
2392           e2 = *(U8**)pe2;
2393      /* assert(e2 || l2); */
2394      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2395           f2 = (const U8*)s2 + l2;
2396
2397      /* This shouldn't happen. However, putting an assert() there makes some
2398       * tests fail. */
2399      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2400      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2401           return 1; /* mismatch; possible infinite loop or false positive */
2402
2403      if (!u1 || !u2)
2404           natbuf[1] = 0; /* Need to terminate the buffer. */
2405
2406      while ((e1 == 0 || p1 < e1) &&
2407             (f1 == 0 || p1 < f1) &&
2408             (e2 == 0 || p2 < e2) &&
2409             (f2 == 0 || p2 < f2)) {
2410           if (n1 == 0) {
2411                if (u1)
2412                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2413                else {
2414                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2415                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2416                }
2417                q1 = foldbuf1;
2418                n1 = foldlen1;
2419           }
2420           if (n2 == 0) {
2421                if (u2)
2422                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2423                else {
2424                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2425                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2426                }
2427                q2 = foldbuf2;
2428                n2 = foldlen2;
2429           }
2430           while (n1 && n2) {
2431                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2432                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2433                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2434                    return 1; /* mismatch */
2435                n1 -= UTF8SKIP(q1);
2436                q1 += UTF8SKIP(q1);
2437                n2 -= UTF8SKIP(q2);
2438                q2 += UTF8SKIP(q2);
2439           }
2440           if (n1 == 0)
2441                p1 += u1 ? UTF8SKIP(p1) : 1;
2442           if (n2 == 0)
2443                p2 += u2 ? UTF8SKIP(p2) : 1;
2444
2445      }
2446
2447      /* A match is defined by all the scans that specified
2448       * an explicit length reaching their final goals. */
2449      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2450
2451      if (match) {
2452           if (pe1)
2453                *pe1 = (char*)p1;
2454           if (pe2)
2455                *pe2 = (char*)p2;
2456      }
2457
2458      return match ? 0 : 1; /* 0 match, 1 mismatch */
2459 }
2460
2461 /*
2462  * Local variables:
2463  * c-indentation-style: bsd
2464  * c-basic-offset: 4
2465  * indent-tabs-mode: t
2466  * End:
2467  *
2468  * ex: set ts=8 sts=4 sw=4 noet:
2469  */