utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34
  35 #ifndef EBCDIC
  36 /* Separate prototypes needed because in ASCII systems these
  37  * usually macros but they still are compiled as code, too. */
  38 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  39 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  40 #endif
  41
  42 static const char unees[] =
  43     "Malformed UTF-8 character (unexpected end of string)";
  44
  45 /*
  46 =head1 Unicode Support
  47
  48 This file contains various utility functions for manipulating UTF8-encoded
  49 strings. For the uninitiated, this is a method of representing arbitrary
  50 Unicode characters as a variable number of bytes, in such a way that
  51 characters in the ASCII range are unmodified, and a zero byte never appears
  52 within non-zero characters.
  53
  54 =for apidoc uvuni_to_utf8_flags
  55
  56 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  57 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  58 bytes available. The return value is the pointer to the byte after the
  59 end of the new character. In other words,
  60
  61     d = uvuni_to_utf8_flags(d, uv, flags);
  62
  63 or, in most cases,
  64
  65     d = uvuni_to_utf8(d, uv);
  66
  67 (which is equivalent to)
  68
  69     d = uvuni_to_utf8_flags(d, uv, 0);
  70
  71 is the recommended Unicode-aware way of saying
  72
  73     *(d++) = uv;
  74
  75 =cut
  76 */
  77
  78 U8 *
  79 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  80 {
  81     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
  82
  83     if (ckWARN(WARN_UTF8)) {
  84          if (UNICODE_IS_SURROGATE(uv) &&
  85              !(flags & UNICODE_ALLOW_SURROGATE))
  86               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  87          else if (
  88                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  89                     !(flags & UNICODE_ALLOW_FDD0))
  90                    ||
  91                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  92                     !(flags & UNICODE_ALLOW_FFFF))) &&
  93                   /* UNICODE_ALLOW_SUPER includes
  94                    * FFFEs and FFFFs beyond 0x10FFFF. */
  95                   ((uv <= PERL_UNICODE_MAX) ||
  96                    !(flags & UNICODE_ALLOW_SUPER))
  97                   )
  98               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  99                          "Unicode character 0x%04"UVxf" is illegal", uv);
 100     }
 101     if (UNI_IS_INVARIANT(uv)) {
 102         *d++ = (U8)UTF_TO_NATIVE(uv);
 103         return d;
 104     }
 105 #if defined(EBCDIC)
 106     else {
 107         STRLEN len  = UNISKIP(uv);
 108         U8 *p = d+len-1;
 109         while (p > d) {
 110             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 111             uv >>= UTF_ACCUMULATION_SHIFT;
 112         }
 113         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 114         return d+len;
 115     }
 116 #else /* Non loop style */
 117     if (uv < 0x800) {
 118         *d++ = (U8)(( uv >>  6)         | 0xc0);
 119         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 120         return d;
 121     }
 122     if (uv < 0x10000) {
 123         *d++ = (U8)(( uv >> 12)         | 0xe0);
 124         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 125         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 126         return d;
 127     }
 128     if (uv < 0x200000) {
 129         *d++ = (U8)(( uv >> 18)         | 0xf0);
 130         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 132         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 133         return d;
 134     }
 135     if (uv < 0x4000000) {
 136         *d++ = (U8)(( uv >> 24)         | 0xf8);
 137         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 138         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 139         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 140         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 141         return d;
 142     }
 143     if (uv < 0x80000000) {
 144         *d++ = (U8)(( uv >> 30)         | 0xfc);
 145         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 146         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 147         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 148         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 149         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 150         return d;
 151     }
 152 #ifdef HAS_QUAD
 153     if (uv < UTF8_QUAD_MAX)
 154 #endif
 155     {
 156         *d++ =                            0xfe; /* Can't match U+FEFF! */
 157         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 162         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 163         return d;
 164     }
 165 #ifdef HAS_QUAD
 166     {
 167         *d++ =                            0xff;         /* Can't match U+FFFE! */
 168         *d++ =                            0x80;         /* 6 Reserved bits */
 169         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 170         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 171         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 172         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 173         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 174         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 175         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 176         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 177         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 178         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 179         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 180         return d;
 181     }
 182 #endif
 183 #endif /* Loop style */
 184 }
 185
 186 /*
 187
 188 Tests if some arbitrary number of bytes begins in a valid UTF-8
 189 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 190 UTF-8 character.  The actual number of bytes in the UTF-8 character
 191 will be returned if it is valid, otherwise 0.
 192
 193 This is the "slow" version as opposed to the "fast" version which is
 194 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 195 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 196 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 197 you should use the _slow().  In practice this means that the _slow()
 198 will be used very rarely, since the maximum Unicode code point (as of
 199 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 200 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 201 five bytes or more.
 202
 203 =cut */
 204 STATIC STRLEN
 205 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 206 {
 207     U8 u = *s;
 208     STRLEN slen;
 209     UV uv, ouv;
 210
 211     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 212
 213     if (UTF8_IS_INVARIANT(u))
 214         return 1;
 215
 216     if (!UTF8_IS_START(u))
 217         return 0;
 218
 219     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 220         return 0;
 221
 222     slen = len - 1;
 223     s++;
 224 #ifdef EBCDIC
 225     u = NATIVE_TO_UTF(u);
 226 #endif
 227     u &= UTF_START_MASK(len);
 228     uv  = u;
 229     ouv = uv;
 230     while (slen--) {
 231         if (!UTF8_IS_CONTINUATION(*s))
 232             return 0;
 233         uv = UTF8_ACCUMULATE(uv, *s);
 234         if (uv < ouv)
 235             return 0;
 236         ouv = uv;
 237         s++;
 238     }
 239
 240     if ((STRLEN)UNISKIP(uv) < len)
 241         return 0;
 242
 243     return len;
 244 }
 245
 246 /*
 247 =for apidoc is_utf8_char
 248
 249 Tests if some arbitrary number of bytes begins in a valid UTF-8
 250 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
 251 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
 252 character will be returned if it is valid, otherwise 0.
 253
 254 =cut */
 255 STRLEN
 256 Perl_is_utf8_char(pTHX_ const U8 *s)
 257 {
 258     const STRLEN len = UTF8SKIP(s);
 259
 260     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 261     PERL_UNUSED_CONTEXT;
 262 #ifdef IS_UTF8_CHAR
 263     if (IS_UTF8_CHAR_FAST(len))
 264         return IS_UTF8_CHAR(s, len) ? len : 0;
 265 #endif /* #ifdef IS_UTF8_CHAR */
 266     return is_utf8_char_slow(s, len);
 267 }
 268
 269 /*
 270 =for apidoc is_utf8_string
 271
 272 Returns true if first C<len> bytes of the given string form a valid
 273 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 274 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 275 because a valid ASCII string is a valid UTF-8 string.
 276
 277 See also is_utf8_string_loclen() and is_utf8_string_loc().
 278
 279 =cut
 280 */
 281
 282 bool
 283 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 284 {
 285     const U8* const send = s + (len ? len : strlen((const char *)s));
 286     const U8* x = s;
 287
 288     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 289     PERL_UNUSED_CONTEXT;
 290
 291     while (x < send) {
 292         STRLEN c;
 293          /* Inline the easy bits of is_utf8_char() here for speed... */
 294          if (UTF8_IS_INVARIANT(*x))
 295               c = 1;
 296          else if (!UTF8_IS_START(*x))
 297              goto out;
 298          else {
 299               /* ... and call is_utf8_char() only if really needed. */
 300 #ifdef IS_UTF8_CHAR
 301              c = UTF8SKIP(x);
 302              if (IS_UTF8_CHAR_FAST(c)) {
 303                  if (!IS_UTF8_CHAR(x, c))
 304                      c = 0;
 305              }
 306              else
 307                 c = is_utf8_char_slow(x, c);
 308 #else
 309              c = is_utf8_char(x);
 310 #endif /* #ifdef IS_UTF8_CHAR */
 311               if (!c)
 312                   goto out;
 313          }
 314         x += c;
 315     }
 316
 317  out:
 318     if (x != send)
 319         return FALSE;
 320
 321     return TRUE;
 322 }
 323
 324 /*
 325 Implemented as a macro in utf8.h
 326
 327 =for apidoc is_utf8_string_loc
 328
 329 Like is_utf8_string() but stores the location of the failure (in the
 330 case of "utf8ness failure") or the location s+len (in the case of
 331 "utf8ness success") in the C<ep>.
 332
 333 See also is_utf8_string_loclen() and is_utf8_string().
 334
 335 =for apidoc is_utf8_string_loclen
 336
 337 Like is_utf8_string() but stores the location of the failure (in the
 338 case of "utf8ness failure") or the location s+len (in the case of
 339 "utf8ness success") in the C<ep>, and the number of UTF-8
 340 encoded characters in the C<el>.
 341
 342 See also is_utf8_string_loc() and is_utf8_string().
 343
 344 =cut
 345 */
 346
 347 bool
 348 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 349 {
 350     const U8* const send = s + (len ? len : strlen((const char *)s));
 351     const U8* x = s;
 352     STRLEN c;
 353     STRLEN outlen = 0;
 354
 355     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 356     PERL_UNUSED_CONTEXT;
 357
 358     while (x < send) {
 359          /* Inline the easy bits of is_utf8_char() here for speed... */
 360          if (UTF8_IS_INVARIANT(*x))
 361              c = 1;
 362          else if (!UTF8_IS_START(*x))
 363              goto out;
 364          else {
 365              /* ... and call is_utf8_char() only if really needed. */
 366 #ifdef IS_UTF8_CHAR
 367              c = UTF8SKIP(x);
 368              if (IS_UTF8_CHAR_FAST(c)) {
 369                  if (!IS_UTF8_CHAR(x, c))
 370                      c = 0;
 371              } else
 372                  c = is_utf8_char_slow(x, c);
 373 #else
 374              c = is_utf8_char(x);
 375 #endif /* #ifdef IS_UTF8_CHAR */
 376              if (!c)
 377                  goto out;
 378          }
 379          x += c;
 380          outlen++;
 381     }
 382
 383  out:
 384     if (el)
 385         *el = outlen;
 386
 387     if (ep)
 388         *ep = x;
 389     return (x == send);
 390 }
 391
 392 /*
 393
 394 =for apidoc utf8n_to_uvuni
 395
 396 Bottom level UTF-8 decode routine.
 397 Returns the Unicode code point value of the first character in the string C<s>
 398 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 399 C<retlen> will be set to the length, in bytes, of that character.
 400
 401 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 402 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 403 it is assumed that the caller will raise a warning, and this function
 404 will silently just set C<retlen> to C<-1> and return zero.  If the
 405 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 406 malformations will be given, C<retlen> will be set to the expected
 407 length of the UTF-8 character in bytes, and zero will be returned.
 408
 409 The C<flags> can also contain various flags to allow deviations from
 410 the strict UTF-8 encoding (see F<utf8.h>).
 411
 412 Most code should use utf8_to_uvchr() rather than call this directly.
 413
 414 =cut
 415 */
 416
 417 UV
 418 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 419 {
 420     dVAR;
 421     const U8 * const s0 = s;
 422     UV uv = *s, ouv = 0;
 423     STRLEN len = 1;
 424     const bool dowarn = ckWARN_d(WARN_UTF8);
 425     const UV startbyte = *s;
 426     STRLEN expectlen = 0;
 427     U32 warning = 0;
 428
 429     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 430
 431 /* This list is a superset of the UTF8_ALLOW_XXX. */
 432
 433 #define UTF8_WARN_EMPTY                          1
 434 #define UTF8_WARN_CONTINUATION                   2
 435 #define UTF8_WARN_NON_CONTINUATION               3
 436 #define UTF8_WARN_FE_FF                          4
 437 #define UTF8_WARN_SHORT                          5
 438 #define UTF8_WARN_OVERFLOW                       6
 439 #define UTF8_WARN_SURROGATE                      7
 440 #define UTF8_WARN_LONG                           8
 441 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 442
 443     if (curlen == 0 &&
 444         !(flags & UTF8_ALLOW_EMPTY)) {
 445         warning = UTF8_WARN_EMPTY;
 446         goto malformed;
 447     }
 448
 449     if (UTF8_IS_INVARIANT(uv)) {
 450         if (retlen)
 451             *retlen = 1;
 452         return (UV) (NATIVE_TO_UTF(*s));
 453     }
 454
 455     if (UTF8_IS_CONTINUATION(uv) &&
 456         !(flags & UTF8_ALLOW_CONTINUATION)) {
 457         warning = UTF8_WARN_CONTINUATION;
 458         goto malformed;
 459     }
 460
 461     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 462         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 463         warning = UTF8_WARN_NON_CONTINUATION;
 464         goto malformed;
 465     }
 466
 467 #ifdef EBCDIC
 468     uv = NATIVE_TO_UTF(uv);
 469 #else
 470     if ((uv == 0xfe || uv == 0xff) &&
 471         !(flags & UTF8_ALLOW_FE_FF)) {
 472         warning = UTF8_WARN_FE_FF;
 473         goto malformed;
 474     }
 475 #endif
 476
 477     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 478     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 479     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 480     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 481 #ifdef EBCDIC
 482     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 483     else                        { len =  7; uv &= 0x01; }
 484 #else
 485     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 486     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 487     else                        { len = 13; uv = 0; } /* whoa! */
 488 #endif
 489
 490     if (retlen)
 491         *retlen = len;
 492
 493     expectlen = len;
 494
 495     if ((curlen < expectlen) &&
 496         !(flags & UTF8_ALLOW_SHORT)) {
 497         warning = UTF8_WARN_SHORT;
 498         goto malformed;
 499     }
 500
 501     len--;
 502     s++;
 503     ouv = uv;
 504
 505     while (len--) {
 506         if (!UTF8_IS_CONTINUATION(*s) &&
 507             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 508             s--;
 509             warning = UTF8_WARN_NON_CONTINUATION;
 510             goto malformed;
 511         }
 512         else
 513             uv = UTF8_ACCUMULATE(uv, *s);
 514         if (!(uv > ouv)) {
 515             /* These cannot be allowed. */
 516             if (uv == ouv) {
 517                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 518                     warning = UTF8_WARN_LONG;
 519                     goto malformed;
 520                 }
 521             }
 522             else { /* uv < ouv */
 523                 /* This cannot be allowed. */
 524                 warning = UTF8_WARN_OVERFLOW;
 525                 goto malformed;
 526             }
 527         }
 528         s++;
 529         ouv = uv;
 530     }
 531
 532     if (UNICODE_IS_SURROGATE(uv) &&
 533         !(flags & UTF8_ALLOW_SURROGATE)) {
 534         warning = UTF8_WARN_SURROGATE;
 535         goto malformed;
 536     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 537                !(flags & UTF8_ALLOW_LONG)) {
 538         warning = UTF8_WARN_LONG;
 539         goto malformed;
 540     } else if (UNICODE_IS_ILLEGAL(uv) &&
 541                !(flags & UTF8_ALLOW_FFFF)) {
 542         warning = UTF8_WARN_FFFF;
 543         goto malformed;
 544     }
 545
 546     return uv;
 547
 548 malformed:
 549
 550     if (flags & UTF8_CHECK_ONLY) {
 551         if (retlen)
 552             *retlen = ((STRLEN) -1);
 553         return 0;
 554     }
 555
 556     if (dowarn) {
 557         SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
 558
 559         switch (warning) {
 560         case 0: /* Intentionally empty. */ break;
 561         case UTF8_WARN_EMPTY:
 562             sv_catpvs(sv, "(empty string)");
 563             break;
 564         case UTF8_WARN_CONTINUATION:
 565             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 566             break;
 567         case UTF8_WARN_NON_CONTINUATION:
 568             if (s == s0)
 569                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 570                            (UV)s[1], startbyte);
 571             else {
 572                 const int len = (int)(s-s0);
 573                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 574                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 575             }
 576
 577             break;
 578         case UTF8_WARN_FE_FF:
 579             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 580             break;
 581         case UTF8_WARN_SHORT:
 582             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 583                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 584             expectlen = curlen;         /* distance for caller to skip */
 585             break;
 586         case UTF8_WARN_OVERFLOW:
 587             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 588                            ouv, *s, startbyte);
 589             break;
 590         case UTF8_WARN_SURROGATE:
 591             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 592             break;
 593         case UTF8_WARN_LONG:
 594             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 595                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 596             break;
 597         case UTF8_WARN_FFFF:
 598             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 599             break;
 600         default:
 601             sv_catpvs(sv, "(unknown reason)");
 602             break;
 603         }
 604
 605         if (warning) {
 606             const char * const s = SvPVX_const(sv);
 607
 608             if (PL_op)
 609                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 610                             "%s in %s", s,  OP_DESC(PL_op));
 611             else
 612                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 613         }
 614     }
 615
 616     if (retlen)
 617         *retlen = expectlen ? expectlen : len;
 618
 619     return 0;
 620 }
 621
 622 /*
 623 =for apidoc utf8_to_uvchr
 624
 625 Returns the native character value of the first character in the string C<s>
 626 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 627 length, in bytes, of that character.
 628
 629 If C<s> does not point to a well-formed UTF-8 character, zero is
 630 returned and retlen is set, if possible, to -1.
 631
 632 =cut
 633 */
 634
 635 UV
 636 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 637 {
 638     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 639
 640     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 641                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 642 }
 643
 644 /*
 645 =for apidoc utf8_to_uvuni
 646
 647 Returns the Unicode code point of the first character in the string C<s>
 648 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 649 length, in bytes, of that character.
 650
 651 This function should only be used when the returned UV is considered
 652 an index into the Unicode semantic tables (e.g. swashes).
 653
 654 If C<s> does not point to a well-formed UTF-8 character, zero is
 655 returned and retlen is set, if possible, to -1.
 656
 657 =cut
 658 */
 659
 660 UV
 661 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 662 {
 663     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 664
 665     /* Call the low level routine asking for checks */
 666     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 667                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 668 }
 669
 670 /*
 671 =for apidoc utf8_length
 672
 673 Return the length of the UTF-8 char encoded string C<s> in characters.
 674 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 675 up past C<e>, croaks.
 676
 677 =cut
 678 */
 679
 680 STRLEN
 681 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 682 {
 683     dVAR;
 684     STRLEN len = 0;
 685
 686     PERL_ARGS_ASSERT_UTF8_LENGTH;
 687
 688     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 689      * the bitops (especially ~) can create illegal UTF-8.
 690      * In other words: in Perl UTF-8 is not just for Unicode. */
 691
 692     if (e < s)
 693         goto warn_and_return;
 694     while (s < e) {
 695         if (!UTF8_IS_INVARIANT(*s))
 696             s += UTF8SKIP(s);
 697         else
 698             s++;
 699         len++;
 700     }
 701
 702     if (e != s) {
 703         len--;
 704         warn_and_return:
 705         if (ckWARN_d(WARN_UTF8)) {
 706             if (PL_op)
 707                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 708                             "%s in %s", unees, OP_DESC(PL_op));
 709             else
 710                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 711         }
 712     }
 713
 714     return len;
 715 }
 716
 717 /*
 718 =for apidoc utf8_distance
 719
 720 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 721 and C<b>.
 722
 723 WARNING: use only if you *know* that the pointers point inside the
 724 same UTF-8 buffer.
 725
 726 =cut
 727 */
 728
 729 IV
 730 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 731 {
 732     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 733
 734     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 735 }
 736
 737 /*
 738 =for apidoc utf8_hop
 739
 740 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 741 forward or backward.
 742
 743 WARNING: do not use the following unless you *know* C<off> is within
 744 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 745 on the first byte of character or just after the last byte of a character.
 746
 747 =cut
 748 */
 749
 750 U8 *
 751 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 752 {
 753     PERL_ARGS_ASSERT_UTF8_HOP;
 754
 755     PERL_UNUSED_CONTEXT;
 756     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 757      * the bitops (especially ~) can create illegal UTF-8.
 758      * In other words: in Perl UTF-8 is not just for Unicode. */
 759
 760     if (off >= 0) {
 761         while (off--)
 762             s += UTF8SKIP(s);
 763     }
 764     else {
 765         while (off++) {
 766             s--;
 767             while (UTF8_IS_CONTINUATION(*s))
 768                 s--;
 769         }
 770     }
 771     return (U8 *)s;
 772 }
 773
 774 /*
 775 =for apidoc utf8_to_bytes
 776
 777 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 778 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 779 updates len to contain the new length.
 780 Returns zero on failure, setting C<len> to -1.
 781
 782 If you need a copy of the string, see C<bytes_from_utf8>.
 783
 784 =cut
 785 */
 786
 787 U8 *
 788 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 789 {
 790     U8 * const save = s;
 791     U8 * const send = s + *len;
 792     U8 *d;
 793
 794     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
 795
 796     /* ensure valid UTF-8 and chars < 256 before updating string */
 797     while (s < send) {
 798         U8 c = *s++;
 799
 800         if (!UTF8_IS_INVARIANT(c) &&
 801             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 802              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 803             *len = ((STRLEN) -1);
 804             return 0;
 805         }
 806     }
 807
 808     d = s = save;
 809     while (s < send) {
 810         STRLEN ulen;
 811         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 812         s += ulen;
 813     }
 814     *d = '\0';
 815     *len = d - save;
 816     return save;
 817 }
 818
 819 /*
 820 =for apidoc bytes_from_utf8
 821
 822 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 823 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 824 the newly-created string, and updates C<len> to contain the new
 825 length.  Returns the original string if no conversion occurs, C<len>
 826 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 827 0 if C<s> is converted or consisted entirely of characters that are invariant
 828 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
 829
 830 =cut
 831 */
 832
 833 U8 *
 834 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 835 {
 836     U8 *d;
 837     const U8 *start = s;
 838     const U8 *send;
 839     I32 count = 0;
 840
 841     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
 842
 843     PERL_UNUSED_CONTEXT;
 844     if (!*is_utf8)
 845         return (U8 *)start;
 846
 847     /* ensure valid UTF-8 and chars < 256 before converting string */
 848     for (send = s + *len; s < send;) {
 849         U8 c = *s++;
 850         if (!UTF8_IS_INVARIANT(c)) {
 851             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 852                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 853                 count++;
 854             else
 855                 return (U8 *)start;
 856         }
 857     }
 858
 859     *is_utf8 = FALSE;
 860
 861     Newx(d, (*len) - count + 1, U8);
 862     s = start; start = d;
 863     while (s < send) {
 864         U8 c = *s++;
 865         if (!UTF8_IS_INVARIANT(c)) {
 866             /* Then it is two-byte encoded */
 867             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 868             c = ASCII_TO_NATIVE(c);
 869         }
 870         *d++ = c;
 871     }
 872     *d = '\0';
 873     *len = d - start;
 874     return (U8 *)start;
 875 }
 876
 877 /*
 878 =for apidoc bytes_to_utf8
 879
 880 Converts a string C<s> of length C<len> from the native encoding into UTF-8.
 881 Returns a pointer to the newly-created string, and sets C<len> to
 882 reflect the new length.
 883
 884 A NUL character will be written after the end of the string.
 885
 886 If you want to convert to UTF-8 from encodings other than
 887 the native (Latin1 or EBCDIC),
 888 see sv_recode_to_utf8().
 889
 890 =cut
 891 */
 892
 893 U8*
 894 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 895 {
 896     const U8 * const send = s + (*len);
 897     U8 *d;
 898     U8 *dst;
 899
 900     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
 901     PERL_UNUSED_CONTEXT;
 902
 903     Newx(d, (*len) * 2 + 1, U8);
 904     dst = d;
 905
 906     while (s < send) {
 907         const UV uv = NATIVE_TO_ASCII(*s++);
 908         if (UNI_IS_INVARIANT(uv))
 909             *d++ = (U8)UTF_TO_NATIVE(uv);
 910         else {
 911             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 912             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 913         }
 914     }
 915     *d = '\0';
 916     *len = d-dst;
 917     return dst;
 918 }
 919
 920 /*
 921  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 922  *
 923  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 924  * We optimize for native, for obvious reasons. */
 925
 926 U8*
 927 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 928 {
 929     U8* pend;
 930     U8* dstart = d;
 931
 932     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
 933
 934     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 935          d[0] = 0;
 936          *newlen = 1;
 937          return d;
 938     }
 939
 940     if (bytelen & 1)
 941         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
 942
 943     pend = p + bytelen;
 944
 945     while (p < pend) {
 946         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 947         p += 2;
 948         if (uv < 0x80) {
 949 #ifdef EBCDIC
 950             *d++ = UNI_TO_NATIVE(uv);
 951 #else
 952             *d++ = (U8)uv;
 953 #endif
 954             continue;
 955         }
 956         if (uv < 0x800) {
 957             *d++ = (U8)(( uv >>  6)         | 0xc0);
 958             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 959             continue;
 960         }
 961         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 962             UV low = (p[0] << 8) + p[1];
 963             p += 2;
 964             if (low < 0xdc00 || low >= 0xdfff)
 965                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 966             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 967         }
 968         if (uv < 0x10000) {
 969             *d++ = (U8)(( uv >> 12)         | 0xe0);
 970             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 971             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 972             continue;
 973         }
 974         else {
 975             *d++ = (U8)(( uv >> 18)         | 0xf0);
 976             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 977             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 978             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 979             continue;
 980         }
 981     }
 982     *newlen = d - dstart;
 983     return d;
 984 }
 985
 986 /* Note: this one is slightly destructive of the source. */
 987
 988 U8*
 989 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 990 {
 991     U8* s = (U8*)p;
 992     U8* const send = s + bytelen;
 993
 994     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
 995
 996     while (s < send) {
 997         const U8 tmp = s[0];
 998         s[0] = s[1];
 999         s[1] = tmp;
1000         s += 2;
1001     }
1002     return utf16_to_utf8(p, d, bytelen, newlen);
1003 }
1004
1005 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1006
1007 bool
1008 Perl_is_uni_alnum(pTHX_ UV c)
1009 {
1010     U8 tmpbuf[UTF8_MAXBYTES+1];
1011     uvchr_to_utf8(tmpbuf, c);
1012     return is_utf8_alnum(tmpbuf);
1013 }
1014
1015 bool
1016 Perl_is_uni_alnumc(pTHX_ UV c)
1017 {
1018     U8 tmpbuf[UTF8_MAXBYTES+1];
1019     uvchr_to_utf8(tmpbuf, c);
1020     return is_utf8_alnumc(tmpbuf);
1021 }
1022
1023 bool
1024 Perl_is_uni_idfirst(pTHX_ UV c)
1025 {
1026     U8 tmpbuf[UTF8_MAXBYTES+1];
1027     uvchr_to_utf8(tmpbuf, c);
1028     return is_utf8_idfirst(tmpbuf);
1029 }
1030
1031 bool
1032 Perl_is_uni_alpha(pTHX_ UV c)
1033 {
1034     U8 tmpbuf[UTF8_MAXBYTES+1];
1035     uvchr_to_utf8(tmpbuf, c);
1036     return is_utf8_alpha(tmpbuf);
1037 }
1038
1039 bool
1040 Perl_is_uni_ascii(pTHX_ UV c)
1041 {
1042     U8 tmpbuf[UTF8_MAXBYTES+1];
1043     uvchr_to_utf8(tmpbuf, c);
1044     return is_utf8_ascii(tmpbuf);
1045 }
1046
1047 bool
1048 Perl_is_uni_space(pTHX_ UV c)
1049 {
1050     U8 tmpbuf[UTF8_MAXBYTES+1];
1051     uvchr_to_utf8(tmpbuf, c);
1052     return is_utf8_space(tmpbuf);
1053 }
1054
1055 bool
1056 Perl_is_uni_digit(pTHX_ UV c)
1057 {
1058     U8 tmpbuf[UTF8_MAXBYTES+1];
1059     uvchr_to_utf8(tmpbuf, c);
1060     return is_utf8_digit(tmpbuf);
1061 }
1062
1063 bool
1064 Perl_is_uni_upper(pTHX_ UV c)
1065 {
1066     U8 tmpbuf[UTF8_MAXBYTES+1];
1067     uvchr_to_utf8(tmpbuf, c);
1068     return is_utf8_upper(tmpbuf);
1069 }
1070
1071 bool
1072 Perl_is_uni_lower(pTHX_ UV c)
1073 {
1074     U8 tmpbuf[UTF8_MAXBYTES+1];
1075     uvchr_to_utf8(tmpbuf, c);
1076     return is_utf8_lower(tmpbuf);
1077 }
1078
1079 bool
1080 Perl_is_uni_cntrl(pTHX_ UV c)
1081 {
1082     U8 tmpbuf[UTF8_MAXBYTES+1];
1083     uvchr_to_utf8(tmpbuf, c);
1084     return is_utf8_cntrl(tmpbuf);
1085 }
1086
1087 bool
1088 Perl_is_uni_graph(pTHX_ UV c)
1089 {
1090     U8 tmpbuf[UTF8_MAXBYTES+1];
1091     uvchr_to_utf8(tmpbuf, c);
1092     return is_utf8_graph(tmpbuf);
1093 }
1094
1095 bool
1096 Perl_is_uni_print(pTHX_ UV c)
1097 {
1098     U8 tmpbuf[UTF8_MAXBYTES+1];
1099     uvchr_to_utf8(tmpbuf, c);
1100     return is_utf8_print(tmpbuf);
1101 }
1102
1103 bool
1104 Perl_is_uni_punct(pTHX_ UV c)
1105 {
1106     U8 tmpbuf[UTF8_MAXBYTES+1];
1107     uvchr_to_utf8(tmpbuf, c);
1108     return is_utf8_punct(tmpbuf);
1109 }
1110
1111 bool
1112 Perl_is_uni_xdigit(pTHX_ UV c)
1113 {
1114     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1115     uvchr_to_utf8(tmpbuf, c);
1116     return is_utf8_xdigit(tmpbuf);
1117 }
1118
1119 UV
1120 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1121 {
1122     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1123
1124     uvchr_to_utf8(p, c);
1125     return to_utf8_upper(p, p, lenp);
1126 }
1127
1128 UV
1129 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1130 {
1131     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1132
1133     uvchr_to_utf8(p, c);
1134     return to_utf8_title(p, p, lenp);
1135 }
1136
1137 UV
1138 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1139 {
1140     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1141
1142     uvchr_to_utf8(p, c);
1143     return to_utf8_lower(p, p, lenp);
1144 }
1145
1146 UV
1147 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1148 {
1149     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1150
1151     uvchr_to_utf8(p, c);
1152     return to_utf8_fold(p, p, lenp);
1153 }
1154
1155 /* for now these all assume no locale info available for Unicode > 255 */
1156
1157 bool
1158 Perl_is_uni_alnum_lc(pTHX_ UV c)
1159 {
1160     return is_uni_alnum(c);     /* XXX no locale support yet */
1161 }
1162
1163 bool
1164 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1165 {
1166     return is_uni_alnumc(c);    /* XXX no locale support yet */
1167 }
1168
1169 bool
1170 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1171 {
1172     return is_uni_idfirst(c);   /* XXX no locale support yet */
1173 }
1174
1175 bool
1176 Perl_is_uni_alpha_lc(pTHX_ UV c)
1177 {
1178     return is_uni_alpha(c);     /* XXX no locale support yet */
1179 }
1180
1181 bool
1182 Perl_is_uni_ascii_lc(pTHX_ UV c)
1183 {
1184     return is_uni_ascii(c);     /* XXX no locale support yet */
1185 }
1186
1187 bool
1188 Perl_is_uni_space_lc(pTHX_ UV c)
1189 {
1190     return is_uni_space(c);     /* XXX no locale support yet */
1191 }
1192
1193 bool
1194 Perl_is_uni_digit_lc(pTHX_ UV c)
1195 {
1196     return is_uni_digit(c);     /* XXX no locale support yet */
1197 }
1198
1199 bool
1200 Perl_is_uni_upper_lc(pTHX_ UV c)
1201 {
1202     return is_uni_upper(c);     /* XXX no locale support yet */
1203 }
1204
1205 bool
1206 Perl_is_uni_lower_lc(pTHX_ UV c)
1207 {
1208     return is_uni_lower(c);     /* XXX no locale support yet */
1209 }
1210
1211 bool
1212 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1213 {
1214     return is_uni_cntrl(c);     /* XXX no locale support yet */
1215 }
1216
1217 bool
1218 Perl_is_uni_graph_lc(pTHX_ UV c)
1219 {
1220     return is_uni_graph(c);     /* XXX no locale support yet */
1221 }
1222
1223 bool
1224 Perl_is_uni_print_lc(pTHX_ UV c)
1225 {
1226     return is_uni_print(c);     /* XXX no locale support yet */
1227 }
1228
1229 bool
1230 Perl_is_uni_punct_lc(pTHX_ UV c)
1231 {
1232     return is_uni_punct(c);     /* XXX no locale support yet */
1233 }
1234
1235 bool
1236 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1237 {
1238     return is_uni_xdigit(c);    /* XXX no locale support yet */
1239 }
1240
1241 U32
1242 Perl_to_uni_upper_lc(pTHX_ U32 c)
1243 {
1244     /* XXX returns only the first character -- do not use XXX */
1245     /* XXX no locale support yet */
1246     STRLEN len;
1247     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1248     return (U32)to_uni_upper(c, tmpbuf, &len);
1249 }
1250
1251 U32
1252 Perl_to_uni_title_lc(pTHX_ U32 c)
1253 {
1254     /* XXX returns only the first character XXX -- do not use XXX */
1255     /* XXX no locale support yet */
1256     STRLEN len;
1257     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1258     return (U32)to_uni_title(c, tmpbuf, &len);
1259 }
1260
1261 U32
1262 Perl_to_uni_lower_lc(pTHX_ U32 c)
1263 {
1264     /* XXX returns only the first character -- do not use XXX */
1265     /* XXX no locale support yet */
1266     STRLEN len;
1267     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1268     return (U32)to_uni_lower(c, tmpbuf, &len);
1269 }
1270
1271 static bool
1272 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1273                  const char *const swashname)
1274 {
1275     dVAR;
1276
1277     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1278
1279     if (!is_utf8_char(p))
1280         return FALSE;
1281     if (!*swash)
1282         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1283     return swash_fetch(*swash, p, TRUE) != 0;
1284 }
1285
1286 bool
1287 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1288 {
1289     dVAR;
1290
1291     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1292
1293     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1294      * descendant of isalnum(3), in other words, it doesn't
1295      * contain the '_'. --jhi */
1296     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1297 }
1298
1299 bool
1300 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1301 {
1302     dVAR;
1303
1304     PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1305
1306     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1307 }
1308
1309 bool
1310 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1311 {
1312     dVAR;
1313
1314     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1315
1316     if (*p == '_')
1317         return TRUE;
1318     /* is_utf8_idstart would be more logical. */
1319     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1320 }
1321
1322 bool
1323 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1324 {
1325     dVAR;
1326
1327     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1328
1329     if (*p == '_')
1330         return TRUE;
1331     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1332 }
1333
1334 bool
1335 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1336 {
1337     dVAR;
1338
1339     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1340
1341     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1342 }
1343
1344 bool
1345 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1346 {
1347     dVAR;
1348
1349     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1350
1351     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1352 }
1353
1354 bool
1355 Perl_is_utf8_space(pTHX_ const U8 *p)
1356 {
1357     dVAR;
1358
1359     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1360
1361     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1362 }
1363
1364 bool
1365 Perl_is_utf8_digit(pTHX_ const U8 *p)
1366 {
1367     dVAR;
1368
1369     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1370
1371     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1372 }
1373
1374 bool
1375 Perl_is_utf8_upper(pTHX_ const U8 *p)
1376 {
1377     dVAR;
1378
1379     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1380
1381     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1382 }
1383
1384 bool
1385 Perl_is_utf8_lower(pTHX_ const U8 *p)
1386 {
1387     dVAR;
1388
1389     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1390
1391     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1392 }
1393
1394 bool
1395 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1396 {
1397     dVAR;
1398
1399     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1400
1401     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1402 }
1403
1404 bool
1405 Perl_is_utf8_graph(pTHX_ const U8 *p)
1406 {
1407     dVAR;
1408
1409     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1410
1411     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1412 }
1413
1414 bool
1415 Perl_is_utf8_print(pTHX_ const U8 *p)
1416 {
1417     dVAR;
1418
1419     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1420
1421     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1422 }
1423
1424 bool
1425 Perl_is_utf8_punct(pTHX_ const U8 *p)
1426 {
1427     dVAR;
1428
1429     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1430
1431     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1432 }
1433
1434 bool
1435 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1436 {
1437     dVAR;
1438
1439     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1440
1441     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1442 }
1443
1444 bool
1445 Perl_is_utf8_mark(pTHX_ const U8 *p)
1446 {
1447     dVAR;
1448
1449     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1450
1451     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1452 }
1453
1454 /*
1455 =for apidoc to_utf8_case
1456
1457 The "p" contains the pointer to the UTF-8 string encoding
1458 the character that is being converted.
1459
1460 The "ustrp" is a pointer to the character buffer to put the
1461 conversion result to.  The "lenp" is a pointer to the length
1462 of the result.
1463
1464 The "swashp" is a pointer to the swash to use.
1465
1466 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1467 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1468 but not always, a multicharacter mapping), is tried first.
1469
1470 The "special" is a string like "utf8::ToSpecLower", which means the
1471 hash %utf8::ToSpecLower.  The access to the hash is through
1472 Perl_to_utf8_case().
1473
1474 The "normal" is a string like "ToLower" which means the swash
1475 %utf8::ToLower.
1476
1477 =cut */
1478
1479 UV
1480 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1481                         SV **swashp, const char *normal, const char *special)
1482 {
1483     dVAR;
1484     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1485     STRLEN len = 0;
1486     const UV uv0 = utf8_to_uvchr(p, NULL);
1487     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1488      * are necessary in EBCDIC, they are redundant no-ops
1489      * in ASCII-ish platforms, and hopefully optimized away. */
1490     const UV uv1 = NATIVE_TO_UNI(uv0);
1491
1492     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1493
1494     uvuni_to_utf8(tmpbuf, uv1);
1495
1496     if (!*swashp) /* load on-demand */
1497          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1498
1499     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1500     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1501          /* It might be "special" (sometimes, but not always,
1502           * a multicharacter mapping) */
1503          HV * const hv = get_hv(special, 0);
1504          SV **svp;
1505
1506          if (hv &&
1507              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1508              (*svp)) {
1509              const char *s;
1510
1511               s = SvPV_const(*svp, len);
1512               if (len == 1)
1513                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1514               else {
1515 #ifdef EBCDIC
1516                    /* If we have EBCDIC we need to remap the characters
1517                     * since any characters in the low 256 are Unicode
1518                     * code points, not EBCDIC. */
1519                    U8 *t = (U8*)s, *tend = t + len, *d;
1520
1521                    d = tmpbuf;
1522                    if (SvUTF8(*svp)) {
1523                         STRLEN tlen = 0;
1524
1525                         while (t < tend) {
1526                              const UV c = utf8_to_uvchr(t, &tlen);
1527                              if (tlen > 0) {
1528                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1529                                   t += tlen;
1530                              }
1531                              else
1532                                   break;
1533                         }
1534                    }
1535                    else {
1536                         while (t < tend) {
1537                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1538                              t++;
1539                         }
1540                    }
1541                    len = d - tmpbuf;
1542                    Copy(tmpbuf, ustrp, len, U8);
1543 #else
1544                    Copy(s, ustrp, len, U8);
1545 #endif
1546               }
1547          }
1548     }
1549
1550     if (!len && *swashp) {
1551         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1552
1553          if (uv2) {
1554               /* It was "normal" (a single character mapping). */
1555               const UV uv3 = UNI_TO_NATIVE(uv2);
1556               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1557          }
1558     }
1559
1560     if (!len) /* Neither: just copy. */
1561          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1562
1563     if (lenp)
1564          *lenp = len;
1565
1566     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1567 }
1568
1569 /*
1570 =for apidoc to_utf8_upper
1571
1572 Convert the UTF-8 encoded character at p to its uppercase version and
1573 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1574 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1575 the uppercase version may be longer than the original character.
1576
1577 The first character of the uppercased version is returned
1578 (but note, as explained above, that there may be more.)
1579
1580 =cut */
1581
1582 UV
1583 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1584 {
1585     dVAR;
1586
1587     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1588
1589     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1590                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1591 }
1592
1593 /*
1594 =for apidoc to_utf8_title
1595
1596 Convert the UTF-8 encoded character at p to its titlecase version and
1597 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1598 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1599 titlecase version may be longer than the original character.
1600
1601 The first character of the titlecased version is returned
1602 (but note, as explained above, that there may be more.)
1603
1604 =cut */
1605
1606 UV
1607 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1608 {
1609     dVAR;
1610
1611     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1612
1613     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1614                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1615 }
1616
1617 /*
1618 =for apidoc to_utf8_lower
1619
1620 Convert the UTF-8 encoded character at p to its lowercase version and
1621 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1622 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1623 lowercase version may be longer than the original character.
1624
1625 The first character of the lowercased version is returned
1626 (but note, as explained above, that there may be more.)
1627
1628 =cut */
1629
1630 UV
1631 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1632 {
1633     dVAR;
1634
1635     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1636
1637     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1638                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1639 }
1640
1641 /*
1642 =for apidoc to_utf8_fold
1643
1644 Convert the UTF-8 encoded character at p to its foldcase version and
1645 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1646 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1647 foldcase version may be longer than the original character (up to
1648 three characters).
1649
1650 The first character of the foldcased version is returned
1651 (but note, as explained above, that there may be more.)
1652
1653 =cut */
1654
1655 UV
1656 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1657 {
1658     dVAR;
1659
1660     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1661
1662     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1663                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1664 }
1665
1666 /* Note:
1667  * A "swash" is a swatch hash.
1668  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1669  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1670  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1671  */
1672 SV*
1673 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1674 {
1675     dVAR;
1676     SV* retval;
1677     dSP;
1678     const size_t pkg_len = strlen(pkg);
1679     const size_t name_len = strlen(name);
1680     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1681     SV* errsv_save;
1682
1683     PERL_ARGS_ASSERT_SWASH_INIT;
1684
1685     PUSHSTACKi(PERLSI_MAGIC);
1686     ENTER;
1687     SAVEI32(PL_hints);
1688     PL_hints = 0;
1689     save_re_context();
1690     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1691         ENTER;
1692         errsv_save = newSVsv(ERRSV);
1693         /* It is assumed that callers of this routine are not passing in any
1694            user derived data.  */
1695         /* Need to do this after save_re_context() as it will set PL_tainted to
1696            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1697            Even line to create errsv_save can turn on PL_tainted.  */
1698         SAVEBOOL(PL_tainted);
1699         PL_tainted = 0;
1700         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1701                          NULL);
1702         if (!SvTRUE(ERRSV))
1703             sv_setsv(ERRSV, errsv_save);
1704         SvREFCNT_dec(errsv_save);
1705         LEAVE;
1706     }
1707     SPAGAIN;
1708     PUSHMARK(SP);
1709     EXTEND(SP,5);
1710     mPUSHp(pkg, pkg_len);
1711     mPUSHp(name, name_len);
1712     PUSHs(listsv);
1713     mPUSHi(minbits);
1714     mPUSHi(none);
1715     PUTBACK;
1716     errsv_save = newSVsv(ERRSV);
1717     if (call_method("SWASHNEW", G_SCALAR))
1718         retval = newSVsv(*PL_stack_sp--);
1719     else
1720         retval = &PL_sv_undef;
1721     if (!SvTRUE(ERRSV))
1722         sv_setsv(ERRSV, errsv_save);
1723     SvREFCNT_dec(errsv_save);
1724     LEAVE;
1725     POPSTACK;
1726     if (IN_PERL_COMPILETIME) {
1727         CopHINTS_set(PL_curcop, PL_hints);
1728     }
1729     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1730         if (SvPOK(retval))
1731             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1732                        SVfARG(retval));
1733         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1734     }
1735     return retval;
1736 }
1737
1738
1739 /* This API is wrong for special case conversions since we may need to
1740  * return several Unicode characters for a single Unicode character
1741  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1742  * the lower-level routine, and it is similarly broken for returning
1743  * multiple values.  --jhi */
1744 /* Now SWASHGET is recasted into S_swash_get in this file. */
1745
1746 /* Note:
1747  * Returns the value of property/mapping C<swash> for the first character
1748  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1749  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1750  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1751  */
1752 UV
1753 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1754 {
1755     dVAR;
1756     HV *const hv = MUTABLE_HV(SvRV(swash));
1757     U32 klen;
1758     U32 off;
1759     STRLEN slen;
1760     STRLEN needents;
1761     const U8 *tmps = NULL;
1762     U32 bit;
1763     SV *swatch;
1764     U8 tmputf8[2];
1765     const UV c = NATIVE_TO_ASCII(*ptr);
1766
1767     PERL_ARGS_ASSERT_SWASH_FETCH;
1768
1769     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1770         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1771         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1772         ptr = tmputf8;
1773     }
1774     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1775      * then the "swatch" is a vec() for al the chars which start
1776      * with 0xAA..0xYY
1777      * So the key in the hash (klen) is length of encoded char -1
1778      */
1779     klen = UTF8SKIP(ptr) - 1;
1780     off  = ptr[klen];
1781
1782     if (klen == 0) {
1783       /* If char in invariant then swatch is for all the invariant chars
1784        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1785        */
1786         needents = UTF_CONTINUATION_MARK;
1787         off      = NATIVE_TO_UTF(ptr[klen]);
1788     }
1789     else {
1790       /* If char is encoded then swatch is for the prefix */
1791         needents = (1 << UTF_ACCUMULATION_SHIFT);
1792         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1793     }
1794
1795     /*
1796      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1797      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1798      * it's nothing to sniff at.)  Pity we usually come through at least
1799      * two function calls to get here...
1800      *
1801      * NB: this code assumes that swatches are never modified, once generated!
1802      */
1803
1804     if (hv   == PL_last_swash_hv &&
1805         klen == PL_last_swash_klen &&
1806         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1807     {
1808         tmps = PL_last_swash_tmps;
1809         slen = PL_last_swash_slen;
1810     }
1811     else {
1812         /* Try our second-level swatch cache, kept in a hash. */
1813         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1814
1815         /* If not cached, generate it via swash_get */
1816         if (!svp || !SvPOK(*svp)
1817                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1818             /* We use utf8n_to_uvuni() as we want an index into
1819                Unicode tables, not a native character number.
1820              */
1821             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1822                                            ckWARN(WARN_UTF8) ?
1823                                            0 : UTF8_ALLOW_ANY);
1824             swatch = swash_get(swash,
1825                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1826                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1827                                 needents);
1828
1829             if (IN_PERL_COMPILETIME)
1830                 CopHINTS_set(PL_curcop, PL_hints);
1831
1832             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1833
1834             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1835                      || (slen << 3) < needents)
1836                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1837         }
1838
1839         PL_last_swash_hv = hv;
1840         assert(klen <= sizeof(PL_last_swash_key));
1841         PL_last_swash_klen = (U8)klen;
1842         /* FIXME change interpvar.h?  */
1843         PL_last_swash_tmps = (U8 *) tmps;
1844         PL_last_swash_slen = slen;
1845         if (klen)
1846             Copy(ptr, PL_last_swash_key, klen, U8);
1847     }
1848
1849     switch ((int)((slen << 3) / needents)) {
1850     case 1:
1851         bit = 1 << (off & 7);
1852         off >>= 3;
1853         return (tmps[off] & bit) != 0;
1854     case 8:
1855         return tmps[off];
1856     case 16:
1857         off <<= 1;
1858         return (tmps[off] << 8) + tmps[off + 1] ;
1859     case 32:
1860         off <<= 2;
1861         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1862     }
1863     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1864     NORETURN_FUNCTION_END;
1865 }
1866
1867 /* Note:
1868  * Returns a swatch (a bit vector string) for a code point sequence
1869  * that starts from the value C<start> and comprises the number C<span>.
1870  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1871  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1872  */
1873 STATIC SV*
1874 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1875 {
1876     SV *swatch;
1877     U8 *l, *lend, *x, *xend, *s;
1878     STRLEN lcur, xcur, scur;
1879     HV *const hv = MUTABLE_HV(SvRV(swash));
1880     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1881     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1882     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1883     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1884     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1885     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1886     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1887     const STRLEN bits  = SvUV(*bitssvp);
1888     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1889     const UV     none  = SvUV(*nonesvp);
1890     const UV     end   = start + span;
1891
1892     PERL_ARGS_ASSERT_SWASH_GET;
1893
1894     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1895         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1896                                                  (UV)bits);
1897     }
1898
1899     /* create and initialize $swatch */
1900     scur   = octets ? (span * octets) : (span + 7) / 8;
1901     swatch = newSV(scur);
1902     SvPOK_on(swatch);
1903     s = (U8*)SvPVX(swatch);
1904     if (octets && none) {
1905         const U8* const e = s + scur;
1906         while (s < e) {
1907             if (bits == 8)
1908                 *s++ = (U8)(none & 0xff);
1909             else if (bits == 16) {
1910                 *s++ = (U8)((none >>  8) & 0xff);
1911                 *s++ = (U8)( none        & 0xff);
1912             }
1913             else if (bits == 32) {
1914                 *s++ = (U8)((none >> 24) & 0xff);
1915                 *s++ = (U8)((none >> 16) & 0xff);
1916                 *s++ = (U8)((none >>  8) & 0xff);
1917                 *s++ = (U8)( none        & 0xff);
1918             }
1919         }
1920         *s = '\0';
1921     }
1922     else {
1923         (void)memzero((U8*)s, scur + 1);
1924     }
1925     SvCUR_set(swatch, scur);
1926     s = (U8*)SvPVX(swatch);
1927
1928     /* read $swash->{LIST} */
1929     l = (U8*)SvPV(*listsvp, lcur);
1930     lend = l + lcur;
1931     while (l < lend) {
1932         UV min, max, val;
1933         STRLEN numlen;
1934         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1935
1936         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1937
1938         numlen = lend - l;
1939         min = grok_hex((char *)l, &numlen, &flags, NULL);
1940         if (numlen)
1941             l += numlen;
1942         else if (nl) {
1943             l = nl + 1; /* 1 is length of "\n" */
1944             continue;
1945         }
1946         else {
1947             l = lend; /* to LIST's end at which \n is not found */
1948             break;
1949         }
1950
1951         if (isBLANK(*l)) {
1952             ++l;
1953             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1954             numlen = lend - l;
1955             max = grok_hex((char *)l, &numlen, &flags, NULL);
1956             if (numlen)
1957                 l += numlen;
1958             else
1959                 max = min;
1960
1961             if (octets) {
1962                 if (isBLANK(*l)) {
1963                     ++l;
1964                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1965                             PERL_SCAN_DISALLOW_PREFIX;
1966                     numlen = lend - l;
1967                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1968                     if (numlen)
1969                         l += numlen;
1970                     else
1971                         val = 0;
1972                 }
1973                 else {
1974                     val = 0;
1975                     if (typeto) {
1976                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1977                                          typestr, l);
1978                     }
1979                 }
1980             }
1981             else
1982                 val = 0; /* bits == 1, then val should be ignored */
1983         }
1984         else {
1985             max = min;
1986             if (octets) {
1987                 val = 0;
1988                 if (typeto) {
1989                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1990                 }
1991             }
1992             else
1993                 val = 0; /* bits == 1, then val should be ignored */
1994         }
1995
1996         if (nl)
1997             l = nl + 1;
1998         else
1999             l = lend;
2000
2001         if (max < start)
2002             continue;
2003
2004         if (octets) {
2005             UV key;
2006             if (min < start) {
2007                 if (!none || val < none) {
2008                     val += start - min;
2009                 }
2010                 min = start;
2011             }
2012             for (key = min; key <= max; key++) {
2013                 STRLEN offset;
2014                 if (key >= end)
2015                     goto go_out_list;
2016                 /* offset must be non-negative (start <= min <= key < end) */
2017                 offset = octets * (key - start);
2018                 if (bits == 8)
2019                     s[offset] = (U8)(val & 0xff);
2020                 else if (bits == 16) {
2021                     s[offset    ] = (U8)((val >>  8) & 0xff);
2022                     s[offset + 1] = (U8)( val        & 0xff);
2023                 }
2024                 else if (bits == 32) {
2025                     s[offset    ] = (U8)((val >> 24) & 0xff);
2026                     s[offset + 1] = (U8)((val >> 16) & 0xff);
2027                     s[offset + 2] = (U8)((val >>  8) & 0xff);
2028                     s[offset + 3] = (U8)( val        & 0xff);
2029                 }
2030
2031                 if (!none || val < none)
2032                     ++val;
2033             }
2034         }
2035         else { /* bits == 1, then val should be ignored */
2036             UV key;
2037             if (min < start)
2038                 min = start;
2039             for (key = min; key <= max; key++) {
2040                 const STRLEN offset = (STRLEN)(key - start);
2041                 if (key >= end)
2042                     goto go_out_list;
2043                 s[offset >> 3] |= 1 << (offset & 7);
2044             }
2045         }
2046     } /* while */
2047   go_out_list:
2048
2049     /* read $swash->{EXTRAS} */
2050     x = (U8*)SvPV(*extssvp, xcur);
2051     xend = x + xcur;
2052     while (x < xend) {
2053         STRLEN namelen;
2054         U8 *namestr;
2055         SV** othersvp;
2056         HV* otherhv;
2057         STRLEN otherbits;
2058         SV **otherbitssvp, *other;
2059         U8 *s, *o, *nl;
2060         STRLEN slen, olen;
2061
2062         const U8 opc = *x++;
2063         if (opc == '\n')
2064             continue;
2065
2066         nl = (U8*)memchr(x, '\n', xend - x);
2067
2068         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2069             if (nl) {
2070                 x = nl + 1; /* 1 is length of "\n" */
2071                 continue;
2072             }
2073             else {
2074                 x = xend; /* to EXTRAS' end at which \n is not found */
2075                 break;
2076             }
2077         }
2078
2079         namestr = x;
2080         if (nl) {
2081             namelen = nl - namestr;
2082             x = nl + 1;
2083         }
2084         else {
2085             namelen = xend - namestr;
2086             x = xend;
2087         }
2088
2089         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2090         otherhv = MUTABLE_HV(SvRV(*othersvp));
2091         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2092         otherbits = (STRLEN)SvUV(*otherbitssvp);
2093         if (bits < otherbits)
2094             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2095
2096         /* The "other" swatch must be destroyed after. */
2097         other = swash_get(*othersvp, start, span);
2098         o = (U8*)SvPV(other, olen);
2099
2100         if (!olen)
2101             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2102
2103         s = (U8*)SvPV(swatch, slen);
2104         if (bits == 1 && otherbits == 1) {
2105             if (slen != olen)
2106                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2107
2108             switch (opc) {
2109             case '+':
2110                 while (slen--)
2111                     *s++ |= *o++;
2112                 break;
2113             case '!':
2114                 while (slen--)
2115                     *s++ |= ~*o++;
2116                 break;
2117             case '-':
2118                 while (slen--)
2119                     *s++ &= ~*o++;
2120                 break;
2121             case '&':
2122                 while (slen--)
2123                     *s++ &= *o++;
2124                 break;
2125             default:
2126                 break;
2127             }
2128         }
2129         else {
2130             STRLEN otheroctets = otherbits >> 3;
2131             STRLEN offset = 0;
2132             U8* const send = s + slen;
2133
2134             while (s < send) {
2135                 UV otherval = 0;
2136
2137                 if (otherbits == 1) {
2138                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2139                     ++offset;
2140                 }
2141                 else {
2142                     STRLEN vlen = otheroctets;
2143                     otherval = *o++;
2144                     while (--vlen) {
2145                         otherval <<= 8;
2146                         otherval |= *o++;
2147                     }
2148                 }
2149
2150                 if (opc == '+' && otherval)
2151                     NOOP;   /* replace with otherval */
2152                 else if (opc == '!' && !otherval)
2153                     otherval = 1;
2154                 else if (opc == '-' && otherval)
2155                     otherval = 0;
2156                 else if (opc == '&' && !otherval)
2157                     otherval = 0;
2158                 else {
2159                     s += octets; /* no replacement */
2160                     continue;
2161                 }
2162
2163                 if (bits == 8)
2164                     *s++ = (U8)( otherval & 0xff);
2165                 else if (bits == 16) {
2166                     *s++ = (U8)((otherval >>  8) & 0xff);
2167                     *s++ = (U8)( otherval        & 0xff);
2168                 }
2169                 else if (bits == 32) {
2170                     *s++ = (U8)((otherval >> 24) & 0xff);
2171                     *s++ = (U8)((otherval >> 16) & 0xff);
2172                     *s++ = (U8)((otherval >>  8) & 0xff);
2173                     *s++ = (U8)( otherval        & 0xff);
2174                 }
2175             }
2176         }
2177         sv_free(other); /* through with it! */
2178     } /* while */
2179     return swatch;
2180 }
2181
2182 /*
2183 =for apidoc uvchr_to_utf8
2184
2185 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2186 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2187 bytes available. The return value is the pointer to the byte after the
2188 end of the new character. In other words,
2189
2190     d = uvchr_to_utf8(d, uv);
2191
2192 is the recommended wide native character-aware way of saying
2193
2194     *(d++) = uv;
2195
2196 =cut
2197 */
2198
2199 /* On ASCII machines this is normally a macro but we want a
2200    real function in case XS code wants it
2201 */
2202 U8 *
2203 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2204 {
2205     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2206
2207     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2208 }
2209
2210 U8 *
2211 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2212 {
2213     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2214
2215     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2216 }
2217
2218 /*
2219 =for apidoc utf8n_to_uvchr
2220 flags
2221
2222 Returns the native character value of the first character in the string
2223 C<s>
2224 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2225 length, in bytes, of that character.
2226
2227 Allows length and flags to be passed to low level routine.
2228
2229 =cut
2230 */
2231 /* On ASCII machines this is normally a macro but we want
2232    a real function in case XS code wants it
2233 */
2234 UV
2235 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2236 U32 flags)
2237 {
2238     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2239
2240     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2241
2242     return UNI_TO_NATIVE(uv);
2243 }
2244
2245 /*
2246 =for apidoc pv_uni_display
2247
2248 Build to the scalar dsv a displayable version of the string spv,
2249 length len, the displayable version being at most pvlim bytes long
2250 (if longer, the rest is truncated and "..." will be appended).
2251
2252 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2253 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2254 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2255 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2256 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2257 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2258
2259 The pointer to the PV of the dsv is returned.
2260
2261 =cut */
2262 char *
2263 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2264 {
2265     int truncated = 0;
2266     const char *s, *e;
2267
2268     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2269
2270     sv_setpvs(dsv, "");
2271     SvUTF8_off(dsv);
2272     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2273          UV u;
2274           /* This serves double duty as a flag and a character to print after
2275              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2276           */
2277          char ok = 0;
2278
2279          if (pvlim && SvCUR(dsv) >= pvlim) {
2280               truncated++;
2281               break;
2282          }
2283          u = utf8_to_uvchr((U8*)s, 0);
2284          if (u < 256) {
2285              const unsigned char c = (unsigned char)u & 0xFF;
2286              if (flags & UNI_DISPLAY_BACKSLASH) {
2287                  switch (c) {
2288                  case '\n':
2289                      ok = 'n'; break;
2290                  case '\r':
2291                      ok = 'r'; break;
2292                  case '\t':
2293                      ok = 't'; break;
2294                  case '\f':
2295                      ok = 'f'; break;
2296                  case '\a':
2297                      ok = 'a'; break;
2298                  case '\\':
2299                      ok = '\\'; break;
2300                  default: break;
2301                  }
2302                  if (ok) {
2303                      const char string = ok;
2304                      sv_catpvs(dsv, "\\");
2305                      sv_catpvn(dsv, &string, 1);
2306                  }
2307              }
2308              /* isPRINT() is the locale-blind version. */
2309              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2310                  const char string = c;
2311                  sv_catpvn(dsv, &string, 1);
2312                  ok = 1;
2313              }
2314          }
2315          if (!ok)
2316              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2317     }
2318     if (truncated)
2319          sv_catpvs(dsv, "...");
2320
2321     return SvPVX(dsv);
2322 }
2323
2324 /*
2325 =for apidoc sv_uni_display
2326
2327 Build to the scalar dsv a displayable version of the scalar sv,
2328 the displayable version being at most pvlim bytes long
2329 (if longer, the rest is truncated and "..." will be appended).
2330
2331 The flags argument is as in pv_uni_display().
2332
2333 The pointer to the PV of the dsv is returned.
2334
2335 =cut
2336 */
2337 char *
2338 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2339 {
2340     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2341
2342      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2343                                 SvCUR(ssv), pvlim, flags);
2344 }
2345
2346 /*
2347 =for apidoc ibcmp_utf8
2348
2349 Return true if the strings s1 and s2 differ case-insensitively, false
2350 if not (if they are equal case-insensitively).  If u1 is true, the
2351 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2352 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2353 are false, the respective string is assumed to be in native 8-bit
2354 encoding.
2355
2356 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2357 in there (they will point at the beginning of the I<next> character).
2358 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2359 pointers beyond which scanning will not continue under any
2360 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2361 s2+l2 will be used as goal end pointers that will also stop the scan,
2362 and which qualify towards defining a successful match: all the scans
2363 that define an explicit length must reach their goal pointers for
2364 a match to succeed).
2365
2366 For case-insensitiveness, the "casefolding" of Unicode is used
2367 instead of upper/lowercasing both the characters, see
2368 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2369
2370 =cut */
2371 I32
2372 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2373 {
2374      dVAR;
2375      register const U8 *p1  = (const U8*)s1;
2376      register const U8 *p2  = (const U8*)s2;
2377      register const U8 *f1 = NULL;
2378      register const U8 *f2 = NULL;
2379      register U8 *e1 = NULL;
2380      register U8 *q1 = NULL;
2381      register U8 *e2 = NULL;
2382      register U8 *q2 = NULL;
2383      STRLEN n1 = 0, n2 = 0;
2384      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2385      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2386      U8 natbuf[1+1];
2387      STRLEN foldlen1, foldlen2;
2388      bool match;
2389
2390      PERL_ARGS_ASSERT_IBCMP_UTF8;
2391
2392      if (pe1)
2393           e1 = *(U8**)pe1;
2394      /* assert(e1 || l1); */
2395      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2396           f1 = (const U8*)s1 + l1;
2397      if (pe2)
2398           e2 = *(U8**)pe2;
2399      /* assert(e2 || l2); */
2400      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2401           f2 = (const U8*)s2 + l2;
2402
2403      /* This shouldn't happen. However, putting an assert() there makes some
2404       * tests fail. */
2405      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2406      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2407           return 1; /* mismatch; possible infinite loop or false positive */
2408
2409      if (!u1 || !u2)
2410           natbuf[1] = 0; /* Need to terminate the buffer. */
2411
2412      while ((e1 == 0 || p1 < e1) &&
2413             (f1 == 0 || p1 < f1) &&
2414             (e2 == 0 || p2 < e2) &&
2415             (f2 == 0 || p2 < f2)) {
2416           if (n1 == 0) {
2417                if (u1)
2418                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2419                else {
2420                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2421                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2422                }
2423                q1 = foldbuf1;
2424                n1 = foldlen1;
2425           }
2426           if (n2 == 0) {
2427                if (u2)
2428                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2429                else {
2430                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2431                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2432                }
2433                q2 = foldbuf2;
2434                n2 = foldlen2;
2435           }
2436           while (n1 && n2) {
2437                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2438                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2439                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2440                    return 1; /* mismatch */
2441                n1 -= UTF8SKIP(q1);
2442                q1 += UTF8SKIP(q1);
2443                n2 -= UTF8SKIP(q2);
2444                q2 += UTF8SKIP(q2);
2445           }
2446           if (n1 == 0)
2447                p1 += u1 ? UTF8SKIP(p1) : 1;
2448           if (n2 == 0)
2449                p2 += u2 ? UTF8SKIP(p2) : 1;
2450
2451      }
2452
2453      /* A match is defined by all the scans that specified
2454       * an explicit length reaching their final goals. */
2455      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2456
2457      if (match) {
2458           if (pe1)
2459                *pe1 = (char*)p1;
2460           if (pe2)
2461                *pe2 = (char*)p2;
2462      }
2463
2464      return match ? 0 : 1; /* 0 match, 1 mismatch */
2465 }
2466
2467 /*
2468  * Local variables:
2469  * c-indentation-style: bsd
2470  * c-basic-offset: 4
2471  * indent-tabs-mode: t
2472  * End:
2473  *
2474  * ex: set ts=8 sts=4 sw=4 noet:
2475  */