utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34
  35 #ifndef EBCDIC
  36 /* Separate prototypes needed because in ASCII systems these
  37  * usually macros but they still are compiled as code, too. */
  38 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  39 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  40 #endif
  41
  42 static const char unees[] =
  43     "Malformed UTF-8 character (unexpected end of string)";
  44
  45 /*
  46 =head1 Unicode Support
  47
  48 This file contains various utility functions for manipulating UTF8-encoded
  49 strings. For the uninitiated, this is a method of representing arbitrary
  50 Unicode characters as a variable number of bytes, in such a way that
  51 characters in the ASCII range are unmodified, and a zero byte never appears
  52 within non-zero characters.
  53
  54 =cut
  55 */
  56
  57 /*
  58 =for apidoc is_ascii_string
  59
  60 Returns true if first C<len> bytes of the given string are ASCII (i.e. none
  61 of them even raise the question of UTF-8-ness).
  62
  63 See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  64
  65 =cut
  66 */
  67
  68 bool
  69 Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len)
  70 {
  71     const U8* const send = s + (len ? len : strlen((const char *)s));
  72     const U8* x = s;
  73
  74     PERL_ARGS_ASSERT_IS_ASCII_STRING;
  75     PERL_UNUSED_CONTEXT;
  76
  77     for (; x < send; ++x) {
  78         if (!UTF8_IS_INVARIANT(*x))
  79             break;
  80     }
  81
  82     return x == send;
  83 }
  84
  85 /*
  86 =for apidoc uvuni_to_utf8_flags
  87
  88 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  89 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  90 bytes available. The return value is the pointer to the byte after the
  91 end of the new character. In other words,
  92
  93     d = uvuni_to_utf8_flags(d, uv, flags);
  94
  95 or, in most cases,
  96
  97     d = uvuni_to_utf8(d, uv);
  98
  99 (which is equivalent to)
 100
 101     d = uvuni_to_utf8_flags(d, uv, 0);
 102
 103 is the recommended Unicode-aware way of saying
 104
 105     *(d++) = uv;
 106
 107 =cut
 108 */
 109
 110 U8 *
 111 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 112 {
 113     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
 114
 115     if (ckWARN(WARN_UTF8)) {
 116          if (UNICODE_IS_SURROGATE(uv) &&
 117              !(flags & UNICODE_ALLOW_SURROGATE))
 118               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
 119          else if (
 120                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
 121                     !(flags & UNICODE_ALLOW_FDD0))
 122                    ||
 123                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
 124                     !(flags & UNICODE_ALLOW_FFFF))) &&
 125                   /* UNICODE_ALLOW_SUPER includes
 126                    * FFFEs and FFFFs beyond 0x10FFFF. */
 127                   ((uv <= PERL_UNICODE_MAX) ||
 128                    !(flags & UNICODE_ALLOW_SUPER))
 129                   )
 130               Perl_warner(aTHX_ packWARN(WARN_UTF8),
 131                          "Unicode character 0x%04"UVxf" is illegal", uv);
 132     }
 133     if (UNI_IS_INVARIANT(uv)) {
 134         *d++ = (U8)UTF_TO_NATIVE(uv);
 135         return d;
 136     }
 137 #if defined(EBCDIC)
 138     else {
 139         STRLEN len  = UNISKIP(uv);
 140         U8 *p = d+len-1;
 141         while (p > d) {
 142             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 143             uv >>= UTF_ACCUMULATION_SHIFT;
 144         }
 145         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 146         return d+len;
 147     }
 148 #else /* Non loop style */
 149     if (uv < 0x800) {
 150         *d++ = (U8)(( uv >>  6)         | 0xc0);
 151         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 152         return d;
 153     }
 154     if (uv < 0x10000) {
 155         *d++ = (U8)(( uv >> 12)         | 0xe0);
 156         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 157         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 158         return d;
 159     }
 160     if (uv < 0x200000) {
 161         *d++ = (U8)(( uv >> 18)         | 0xf0);
 162         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 163         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 164         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 165         return d;
 166     }
 167     if (uv < 0x4000000) {
 168         *d++ = (U8)(( uv >> 24)         | 0xf8);
 169         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 170         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 171         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 172         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 173         return d;
 174     }
 175     if (uv < 0x80000000) {
 176         *d++ = (U8)(( uv >> 30)         | 0xfc);
 177         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 178         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 179         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 180         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 181         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 182         return d;
 183     }
 184 #ifdef HAS_QUAD
 185     if (uv < UTF8_QUAD_MAX)
 186 #endif
 187     {
 188         *d++ =                            0xfe; /* Can't match U+FEFF! */
 189         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 190         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 191         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 192         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 193         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 194         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 195         return d;
 196     }
 197 #ifdef HAS_QUAD
 198     {
 199         *d++ =                            0xff;         /* Can't match U+FFFE! */
 200         *d++ =                            0x80;         /* 6 Reserved bits */
 201         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 202         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 203         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 204         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 205         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 206         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 207         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 208         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 209         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 210         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 211         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 212         return d;
 213     }
 214 #endif
 215 #endif /* Loop style */
 216 }
 217
 218 /*
 219
 220 Tests if some arbitrary number of bytes begins in a valid UTF-8
 221 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 222 UTF-8 character.  The actual number of bytes in the UTF-8 character
 223 will be returned if it is valid, otherwise 0.
 224
 225 This is the "slow" version as opposed to the "fast" version which is
 226 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 227 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 228 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 229 you should use the _slow().  In practice this means that the _slow()
 230 will be used very rarely, since the maximum Unicode code point (as of
 231 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 232 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 233 five bytes or more.
 234
 235 =cut */
 236 STATIC STRLEN
 237 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 238 {
 239     U8 u = *s;
 240     STRLEN slen;
 241     UV uv, ouv;
 242
 243     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 244
 245     if (UTF8_IS_INVARIANT(u))
 246         return 1;
 247
 248     if (!UTF8_IS_START(u))
 249         return 0;
 250
 251     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 252         return 0;
 253
 254     slen = len - 1;
 255     s++;
 256 #ifdef EBCDIC
 257     u = NATIVE_TO_UTF(u);
 258 #endif
 259     u &= UTF_START_MASK(len);
 260     uv  = u;
 261     ouv = uv;
 262     while (slen--) {
 263         if (!UTF8_IS_CONTINUATION(*s))
 264             return 0;
 265         uv = UTF8_ACCUMULATE(uv, *s);
 266         if (uv < ouv)
 267             return 0;
 268         ouv = uv;
 269         s++;
 270     }
 271
 272     if ((STRLEN)UNISKIP(uv) < len)
 273         return 0;
 274
 275     return len;
 276 }
 277
 278 /*
 279 =for apidoc is_utf8_char
 280
 281 Tests if some arbitrary number of bytes begins in a valid UTF-8
 282 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
 283 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
 284 character will be returned if it is valid, otherwise 0.
 285
 286 =cut */
 287 STRLEN
 288 Perl_is_utf8_char(pTHX_ const U8 *s)
 289 {
 290     const STRLEN len = UTF8SKIP(s);
 291
 292     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 293     PERL_UNUSED_CONTEXT;
 294 #ifdef IS_UTF8_CHAR
 295     if (IS_UTF8_CHAR_FAST(len))
 296         return IS_UTF8_CHAR(s, len) ? len : 0;
 297 #endif /* #ifdef IS_UTF8_CHAR */
 298     return is_utf8_char_slow(s, len);
 299 }
 300
 301
 302 /*
 303 =for apidoc is_utf8_string
 304
 305 Returns true if first C<len> bytes of the given string form a valid
 306 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 307 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 308 because a valid ASCII string is a valid UTF-8 string.
 309
 310 See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
 311
 312 =cut
 313 */
 314
 315 bool
 316 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 317 {
 318     const U8* const send = s + (len ? len : strlen((const char *)s));
 319     const U8* x = s;
 320
 321     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 322     PERL_UNUSED_CONTEXT;
 323
 324     while (x < send) {
 325         STRLEN c;
 326          /* Inline the easy bits of is_utf8_char() here for speed... */
 327          if (UTF8_IS_INVARIANT(*x))
 328               c = 1;
 329          else if (!UTF8_IS_START(*x))
 330              goto out;
 331          else {
 332               /* ... and call is_utf8_char() only if really needed. */
 333 #ifdef IS_UTF8_CHAR
 334              c = UTF8SKIP(x);
 335              if (IS_UTF8_CHAR_FAST(c)) {
 336                  if (!IS_UTF8_CHAR(x, c))
 337                      c = 0;
 338              }
 339              else
 340                 c = is_utf8_char_slow(x, c);
 341 #else
 342              c = is_utf8_char(x);
 343 #endif /* #ifdef IS_UTF8_CHAR */
 344               if (!c)
 345                   goto out;
 346          }
 347         x += c;
 348     }
 349
 350  out:
 351     if (x != send)
 352         return FALSE;
 353
 354     return TRUE;
 355 }
 356
 357 /*
 358 Implemented as a macro in utf8.h
 359
 360 =for apidoc is_utf8_string_loc
 361
 362 Like is_utf8_string() but stores the location of the failure (in the
 363 case of "utf8ness failure") or the location s+len (in the case of
 364 "utf8ness success") in the C<ep>.
 365
 366 See also is_utf8_string_loclen() and is_utf8_string().
 367
 368 =for apidoc is_utf8_string_loclen
 369
 370 Like is_utf8_string() but stores the location of the failure (in the
 371 case of "utf8ness failure") or the location s+len (in the case of
 372 "utf8ness success") in the C<ep>, and the number of UTF-8
 373 encoded characters in the C<el>.
 374
 375 See also is_utf8_string_loc() and is_utf8_string().
 376
 377 =cut
 378 */
 379
 380 bool
 381 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 382 {
 383     const U8* const send = s + (len ? len : strlen((const char *)s));
 384     const U8* x = s;
 385     STRLEN c;
 386     STRLEN outlen = 0;
 387
 388     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 389     PERL_UNUSED_CONTEXT;
 390
 391     while (x < send) {
 392          /* Inline the easy bits of is_utf8_char() here for speed... */
 393          if (UTF8_IS_INVARIANT(*x))
 394              c = 1;
 395          else if (!UTF8_IS_START(*x))
 396              goto out;
 397          else {
 398              /* ... and call is_utf8_char() only if really needed. */
 399 #ifdef IS_UTF8_CHAR
 400              c = UTF8SKIP(x);
 401              if (IS_UTF8_CHAR_FAST(c)) {
 402                  if (!IS_UTF8_CHAR(x, c))
 403                      c = 0;
 404              } else
 405                  c = is_utf8_char_slow(x, c);
 406 #else
 407              c = is_utf8_char(x);
 408 #endif /* #ifdef IS_UTF8_CHAR */
 409              if (!c)
 410                  goto out;
 411          }
 412          x += c;
 413          outlen++;
 414     }
 415
 416  out:
 417     if (el)
 418         *el = outlen;
 419
 420     if (ep)
 421         *ep = x;
 422     return (x == send);
 423 }
 424
 425 /*
 426
 427 =for apidoc utf8n_to_uvuni
 428
 429 Bottom level UTF-8 decode routine.
 430 Returns the Unicode code point value of the first character in the string C<s>
 431 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 432 C<retlen> will be set to the length, in bytes, of that character.
 433
 434 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 435 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 436 it is assumed that the caller will raise a warning, and this function
 437 will silently just set C<retlen> to C<-1> and return zero.  If the
 438 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 439 malformations will be given, C<retlen> will be set to the expected
 440 length of the UTF-8 character in bytes, and zero will be returned.
 441
 442 The C<flags> can also contain various flags to allow deviations from
 443 the strict UTF-8 encoding (see F<utf8.h>).
 444
 445 Most code should use utf8_to_uvchr() rather than call this directly.
 446
 447 =cut
 448 */
 449
 450 UV
 451 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 452 {
 453     dVAR;
 454     const U8 * const s0 = s;
 455     UV uv = *s, ouv = 0;
 456     STRLEN len = 1;
 457     const bool dowarn = ckWARN_d(WARN_UTF8);
 458     const UV startbyte = *s;
 459     STRLEN expectlen = 0;
 460     U32 warning = 0;
 461
 462     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 463
 464 /* This list is a superset of the UTF8_ALLOW_XXX. */
 465
 466 #define UTF8_WARN_EMPTY                          1
 467 #define UTF8_WARN_CONTINUATION                   2
 468 #define UTF8_WARN_NON_CONTINUATION               3
 469 #define UTF8_WARN_FE_FF                          4
 470 #define UTF8_WARN_SHORT                          5
 471 #define UTF8_WARN_OVERFLOW                       6
 472 #define UTF8_WARN_SURROGATE                      7
 473 #define UTF8_WARN_LONG                           8
 474 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 475
 476     if (curlen == 0 &&
 477         !(flags & UTF8_ALLOW_EMPTY)) {
 478         warning = UTF8_WARN_EMPTY;
 479         goto malformed;
 480     }
 481
 482     if (UTF8_IS_INVARIANT(uv)) {
 483         if (retlen)
 484             *retlen = 1;
 485         return (UV) (NATIVE_TO_UTF(*s));
 486     }
 487
 488     if (UTF8_IS_CONTINUATION(uv) &&
 489         !(flags & UTF8_ALLOW_CONTINUATION)) {
 490         warning = UTF8_WARN_CONTINUATION;
 491         goto malformed;
 492     }
 493
 494     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 495         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 496         warning = UTF8_WARN_NON_CONTINUATION;
 497         goto malformed;
 498     }
 499
 500 #ifdef EBCDIC
 501     uv = NATIVE_TO_UTF(uv);
 502 #else
 503     if ((uv == 0xfe || uv == 0xff) &&
 504         !(flags & UTF8_ALLOW_FE_FF)) {
 505         warning = UTF8_WARN_FE_FF;
 506         goto malformed;
 507     }
 508 #endif
 509
 510     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 511     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 512     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 513     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 514 #ifdef EBCDIC
 515     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 516     else                        { len =  7; uv &= 0x01; }
 517 #else
 518     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 519     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 520     else                        { len = 13; uv = 0; } /* whoa! */
 521 #endif
 522
 523     if (retlen)
 524         *retlen = len;
 525
 526     expectlen = len;
 527
 528     if ((curlen < expectlen) &&
 529         !(flags & UTF8_ALLOW_SHORT)) {
 530         warning = UTF8_WARN_SHORT;
 531         goto malformed;
 532     }
 533
 534     len--;
 535     s++;
 536     ouv = uv;
 537
 538     while (len--) {
 539         if (!UTF8_IS_CONTINUATION(*s) &&
 540             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 541             s--;
 542             warning = UTF8_WARN_NON_CONTINUATION;
 543             goto malformed;
 544         }
 545         else
 546             uv = UTF8_ACCUMULATE(uv, *s);
 547         if (!(uv > ouv)) {
 548             /* These cannot be allowed. */
 549             if (uv == ouv) {
 550                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 551                     warning = UTF8_WARN_LONG;
 552                     goto malformed;
 553                 }
 554             }
 555             else { /* uv < ouv */
 556                 /* This cannot be allowed. */
 557                 warning = UTF8_WARN_OVERFLOW;
 558                 goto malformed;
 559             }
 560         }
 561         s++;
 562         ouv = uv;
 563     }
 564
 565     if (UNICODE_IS_SURROGATE(uv) &&
 566         !(flags & UTF8_ALLOW_SURROGATE)) {
 567         warning = UTF8_WARN_SURROGATE;
 568         goto malformed;
 569     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 570                !(flags & UTF8_ALLOW_LONG)) {
 571         warning = UTF8_WARN_LONG;
 572         goto malformed;
 573     } else if (UNICODE_IS_ILLEGAL(uv) &&
 574                !(flags & UTF8_ALLOW_FFFF)) {
 575         warning = UTF8_WARN_FFFF;
 576         goto malformed;
 577     }
 578
 579     return uv;
 580
 581 malformed:
 582
 583     if (flags & UTF8_CHECK_ONLY) {
 584         if (retlen)
 585             *retlen = ((STRLEN) -1);
 586         return 0;
 587     }
 588
 589     if (dowarn) {
 590         SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
 591
 592         switch (warning) {
 593         case 0: /* Intentionally empty. */ break;
 594         case UTF8_WARN_EMPTY:
 595             sv_catpvs(sv, "(empty string)");
 596             break;
 597         case UTF8_WARN_CONTINUATION:
 598             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 599             break;
 600         case UTF8_WARN_NON_CONTINUATION:
 601             if (s == s0)
 602                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 603                            (UV)s[1], startbyte);
 604             else {
 605                 const int len = (int)(s-s0);
 606                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 607                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 608             }
 609
 610             break;
 611         case UTF8_WARN_FE_FF:
 612             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 613             break;
 614         case UTF8_WARN_SHORT:
 615             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 616                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 617             expectlen = curlen;         /* distance for caller to skip */
 618             break;
 619         case UTF8_WARN_OVERFLOW:
 620             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 621                            ouv, *s, startbyte);
 622             break;
 623         case UTF8_WARN_SURROGATE:
 624             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 625             break;
 626         case UTF8_WARN_LONG:
 627             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 628                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 629             break;
 630         case UTF8_WARN_FFFF:
 631             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 632             break;
 633         default:
 634             sv_catpvs(sv, "(unknown reason)");
 635             break;
 636         }
 637
 638         if (warning) {
 639             const char * const s = SvPVX_const(sv);
 640
 641             if (PL_op)
 642                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 643                             "%s in %s", s,  OP_DESC(PL_op));
 644             else
 645                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 646         }
 647     }
 648
 649     if (retlen)
 650         *retlen = expectlen ? expectlen : len;
 651
 652     return 0;
 653 }
 654
 655 /*
 656 =for apidoc utf8_to_uvchr
 657
 658 Returns the native character value of the first character in the string C<s>
 659 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 660 length, in bytes, of that character.
 661
 662 If C<s> does not point to a well-formed UTF-8 character, zero is
 663 returned and retlen is set, if possible, to -1.
 664
 665 =cut
 666 */
 667
 668 UV
 669 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 670 {
 671     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 672
 673     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 674                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 675 }
 676
 677 /*
 678 =for apidoc utf8_to_uvuni
 679
 680 Returns the Unicode code point of the first character in the string C<s>
 681 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 682 length, in bytes, of that character.
 683
 684 This function should only be used when the returned UV is considered
 685 an index into the Unicode semantic tables (e.g. swashes).
 686
 687 If C<s> does not point to a well-formed UTF-8 character, zero is
 688 returned and retlen is set, if possible, to -1.
 689
 690 =cut
 691 */
 692
 693 UV
 694 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 695 {
 696     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 697
 698     /* Call the low level routine asking for checks */
 699     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 700                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 701 }
 702
 703 /*
 704 =for apidoc utf8_length
 705
 706 Return the length of the UTF-8 char encoded string C<s> in characters.
 707 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 708 up past C<e>, croaks.
 709
 710 =cut
 711 */
 712
 713 STRLEN
 714 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 715 {
 716     dVAR;
 717     STRLEN len = 0;
 718
 719     PERL_ARGS_ASSERT_UTF8_LENGTH;
 720
 721     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 722      * the bitops (especially ~) can create illegal UTF-8.
 723      * In other words: in Perl UTF-8 is not just for Unicode. */
 724
 725     if (e < s)
 726         goto warn_and_return;
 727     while (s < e) {
 728         if (!UTF8_IS_INVARIANT(*s))
 729             s += UTF8SKIP(s);
 730         else
 731             s++;
 732         len++;
 733     }
 734
 735     if (e != s) {
 736         len--;
 737         warn_and_return:
 738         if (ckWARN_d(WARN_UTF8)) {
 739             if (PL_op)
 740                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 741                             "%s in %s", unees, OP_DESC(PL_op));
 742             else
 743                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 744         }
 745     }
 746
 747     return len;
 748 }
 749
 750 /*
 751 =for apidoc utf8_distance
 752
 753 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 754 and C<b>.
 755
 756 WARNING: use only if you *know* that the pointers point inside the
 757 same UTF-8 buffer.
 758
 759 =cut
 760 */
 761
 762 IV
 763 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 764 {
 765     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 766
 767     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 768 }
 769
 770 /*
 771 =for apidoc utf8_hop
 772
 773 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 774 forward or backward.
 775
 776 WARNING: do not use the following unless you *know* C<off> is within
 777 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 778 on the first byte of character or just after the last byte of a character.
 779
 780 =cut
 781 */
 782
 783 U8 *
 784 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 785 {
 786     PERL_ARGS_ASSERT_UTF8_HOP;
 787
 788     PERL_UNUSED_CONTEXT;
 789     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 790      * the bitops (especially ~) can create illegal UTF-8.
 791      * In other words: in Perl UTF-8 is not just for Unicode. */
 792
 793     if (off >= 0) {
 794         while (off--)
 795             s += UTF8SKIP(s);
 796     }
 797     else {
 798         while (off++) {
 799             s--;
 800             while (UTF8_IS_CONTINUATION(*s))
 801                 s--;
 802         }
 803     }
 804     return (U8 *)s;
 805 }
 806
 807 /*
 808 =for apidoc utf8_to_bytes
 809
 810 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 811 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 812 updates len to contain the new length.
 813 Returns zero on failure, setting C<len> to -1.
 814
 815 If you need a copy of the string, see C<bytes_from_utf8>.
 816
 817 =cut
 818 */
 819
 820 U8 *
 821 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 822 {
 823     U8 * const save = s;
 824     U8 * const send = s + *len;
 825     U8 *d;
 826
 827     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
 828
 829     /* ensure valid UTF-8 and chars < 256 before updating string */
 830     while (s < send) {
 831         U8 c = *s++;
 832
 833         if (!UTF8_IS_INVARIANT(c) &&
 834             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 835              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 836             *len = ((STRLEN) -1);
 837             return 0;
 838         }
 839     }
 840
 841     d = s = save;
 842     while (s < send) {
 843         STRLEN ulen;
 844         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 845         s += ulen;
 846     }
 847     *d = '\0';
 848     *len = d - save;
 849     return save;
 850 }
 851
 852 /*
 853 =for apidoc bytes_from_utf8
 854
 855 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 856 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 857 the newly-created string, and updates C<len> to contain the new
 858 length.  Returns the original string if no conversion occurs, C<len>
 859 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 860 0 if C<s> is converted or consisted entirely of characters that are invariant
 861 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
 862
 863 =cut
 864 */
 865
 866 U8 *
 867 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 868 {
 869     U8 *d;
 870     const U8 *start = s;
 871     const U8 *send;
 872     I32 count = 0;
 873
 874     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
 875
 876     PERL_UNUSED_CONTEXT;
 877     if (!*is_utf8)
 878         return (U8 *)start;
 879
 880     /* ensure valid UTF-8 and chars < 256 before converting string */
 881     for (send = s + *len; s < send;) {
 882         U8 c = *s++;
 883         if (!UTF8_IS_INVARIANT(c)) {
 884             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 885                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 886                 count++;
 887             else
 888                 return (U8 *)start;
 889         }
 890     }
 891
 892     *is_utf8 = FALSE;
 893
 894     Newx(d, (*len) - count + 1, U8);
 895     s = start; start = d;
 896     while (s < send) {
 897         U8 c = *s++;
 898         if (!UTF8_IS_INVARIANT(c)) {
 899             /* Then it is two-byte encoded */
 900             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 901             c = ASCII_TO_NATIVE(c);
 902         }
 903         *d++ = c;
 904     }
 905     *d = '\0';
 906     *len = d - start;
 907     return (U8 *)start;
 908 }
 909
 910 /*
 911 =for apidoc bytes_to_utf8
 912
 913 Converts a string C<s> of length C<len> from the native encoding into UTF-8.
 914 Returns a pointer to the newly-created string, and sets C<len> to
 915 reflect the new length.
 916
 917 A NUL character will be written after the end of the string.
 918
 919 If you want to convert to UTF-8 from encodings other than
 920 the native (Latin1 or EBCDIC),
 921 see sv_recode_to_utf8().
 922
 923 =cut
 924 */
 925
 926 U8*
 927 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 928 {
 929     const U8 * const send = s + (*len);
 930     U8 *d;
 931     U8 *dst;
 932
 933     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
 934     PERL_UNUSED_CONTEXT;
 935
 936     Newx(d, (*len) * 2 + 1, U8);
 937     dst = d;
 938
 939     while (s < send) {
 940         const UV uv = NATIVE_TO_ASCII(*s++);
 941         if (UNI_IS_INVARIANT(uv))
 942             *d++ = (U8)UTF_TO_NATIVE(uv);
 943         else {
 944             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 945             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 946         }
 947     }
 948     *d = '\0';
 949     *len = d-dst;
 950     return dst;
 951 }
 952
 953 /*
 954  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 955  *
 956  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 957  * We optimize for native, for obvious reasons. */
 958
 959 U8*
 960 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 961 {
 962     U8* pend;
 963     U8* dstart = d;
 964
 965     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
 966
 967     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 968          d[0] = 0;
 969          *newlen = 1;
 970          return d;
 971     }
 972
 973     if (bytelen & 1)
 974         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
 975
 976     pend = p + bytelen;
 977
 978     while (p < pend) {
 979         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 980         p += 2;
 981         if (uv < 0x80) {
 982 #ifdef EBCDIC
 983             *d++ = UNI_TO_NATIVE(uv);
 984 #else
 985             *d++ = (U8)uv;
 986 #endif
 987             continue;
 988         }
 989         if (uv < 0x800) {
 990             *d++ = (U8)(( uv >>  6)         | 0xc0);
 991             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 992             continue;
 993         }
 994         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 995             UV low = (p[0] << 8) + p[1];
 996             p += 2;
 997             if (low < 0xdc00 || low >= 0xdfff)
 998                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 999             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
1000         }
1001         if (uv < 0x10000) {
1002             *d++ = (U8)(( uv >> 12)         | 0xe0);
1003             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1004             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1005             continue;
1006         }
1007         else {
1008             *d++ = (U8)(( uv >> 18)         | 0xf0);
1009             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1010             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1011             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1012             continue;
1013         }
1014     }
1015     *newlen = d - dstart;
1016     return d;
1017 }
1018
1019 /* Note: this one is slightly destructive of the source. */
1020
1021 U8*
1022 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1023 {
1024     U8* s = (U8*)p;
1025     U8* const send = s + bytelen;
1026
1027     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1028
1029     while (s < send) {
1030         const U8 tmp = s[0];
1031         s[0] = s[1];
1032         s[1] = tmp;
1033         s += 2;
1034     }
1035     return utf16_to_utf8(p, d, bytelen, newlen);
1036 }
1037
1038 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1039
1040 bool
1041 Perl_is_uni_alnum(pTHX_ UV c)
1042 {
1043     U8 tmpbuf[UTF8_MAXBYTES+1];
1044     uvchr_to_utf8(tmpbuf, c);
1045     return is_utf8_alnum(tmpbuf);
1046 }
1047
1048 bool
1049 Perl_is_uni_alnumc(pTHX_ UV c)
1050 {
1051     U8 tmpbuf[UTF8_MAXBYTES+1];
1052     uvchr_to_utf8(tmpbuf, c);
1053     return is_utf8_alnumc(tmpbuf);
1054 }
1055
1056 bool
1057 Perl_is_uni_idfirst(pTHX_ UV c)
1058 {
1059     U8 tmpbuf[UTF8_MAXBYTES+1];
1060     uvchr_to_utf8(tmpbuf, c);
1061     return is_utf8_idfirst(tmpbuf);
1062 }
1063
1064 bool
1065 Perl_is_uni_alpha(pTHX_ UV c)
1066 {
1067     U8 tmpbuf[UTF8_MAXBYTES+1];
1068     uvchr_to_utf8(tmpbuf, c);
1069     return is_utf8_alpha(tmpbuf);
1070 }
1071
1072 bool
1073 Perl_is_uni_ascii(pTHX_ UV c)
1074 {
1075     U8 tmpbuf[UTF8_MAXBYTES+1];
1076     uvchr_to_utf8(tmpbuf, c);
1077     return is_utf8_ascii(tmpbuf);
1078 }
1079
1080 bool
1081 Perl_is_uni_space(pTHX_ UV c)
1082 {
1083     U8 tmpbuf[UTF8_MAXBYTES+1];
1084     uvchr_to_utf8(tmpbuf, c);
1085     return is_utf8_space(tmpbuf);
1086 }
1087
1088 bool
1089 Perl_is_uni_digit(pTHX_ UV c)
1090 {
1091     U8 tmpbuf[UTF8_MAXBYTES+1];
1092     uvchr_to_utf8(tmpbuf, c);
1093     return is_utf8_digit(tmpbuf);
1094 }
1095
1096 bool
1097 Perl_is_uni_upper(pTHX_ UV c)
1098 {
1099     U8 tmpbuf[UTF8_MAXBYTES+1];
1100     uvchr_to_utf8(tmpbuf, c);
1101     return is_utf8_upper(tmpbuf);
1102 }
1103
1104 bool
1105 Perl_is_uni_lower(pTHX_ UV c)
1106 {
1107     U8 tmpbuf[UTF8_MAXBYTES+1];
1108     uvchr_to_utf8(tmpbuf, c);
1109     return is_utf8_lower(tmpbuf);
1110 }
1111
1112 bool
1113 Perl_is_uni_cntrl(pTHX_ UV c)
1114 {
1115     U8 tmpbuf[UTF8_MAXBYTES+1];
1116     uvchr_to_utf8(tmpbuf, c);
1117     return is_utf8_cntrl(tmpbuf);
1118 }
1119
1120 bool
1121 Perl_is_uni_graph(pTHX_ UV c)
1122 {
1123     U8 tmpbuf[UTF8_MAXBYTES+1];
1124     uvchr_to_utf8(tmpbuf, c);
1125     return is_utf8_graph(tmpbuf);
1126 }
1127
1128 bool
1129 Perl_is_uni_print(pTHX_ UV c)
1130 {
1131     U8 tmpbuf[UTF8_MAXBYTES+1];
1132     uvchr_to_utf8(tmpbuf, c);
1133     return is_utf8_print(tmpbuf);
1134 }
1135
1136 bool
1137 Perl_is_uni_punct(pTHX_ UV c)
1138 {
1139     U8 tmpbuf[UTF8_MAXBYTES+1];
1140     uvchr_to_utf8(tmpbuf, c);
1141     return is_utf8_punct(tmpbuf);
1142 }
1143
1144 bool
1145 Perl_is_uni_xdigit(pTHX_ UV c)
1146 {
1147     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1148     uvchr_to_utf8(tmpbuf, c);
1149     return is_utf8_xdigit(tmpbuf);
1150 }
1151
1152 UV
1153 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1154 {
1155     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1156
1157     uvchr_to_utf8(p, c);
1158     return to_utf8_upper(p, p, lenp);
1159 }
1160
1161 UV
1162 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1163 {
1164     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1165
1166     uvchr_to_utf8(p, c);
1167     return to_utf8_title(p, p, lenp);
1168 }
1169
1170 UV
1171 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1172 {
1173     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1174
1175     uvchr_to_utf8(p, c);
1176     return to_utf8_lower(p, p, lenp);
1177 }
1178
1179 UV
1180 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1181 {
1182     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1183
1184     uvchr_to_utf8(p, c);
1185     return to_utf8_fold(p, p, lenp);
1186 }
1187
1188 /* for now these all assume no locale info available for Unicode > 255 */
1189
1190 bool
1191 Perl_is_uni_alnum_lc(pTHX_ UV c)
1192 {
1193     return is_uni_alnum(c);     /* XXX no locale support yet */
1194 }
1195
1196 bool
1197 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1198 {
1199     return is_uni_alnumc(c);    /* XXX no locale support yet */
1200 }
1201
1202 bool
1203 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1204 {
1205     return is_uni_idfirst(c);   /* XXX no locale support yet */
1206 }
1207
1208 bool
1209 Perl_is_uni_alpha_lc(pTHX_ UV c)
1210 {
1211     return is_uni_alpha(c);     /* XXX no locale support yet */
1212 }
1213
1214 bool
1215 Perl_is_uni_ascii_lc(pTHX_ UV c)
1216 {
1217     return is_uni_ascii(c);     /* XXX no locale support yet */
1218 }
1219
1220 bool
1221 Perl_is_uni_space_lc(pTHX_ UV c)
1222 {
1223     return is_uni_space(c);     /* XXX no locale support yet */
1224 }
1225
1226 bool
1227 Perl_is_uni_digit_lc(pTHX_ UV c)
1228 {
1229     return is_uni_digit(c);     /* XXX no locale support yet */
1230 }
1231
1232 bool
1233 Perl_is_uni_upper_lc(pTHX_ UV c)
1234 {
1235     return is_uni_upper(c);     /* XXX no locale support yet */
1236 }
1237
1238 bool
1239 Perl_is_uni_lower_lc(pTHX_ UV c)
1240 {
1241     return is_uni_lower(c);     /* XXX no locale support yet */
1242 }
1243
1244 bool
1245 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1246 {
1247     return is_uni_cntrl(c);     /* XXX no locale support yet */
1248 }
1249
1250 bool
1251 Perl_is_uni_graph_lc(pTHX_ UV c)
1252 {
1253     return is_uni_graph(c);     /* XXX no locale support yet */
1254 }
1255
1256 bool
1257 Perl_is_uni_print_lc(pTHX_ UV c)
1258 {
1259     return is_uni_print(c);     /* XXX no locale support yet */
1260 }
1261
1262 bool
1263 Perl_is_uni_punct_lc(pTHX_ UV c)
1264 {
1265     return is_uni_punct(c);     /* XXX no locale support yet */
1266 }
1267
1268 bool
1269 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1270 {
1271     return is_uni_xdigit(c);    /* XXX no locale support yet */
1272 }
1273
1274 U32
1275 Perl_to_uni_upper_lc(pTHX_ U32 c)
1276 {
1277     /* XXX returns only the first character -- do not use XXX */
1278     /* XXX no locale support yet */
1279     STRLEN len;
1280     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1281     return (U32)to_uni_upper(c, tmpbuf, &len);
1282 }
1283
1284 U32
1285 Perl_to_uni_title_lc(pTHX_ U32 c)
1286 {
1287     /* XXX returns only the first character XXX -- do not use XXX */
1288     /* XXX no locale support yet */
1289     STRLEN len;
1290     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1291     return (U32)to_uni_title(c, tmpbuf, &len);
1292 }
1293
1294 U32
1295 Perl_to_uni_lower_lc(pTHX_ U32 c)
1296 {
1297     /* XXX returns only the first character -- do not use XXX */
1298     /* XXX no locale support yet */
1299     STRLEN len;
1300     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1301     return (U32)to_uni_lower(c, tmpbuf, &len);
1302 }
1303
1304 static bool
1305 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1306                  const char *const swashname)
1307 {
1308     dVAR;
1309
1310     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1311
1312     if (!is_utf8_char(p))
1313         return FALSE;
1314     if (!*swash)
1315         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1316     return swash_fetch(*swash, p, TRUE) != 0;
1317 }
1318
1319 bool
1320 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1321 {
1322     dVAR;
1323
1324     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1325
1326     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1327      * descendant of isalnum(3), in other words, it doesn't
1328      * contain the '_'. --jhi */
1329     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1330 }
1331
1332 bool
1333 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1334 {
1335     dVAR;
1336
1337     PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1338
1339     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1340 }
1341
1342 bool
1343 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1344 {
1345     dVAR;
1346
1347     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1348
1349     if (*p == '_')
1350         return TRUE;
1351     /* is_utf8_idstart would be more logical. */
1352     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1353 }
1354
1355 bool
1356 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1357 {
1358     dVAR;
1359
1360     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1361
1362     if (*p == '_')
1363         return TRUE;
1364     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1365 }
1366
1367 bool
1368 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1369 {
1370     dVAR;
1371
1372     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1373
1374     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1375 }
1376
1377 bool
1378 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1379 {
1380     dVAR;
1381
1382     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1383
1384     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1385 }
1386
1387 bool
1388 Perl_is_utf8_space(pTHX_ const U8 *p)
1389 {
1390     dVAR;
1391
1392     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1393
1394     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1395 }
1396
1397 bool
1398 Perl_is_utf8_digit(pTHX_ const U8 *p)
1399 {
1400     dVAR;
1401
1402     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1403
1404     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1405 }
1406
1407 bool
1408 Perl_is_utf8_upper(pTHX_ const U8 *p)
1409 {
1410     dVAR;
1411
1412     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1413
1414     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1415 }
1416
1417 bool
1418 Perl_is_utf8_lower(pTHX_ const U8 *p)
1419 {
1420     dVAR;
1421
1422     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1423
1424     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1425 }
1426
1427 bool
1428 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1429 {
1430     dVAR;
1431
1432     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1433
1434     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1435 }
1436
1437 bool
1438 Perl_is_utf8_graph(pTHX_ const U8 *p)
1439 {
1440     dVAR;
1441
1442     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1443
1444     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1445 }
1446
1447 bool
1448 Perl_is_utf8_print(pTHX_ const U8 *p)
1449 {
1450     dVAR;
1451
1452     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1453
1454     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1455 }
1456
1457 bool
1458 Perl_is_utf8_punct(pTHX_ const U8 *p)
1459 {
1460     dVAR;
1461
1462     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1463
1464     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1465 }
1466
1467 bool
1468 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1469 {
1470     dVAR;
1471
1472     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1473
1474     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1475 }
1476
1477 bool
1478 Perl_is_utf8_mark(pTHX_ const U8 *p)
1479 {
1480     dVAR;
1481
1482     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1483
1484     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1485 }
1486
1487 /*
1488 =for apidoc to_utf8_case
1489
1490 The "p" contains the pointer to the UTF-8 string encoding
1491 the character that is being converted.
1492
1493 The "ustrp" is a pointer to the character buffer to put the
1494 conversion result to.  The "lenp" is a pointer to the length
1495 of the result.
1496
1497 The "swashp" is a pointer to the swash to use.
1498
1499 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1500 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1501 but not always, a multicharacter mapping), is tried first.
1502
1503 The "special" is a string like "utf8::ToSpecLower", which means the
1504 hash %utf8::ToSpecLower.  The access to the hash is through
1505 Perl_to_utf8_case().
1506
1507 The "normal" is a string like "ToLower" which means the swash
1508 %utf8::ToLower.
1509
1510 =cut */
1511
1512 UV
1513 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1514                         SV **swashp, const char *normal, const char *special)
1515 {
1516     dVAR;
1517     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1518     STRLEN len = 0;
1519     const UV uv0 = utf8_to_uvchr(p, NULL);
1520     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1521      * are necessary in EBCDIC, they are redundant no-ops
1522      * in ASCII-ish platforms, and hopefully optimized away. */
1523     const UV uv1 = NATIVE_TO_UNI(uv0);
1524
1525     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1526
1527     uvuni_to_utf8(tmpbuf, uv1);
1528
1529     if (!*swashp) /* load on-demand */
1530          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1531
1532     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1533     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1534          /* It might be "special" (sometimes, but not always,
1535           * a multicharacter mapping) */
1536          HV * const hv = get_hv(special, 0);
1537          SV **svp;
1538
1539          if (hv &&
1540              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1541              (*svp)) {
1542              const char *s;
1543
1544               s = SvPV_const(*svp, len);
1545               if (len == 1)
1546                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1547               else {
1548 #ifdef EBCDIC
1549                    /* If we have EBCDIC we need to remap the characters
1550                     * since any characters in the low 256 are Unicode
1551                     * code points, not EBCDIC. */
1552                    U8 *t = (U8*)s, *tend = t + len, *d;
1553
1554                    d = tmpbuf;
1555                    if (SvUTF8(*svp)) {
1556                         STRLEN tlen = 0;
1557
1558                         while (t < tend) {
1559                              const UV c = utf8_to_uvchr(t, &tlen);
1560                              if (tlen > 0) {
1561                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1562                                   t += tlen;
1563                              }
1564                              else
1565                                   break;
1566                         }
1567                    }
1568                    else {
1569                         while (t < tend) {
1570                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1571                              t++;
1572                         }
1573                    }
1574                    len = d - tmpbuf;
1575                    Copy(tmpbuf, ustrp, len, U8);
1576 #else
1577                    Copy(s, ustrp, len, U8);
1578 #endif
1579               }
1580          }
1581     }
1582
1583     if (!len && *swashp) {
1584         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1585
1586          if (uv2) {
1587               /* It was "normal" (a single character mapping). */
1588               const UV uv3 = UNI_TO_NATIVE(uv2);
1589               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1590          }
1591     }
1592
1593     if (!len) /* Neither: just copy. */
1594          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1595
1596     if (lenp)
1597          *lenp = len;
1598
1599     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1600 }
1601
1602 /*
1603 =for apidoc to_utf8_upper
1604
1605 Convert the UTF-8 encoded character at p to its uppercase version and
1606 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1607 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1608 the uppercase version may be longer than the original character.
1609
1610 The first character of the uppercased version is returned
1611 (but note, as explained above, that there may be more.)
1612
1613 =cut */
1614
1615 UV
1616 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1617 {
1618     dVAR;
1619
1620     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1621
1622     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1623                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1624 }
1625
1626 /*
1627 =for apidoc to_utf8_title
1628
1629 Convert the UTF-8 encoded character at p to its titlecase version and
1630 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1631 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1632 titlecase version may be longer than the original character.
1633
1634 The first character of the titlecased version is returned
1635 (but note, as explained above, that there may be more.)
1636
1637 =cut */
1638
1639 UV
1640 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1641 {
1642     dVAR;
1643
1644     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1645
1646     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1647                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1648 }
1649
1650 /*
1651 =for apidoc to_utf8_lower
1652
1653 Convert the UTF-8 encoded character at p to its lowercase version and
1654 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1655 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1656 lowercase version may be longer than the original character.
1657
1658 The first character of the lowercased version is returned
1659 (but note, as explained above, that there may be more.)
1660
1661 =cut */
1662
1663 UV
1664 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1665 {
1666     dVAR;
1667
1668     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1669
1670     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1671                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1672 }
1673
1674 /*
1675 =for apidoc to_utf8_fold
1676
1677 Convert the UTF-8 encoded character at p to its foldcase version and
1678 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1679 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1680 foldcase version may be longer than the original character (up to
1681 three characters).
1682
1683 The first character of the foldcased version is returned
1684 (but note, as explained above, that there may be more.)
1685
1686 =cut */
1687
1688 UV
1689 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1690 {
1691     dVAR;
1692
1693     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1694
1695     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1696                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1697 }
1698
1699 /* Note:
1700  * A "swash" is a swatch hash.
1701  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1702  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1703  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1704  */
1705 SV*
1706 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1707 {
1708     dVAR;
1709     SV* retval;
1710     dSP;
1711     const size_t pkg_len = strlen(pkg);
1712     const size_t name_len = strlen(name);
1713     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1714     SV* errsv_save;
1715
1716     PERL_ARGS_ASSERT_SWASH_INIT;
1717
1718     PUSHSTACKi(PERLSI_MAGIC);
1719     ENTER;
1720     SAVEI32(PL_hints);
1721     PL_hints = 0;
1722     save_re_context();
1723     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1724         ENTER;
1725         errsv_save = newSVsv(ERRSV);
1726         /* It is assumed that callers of this routine are not passing in any
1727            user derived data.  */
1728         /* Need to do this after save_re_context() as it will set PL_tainted to
1729            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1730            Even line to create errsv_save can turn on PL_tainted.  */
1731         SAVEBOOL(PL_tainted);
1732         PL_tainted = 0;
1733         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1734                          NULL);
1735         if (!SvTRUE(ERRSV))
1736             sv_setsv(ERRSV, errsv_save);
1737         SvREFCNT_dec(errsv_save);
1738         LEAVE;
1739     }
1740     SPAGAIN;
1741     PUSHMARK(SP);
1742     EXTEND(SP,5);
1743     mPUSHp(pkg, pkg_len);
1744     mPUSHp(name, name_len);
1745     PUSHs(listsv);
1746     mPUSHi(minbits);
1747     mPUSHi(none);
1748     PUTBACK;
1749     errsv_save = newSVsv(ERRSV);
1750     if (call_method("SWASHNEW", G_SCALAR))
1751         retval = newSVsv(*PL_stack_sp--);
1752     else
1753         retval = &PL_sv_undef;
1754     if (!SvTRUE(ERRSV))
1755         sv_setsv(ERRSV, errsv_save);
1756     SvREFCNT_dec(errsv_save);
1757     LEAVE;
1758     POPSTACK;
1759     if (IN_PERL_COMPILETIME) {
1760         CopHINTS_set(PL_curcop, PL_hints);
1761     }
1762     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1763         if (SvPOK(retval))
1764             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1765                        SVfARG(retval));
1766         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1767     }
1768     return retval;
1769 }
1770
1771
1772 /* This API is wrong for special case conversions since we may need to
1773  * return several Unicode characters for a single Unicode character
1774  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1775  * the lower-level routine, and it is similarly broken for returning
1776  * multiple values.  --jhi */
1777 /* Now SWASHGET is recasted into S_swash_get in this file. */
1778
1779 /* Note:
1780  * Returns the value of property/mapping C<swash> for the first character
1781  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1782  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1783  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1784  */
1785 UV
1786 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1787 {
1788     dVAR;
1789     HV *const hv = MUTABLE_HV(SvRV(swash));
1790     U32 klen;
1791     U32 off;
1792     STRLEN slen;
1793     STRLEN needents;
1794     const U8 *tmps = NULL;
1795     U32 bit;
1796     SV *swatch;
1797     U8 tmputf8[2];
1798     const UV c = NATIVE_TO_ASCII(*ptr);
1799
1800     PERL_ARGS_ASSERT_SWASH_FETCH;
1801
1802     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1803         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1804         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1805         ptr = tmputf8;
1806     }
1807     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1808      * then the "swatch" is a vec() for al the chars which start
1809      * with 0xAA..0xYY
1810      * So the key in the hash (klen) is length of encoded char -1
1811      */
1812     klen = UTF8SKIP(ptr) - 1;
1813     off  = ptr[klen];
1814
1815     if (klen == 0) {
1816       /* If char in invariant then swatch is for all the invariant chars
1817        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1818        */
1819         needents = UTF_CONTINUATION_MARK;
1820         off      = NATIVE_TO_UTF(ptr[klen]);
1821     }
1822     else {
1823       /* If char is encoded then swatch is for the prefix */
1824         needents = (1 << UTF_ACCUMULATION_SHIFT);
1825         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1826     }
1827
1828     /*
1829      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1830      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1831      * it's nothing to sniff at.)  Pity we usually come through at least
1832      * two function calls to get here...
1833      *
1834      * NB: this code assumes that swatches are never modified, once generated!
1835      */
1836
1837     if (hv   == PL_last_swash_hv &&
1838         klen == PL_last_swash_klen &&
1839         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1840     {
1841         tmps = PL_last_swash_tmps;
1842         slen = PL_last_swash_slen;
1843     }
1844     else {
1845         /* Try our second-level swatch cache, kept in a hash. */
1846         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1847
1848         /* If not cached, generate it via swash_get */
1849         if (!svp || !SvPOK(*svp)
1850                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1851             /* We use utf8n_to_uvuni() as we want an index into
1852                Unicode tables, not a native character number.
1853              */
1854             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1855                                            ckWARN(WARN_UTF8) ?
1856                                            0 : UTF8_ALLOW_ANY);
1857             swatch = swash_get(swash,
1858                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1859                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1860                                 needents);
1861
1862             if (IN_PERL_COMPILETIME)
1863                 CopHINTS_set(PL_curcop, PL_hints);
1864
1865             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1866
1867             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1868                      || (slen << 3) < needents)
1869                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1870         }
1871
1872         PL_last_swash_hv = hv;
1873         assert(klen <= sizeof(PL_last_swash_key));
1874         PL_last_swash_klen = (U8)klen;
1875         /* FIXME change interpvar.h?  */
1876         PL_last_swash_tmps = (U8 *) tmps;
1877         PL_last_swash_slen = slen;
1878         if (klen)
1879             Copy(ptr, PL_last_swash_key, klen, U8);
1880     }
1881
1882     switch ((int)((slen << 3) / needents)) {
1883     case 1:
1884         bit = 1 << (off & 7);
1885         off >>= 3;
1886         return (tmps[off] & bit) != 0;
1887     case 8:
1888         return tmps[off];
1889     case 16:
1890         off <<= 1;
1891         return (tmps[off] << 8) + tmps[off + 1] ;
1892     case 32:
1893         off <<= 2;
1894         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1895     }
1896     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1897     NORETURN_FUNCTION_END;
1898 }
1899
1900 /* Note:
1901  * Returns a swatch (a bit vector string) for a code point sequence
1902  * that starts from the value C<start> and comprises the number C<span>.
1903  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1904  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1905  */
1906 STATIC SV*
1907 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1908 {
1909     SV *swatch;
1910     U8 *l, *lend, *x, *xend, *s;
1911     STRLEN lcur, xcur, scur;
1912     HV *const hv = MUTABLE_HV(SvRV(swash));
1913     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1914     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1915     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1916     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1917     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1918     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1919     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1920     const STRLEN bits  = SvUV(*bitssvp);
1921     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1922     const UV     none  = SvUV(*nonesvp);
1923     const UV     end   = start + span;
1924
1925     PERL_ARGS_ASSERT_SWASH_GET;
1926
1927     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1928         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1929                                                  (UV)bits);
1930     }
1931
1932     /* create and initialize $swatch */
1933     scur   = octets ? (span * octets) : (span + 7) / 8;
1934     swatch = newSV(scur);
1935     SvPOK_on(swatch);
1936     s = (U8*)SvPVX(swatch);
1937     if (octets && none) {
1938         const U8* const e = s + scur;
1939         while (s < e) {
1940             if (bits == 8)
1941                 *s++ = (U8)(none & 0xff);
1942             else if (bits == 16) {
1943                 *s++ = (U8)((none >>  8) & 0xff);
1944                 *s++ = (U8)( none        & 0xff);
1945             }
1946             else if (bits == 32) {
1947                 *s++ = (U8)((none >> 24) & 0xff);
1948                 *s++ = (U8)((none >> 16) & 0xff);
1949                 *s++ = (U8)((none >>  8) & 0xff);
1950                 *s++ = (U8)( none        & 0xff);
1951             }
1952         }
1953         *s = '\0';
1954     }
1955     else {
1956         (void)memzero((U8*)s, scur + 1);
1957     }
1958     SvCUR_set(swatch, scur);
1959     s = (U8*)SvPVX(swatch);
1960
1961     /* read $swash->{LIST} */
1962     l = (U8*)SvPV(*listsvp, lcur);
1963     lend = l + lcur;
1964     while (l < lend) {
1965         UV min, max, val;
1966         STRLEN numlen;
1967         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1968
1969         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1970
1971         numlen = lend - l;
1972         min = grok_hex((char *)l, &numlen, &flags, NULL);
1973         if (numlen)
1974             l += numlen;
1975         else if (nl) {
1976             l = nl + 1; /* 1 is length of "\n" */
1977             continue;
1978         }
1979         else {
1980             l = lend; /* to LIST's end at which \n is not found */
1981             break;
1982         }
1983
1984         if (isBLANK(*l)) {
1985             ++l;
1986             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1987             numlen = lend - l;
1988             max = grok_hex((char *)l, &numlen, &flags, NULL);
1989             if (numlen)
1990                 l += numlen;
1991             else
1992                 max = min;
1993
1994             if (octets) {
1995                 if (isBLANK(*l)) {
1996                     ++l;
1997                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1998                             PERL_SCAN_DISALLOW_PREFIX;
1999                     numlen = lend - l;
2000                     val = grok_hex((char *)l, &numlen, &flags, NULL);
2001                     if (numlen)
2002                         l += numlen;
2003                     else
2004                         val = 0;
2005                 }
2006                 else {
2007                     val = 0;
2008                     if (typeto) {
2009                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2010                                          typestr, l);
2011                     }
2012                 }
2013             }
2014             else
2015                 val = 0; /* bits == 1, then val should be ignored */
2016         }
2017         else {
2018             max = min;
2019             if (octets) {
2020                 val = 0;
2021                 if (typeto) {
2022                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2023                 }
2024             }
2025             else
2026                 val = 0; /* bits == 1, then val should be ignored */
2027         }
2028
2029         if (nl)
2030             l = nl + 1;
2031         else
2032             l = lend;
2033
2034         if (max < start)
2035             continue;
2036
2037         if (octets) {
2038             UV key;
2039             if (min < start) {
2040                 if (!none || val < none) {
2041                     val += start - min;
2042                 }
2043                 min = start;
2044             }
2045             for (key = min; key <= max; key++) {
2046                 STRLEN offset;
2047                 if (key >= end)
2048                     goto go_out_list;
2049                 /* offset must be non-negative (start <= min <= key < end) */
2050                 offset = octets * (key - start);
2051                 if (bits == 8)
2052                     s[offset] = (U8)(val & 0xff);
2053                 else if (bits == 16) {
2054                     s[offset    ] = (U8)((val >>  8) & 0xff);
2055                     s[offset + 1] = (U8)( val        & 0xff);
2056                 }
2057                 else if (bits == 32) {
2058                     s[offset    ] = (U8)((val >> 24) & 0xff);
2059                     s[offset + 1] = (U8)((val >> 16) & 0xff);
2060                     s[offset + 2] = (U8)((val >>  8) & 0xff);
2061                     s[offset + 3] = (U8)( val        & 0xff);
2062                 }
2063
2064                 if (!none || val < none)
2065                     ++val;
2066             }
2067         }
2068         else { /* bits == 1, then val should be ignored */
2069             UV key;
2070             if (min < start)
2071                 min = start;
2072             for (key = min; key <= max; key++) {
2073                 const STRLEN offset = (STRLEN)(key - start);
2074                 if (key >= end)
2075                     goto go_out_list;
2076                 s[offset >> 3] |= 1 << (offset & 7);
2077             }
2078         }
2079     } /* while */
2080   go_out_list:
2081
2082     /* read $swash->{EXTRAS} */
2083     x = (U8*)SvPV(*extssvp, xcur);
2084     xend = x + xcur;
2085     while (x < xend) {
2086         STRLEN namelen;
2087         U8 *namestr;
2088         SV** othersvp;
2089         HV* otherhv;
2090         STRLEN otherbits;
2091         SV **otherbitssvp, *other;
2092         U8 *s, *o, *nl;
2093         STRLEN slen, olen;
2094
2095         const U8 opc = *x++;
2096         if (opc == '\n')
2097             continue;
2098
2099         nl = (U8*)memchr(x, '\n', xend - x);
2100
2101         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2102             if (nl) {
2103                 x = nl + 1; /* 1 is length of "\n" */
2104                 continue;
2105             }
2106             else {
2107                 x = xend; /* to EXTRAS' end at which \n is not found */
2108                 break;
2109             }
2110         }
2111
2112         namestr = x;
2113         if (nl) {
2114             namelen = nl - namestr;
2115             x = nl + 1;
2116         }
2117         else {
2118             namelen = xend - namestr;
2119             x = xend;
2120         }
2121
2122         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2123         otherhv = MUTABLE_HV(SvRV(*othersvp));
2124         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2125         otherbits = (STRLEN)SvUV(*otherbitssvp);
2126         if (bits < otherbits)
2127             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2128
2129         /* The "other" swatch must be destroyed after. */
2130         other = swash_get(*othersvp, start, span);
2131         o = (U8*)SvPV(other, olen);
2132
2133         if (!olen)
2134             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2135
2136         s = (U8*)SvPV(swatch, slen);
2137         if (bits == 1 && otherbits == 1) {
2138             if (slen != olen)
2139                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2140
2141             switch (opc) {
2142             case '+':
2143                 while (slen--)
2144                     *s++ |= *o++;
2145                 break;
2146             case '!':
2147                 while (slen--)
2148                     *s++ |= ~*o++;
2149                 break;
2150             case '-':
2151                 while (slen--)
2152                     *s++ &= ~*o++;
2153                 break;
2154             case '&':
2155                 while (slen--)
2156                     *s++ &= *o++;
2157                 break;
2158             default:
2159                 break;
2160             }
2161         }
2162         else {
2163             STRLEN otheroctets = otherbits >> 3;
2164             STRLEN offset = 0;
2165             U8* const send = s + slen;
2166
2167             while (s < send) {
2168                 UV otherval = 0;
2169
2170                 if (otherbits == 1) {
2171                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2172                     ++offset;
2173                 }
2174                 else {
2175                     STRLEN vlen = otheroctets;
2176                     otherval = *o++;
2177                     while (--vlen) {
2178                         otherval <<= 8;
2179                         otherval |= *o++;
2180                     }
2181                 }
2182
2183                 if (opc == '+' && otherval)
2184                     NOOP;   /* replace with otherval */
2185                 else if (opc == '!' && !otherval)
2186                     otherval = 1;
2187                 else if (opc == '-' && otherval)
2188                     otherval = 0;
2189                 else if (opc == '&' && !otherval)
2190                     otherval = 0;
2191                 else {
2192                     s += octets; /* no replacement */
2193                     continue;
2194                 }
2195
2196                 if (bits == 8)
2197                     *s++ = (U8)( otherval & 0xff);
2198                 else if (bits == 16) {
2199                     *s++ = (U8)((otherval >>  8) & 0xff);
2200                     *s++ = (U8)( otherval        & 0xff);
2201                 }
2202                 else if (bits == 32) {
2203                     *s++ = (U8)((otherval >> 24) & 0xff);
2204                     *s++ = (U8)((otherval >> 16) & 0xff);
2205                     *s++ = (U8)((otherval >>  8) & 0xff);
2206                     *s++ = (U8)( otherval        & 0xff);
2207                 }
2208             }
2209         }
2210         sv_free(other); /* through with it! */
2211     } /* while */
2212     return swatch;
2213 }
2214
2215 /*
2216 =for apidoc uvchr_to_utf8
2217
2218 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2219 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2220 bytes available. The return value is the pointer to the byte after the
2221 end of the new character. In other words,
2222
2223     d = uvchr_to_utf8(d, uv);
2224
2225 is the recommended wide native character-aware way of saying
2226
2227     *(d++) = uv;
2228
2229 =cut
2230 */
2231
2232 /* On ASCII machines this is normally a macro but we want a
2233    real function in case XS code wants it
2234 */
2235 U8 *
2236 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2237 {
2238     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2239
2240     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2241 }
2242
2243 U8 *
2244 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2245 {
2246     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2247
2248     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2249 }
2250
2251 /*
2252 =for apidoc utf8n_to_uvchr
2253 flags
2254
2255 Returns the native character value of the first character in the string
2256 C<s>
2257 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2258 length, in bytes, of that character.
2259
2260 Allows length and flags to be passed to low level routine.
2261
2262 =cut
2263 */
2264 /* On ASCII machines this is normally a macro but we want
2265    a real function in case XS code wants it
2266 */
2267 UV
2268 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2269 U32 flags)
2270 {
2271     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2272
2273     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2274
2275     return UNI_TO_NATIVE(uv);
2276 }
2277
2278 /*
2279 =for apidoc pv_uni_display
2280
2281 Build to the scalar dsv a displayable version of the string spv,
2282 length len, the displayable version being at most pvlim bytes long
2283 (if longer, the rest is truncated and "..." will be appended).
2284
2285 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2286 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2287 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2288 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2289 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2290 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2291
2292 The pointer to the PV of the dsv is returned.
2293
2294 =cut */
2295 char *
2296 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2297 {
2298     int truncated = 0;
2299     const char *s, *e;
2300
2301     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2302
2303     sv_setpvs(dsv, "");
2304     SvUTF8_off(dsv);
2305     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2306          UV u;
2307           /* This serves double duty as a flag and a character to print after
2308              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2309           */
2310          char ok = 0;
2311
2312          if (pvlim && SvCUR(dsv) >= pvlim) {
2313               truncated++;
2314               break;
2315          }
2316          u = utf8_to_uvchr((U8*)s, 0);
2317          if (u < 256) {
2318              const unsigned char c = (unsigned char)u & 0xFF;
2319              if (flags & UNI_DISPLAY_BACKSLASH) {
2320                  switch (c) {
2321                  case '\n':
2322                      ok = 'n'; break;
2323                  case '\r':
2324                      ok = 'r'; break;
2325                  case '\t':
2326                      ok = 't'; break;
2327                  case '\f':
2328                      ok = 'f'; break;
2329                  case '\a':
2330                      ok = 'a'; break;
2331                  case '\\':
2332                      ok = '\\'; break;
2333                  default: break;
2334                  }
2335                  if (ok) {
2336                      const char string = ok;
2337                      sv_catpvs(dsv, "\\");
2338                      sv_catpvn(dsv, &string, 1);
2339                  }
2340              }
2341              /* isPRINT() is the locale-blind version. */
2342              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2343                  const char string = c;
2344                  sv_catpvn(dsv, &string, 1);
2345                  ok = 1;
2346              }
2347          }
2348          if (!ok)
2349              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2350     }
2351     if (truncated)
2352          sv_catpvs(dsv, "...");
2353
2354     return SvPVX(dsv);
2355 }
2356
2357 /*
2358 =for apidoc sv_uni_display
2359
2360 Build to the scalar dsv a displayable version of the scalar sv,
2361 the displayable version being at most pvlim bytes long
2362 (if longer, the rest is truncated and "..." will be appended).
2363
2364 The flags argument is as in pv_uni_display().
2365
2366 The pointer to the PV of the dsv is returned.
2367
2368 =cut
2369 */
2370 char *
2371 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2372 {
2373     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2374
2375      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2376                                 SvCUR(ssv), pvlim, flags);
2377 }
2378
2379 /*
2380 =for apidoc ibcmp_utf8
2381
2382 Return true if the strings s1 and s2 differ case-insensitively, false
2383 if not (if they are equal case-insensitively).  If u1 is true, the
2384 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2385 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2386 are false, the respective string is assumed to be in native 8-bit
2387 encoding.
2388
2389 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2390 in there (they will point at the beginning of the I<next> character).
2391 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2392 pointers beyond which scanning will not continue under any
2393 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2394 s2+l2 will be used as goal end pointers that will also stop the scan,
2395 and which qualify towards defining a successful match: all the scans
2396 that define an explicit length must reach their goal pointers for
2397 a match to succeed).
2398
2399 For case-insensitiveness, the "casefolding" of Unicode is used
2400 instead of upper/lowercasing both the characters, see
2401 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2402
2403 =cut */
2404 I32
2405 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2406 {
2407      dVAR;
2408      register const U8 *p1  = (const U8*)s1;
2409      register const U8 *p2  = (const U8*)s2;
2410      register const U8 *f1 = NULL;
2411      register const U8 *f2 = NULL;
2412      register U8 *e1 = NULL;
2413      register U8 *q1 = NULL;
2414      register U8 *e2 = NULL;
2415      register U8 *q2 = NULL;
2416      STRLEN n1 = 0, n2 = 0;
2417      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2418      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2419      U8 natbuf[1+1];
2420      STRLEN foldlen1, foldlen2;
2421      bool match;
2422
2423      PERL_ARGS_ASSERT_IBCMP_UTF8;
2424
2425      if (pe1)
2426           e1 = *(U8**)pe1;
2427      /* assert(e1 || l1); */
2428      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2429           f1 = (const U8*)s1 + l1;
2430      if (pe2)
2431           e2 = *(U8**)pe2;
2432      /* assert(e2 || l2); */
2433      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2434           f2 = (const U8*)s2 + l2;
2435
2436      /* This shouldn't happen. However, putting an assert() there makes some
2437       * tests fail. */
2438      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2439      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2440           return 1; /* mismatch; possible infinite loop or false positive */
2441
2442      if (!u1 || !u2)
2443           natbuf[1] = 0; /* Need to terminate the buffer. */
2444
2445      while ((e1 == 0 || p1 < e1) &&
2446             (f1 == 0 || p1 < f1) &&
2447             (e2 == 0 || p2 < e2) &&
2448             (f2 == 0 || p2 < f2)) {
2449           if (n1 == 0) {
2450                if (u1)
2451                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2452                else {
2453                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2454                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2455                }
2456                q1 = foldbuf1;
2457                n1 = foldlen1;
2458           }
2459           if (n2 == 0) {
2460                if (u2)
2461                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2462                else {
2463                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2464                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2465                }
2466                q2 = foldbuf2;
2467                n2 = foldlen2;
2468           }
2469           while (n1 && n2) {
2470                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2471                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2472                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2473                    return 1; /* mismatch */
2474                n1 -= UTF8SKIP(q1);
2475                q1 += UTF8SKIP(q1);
2476                n2 -= UTF8SKIP(q2);
2477                q2 += UTF8SKIP(q2);
2478           }
2479           if (n1 == 0)
2480                p1 += u1 ? UTF8SKIP(p1) : 1;
2481           if (n2 == 0)
2482                p2 += u2 ? UTF8SKIP(p2) : 1;
2483
2484      }
2485
2486      /* A match is defined by all the scans that specified
2487       * an explicit length reaching their final goals. */
2488      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2489
2490      if (match) {
2491           if (pe1)
2492                *pe1 = (char*)p1;
2493           if (pe2)
2494                *pe2 = (char*)p2;
2495      }
2496
2497      return match ? 0 : 1; /* 0 match, 1 mismatch */
2498 }
2499
2500 /*
2501  * Local variables:
2502  * c-indentation-style: bsd
2503  * c-basic-offset: 4
2504  * indent-tabs-mode: t
2505  * End:
2506  *
2507  * ex: set ts=8 sts=4 sw=4 noet:
2508  */