utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34
  35 #ifndef EBCDIC
  36 /* Separate prototypes needed because in ASCII systems these
  37  * usually macros but they still are compiled as code, too. */
  38 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  39 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  40 #endif
  41
  42 static const char unees[] =
  43     "Malformed UTF-8 character (unexpected end of string)";
  44
  45 /*
  46 =head1 Unicode Support
  47
  48 This file contains various utility functions for manipulating UTF8-encoded
  49 strings. For the uninitiated, this is a method of representing arbitrary
  50 Unicode characters as a variable number of bytes, in such a way that
  51 characters in the ASCII range are unmodified, and a zero byte never appears
  52 within non-zero characters.
  53
  54 =cut
  55 */
  56
  57 /*
  58 =for apidoc is_ascii_string
  59
  60 Returns true if first C<len> bytes of the given string are ASCII (i.e. none
  61 of them even raise the question of UTF-8-ness).
  62
  63 See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  64
  65 =cut
  66 */
  67
  68 bool
  69 Perl_is_ascii_string(const U8 *s, STRLEN len)
  70 {
  71     const U8* const send = s + (len ? len : strlen((const char *)s));
  72     const U8* x = s;
  73
  74     PERL_ARGS_ASSERT_IS_ASCII_STRING;
  75
  76     for (; x < send; ++x) {
  77         if (!UTF8_IS_INVARIANT(*x))
  78             break;
  79     }
  80
  81     return x == send;
  82 }
  83
  84 /*
  85 =for apidoc uvuni_to_utf8_flags
  86
  87 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  88 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  89 bytes available. The return value is the pointer to the byte after the
  90 end of the new character. In other words,
  91
  92     d = uvuni_to_utf8_flags(d, uv, flags);
  93
  94 or, in most cases,
  95
  96     d = uvuni_to_utf8(d, uv);
  97
  98 (which is equivalent to)
  99
 100     d = uvuni_to_utf8_flags(d, uv, 0);
 101
 102 is the recommended Unicode-aware way of saying
 103
 104     *(d++) = uv;
 105
 106 =cut
 107 */
 108
 109 U8 *
 110 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 111 {
 112     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
 113
 114     if (ckWARN(WARN_UTF8)) {
 115          if (UNICODE_IS_SURROGATE(uv) &&
 116              !(flags & UNICODE_ALLOW_SURROGATE))
 117               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
 118          else if (
 119                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
 120                     !(flags & UNICODE_ALLOW_FDD0))
 121                    ||
 122                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
 123                     !(flags & UNICODE_ALLOW_FFFF))) &&
 124                   /* UNICODE_ALLOW_SUPER includes
 125                    * FFFEs and FFFFs beyond 0x10FFFF. */
 126                   ((uv <= PERL_UNICODE_MAX) ||
 127                    !(flags & UNICODE_ALLOW_SUPER))
 128                   )
 129               Perl_warner(aTHX_ packWARN(WARN_UTF8),
 130                       "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
 131     }
 132     if (UNI_IS_INVARIANT(uv)) {
 133         *d++ = (U8)UTF_TO_NATIVE(uv);
 134         return d;
 135     }
 136 #if defined(EBCDIC)
 137     else {
 138         STRLEN len  = UNISKIP(uv);
 139         U8 *p = d+len-1;
 140         while (p > d) {
 141             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 142             uv >>= UTF_ACCUMULATION_SHIFT;
 143         }
 144         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 145         return d+len;
 146     }
 147 #else /* Non loop style */
 148     if (uv < 0x800) {
 149         *d++ = (U8)(( uv >>  6)         | 0xc0);
 150         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 151         return d;
 152     }
 153     if (uv < 0x10000) {
 154         *d++ = (U8)(( uv >> 12)         | 0xe0);
 155         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 156         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 157         return d;
 158     }
 159     if (uv < 0x200000) {
 160         *d++ = (U8)(( uv >> 18)         | 0xf0);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166     if (uv < 0x4000000) {
 167         *d++ = (U8)(( uv >> 24)         | 0xf8);
 168         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 169         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 170         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 171         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 172         return d;
 173     }
 174     if (uv < 0x80000000) {
 175         *d++ = (U8)(( uv >> 30)         | 0xfc);
 176         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 177         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 178         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 179         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 180         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 181         return d;
 182     }
 183 #ifdef HAS_QUAD
 184     if (uv < UTF8_QUAD_MAX)
 185 #endif
 186     {
 187         *d++ =                            0xfe; /* Can't match U+FEFF! */
 188         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 189         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 190         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 191         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 192         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 193         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 194         return d;
 195     }
 196 #ifdef HAS_QUAD
 197     {
 198         *d++ =                            0xff;         /* Can't match U+FFFE! */
 199         *d++ =                            0x80;         /* 6 Reserved bits */
 200         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 201         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 202         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 203         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 204         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 205         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 206         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 207         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 208         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 209         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 210         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 211         return d;
 212     }
 213 #endif
 214 #endif /* Loop style */
 215 }
 216
 217 /*
 218
 219 Tests if some arbitrary number of bytes begins in a valid UTF-8
 220 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 221 UTF-8 character.  The actual number of bytes in the UTF-8 character
 222 will be returned if it is valid, otherwise 0.
 223
 224 This is the "slow" version as opposed to the "fast" version which is
 225 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 226 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 227 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 228 you should use the _slow().  In practice this means that the _slow()
 229 will be used very rarely, since the maximum Unicode code point (as of
 230 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 231 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 232 five bytes or more.
 233
 234 =cut */
 235 STATIC STRLEN
 236 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 237 {
 238     U8 u = *s;
 239     STRLEN slen;
 240     UV uv, ouv;
 241
 242     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 243
 244     if (UTF8_IS_INVARIANT(u))
 245         return 1;
 246
 247     if (!UTF8_IS_START(u))
 248         return 0;
 249
 250     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 251         return 0;
 252
 253     slen = len - 1;
 254     s++;
 255 #ifdef EBCDIC
 256     u = NATIVE_TO_UTF(u);
 257 #endif
 258     u &= UTF_START_MASK(len);
 259     uv  = u;
 260     ouv = uv;
 261     while (slen--) {
 262         if (!UTF8_IS_CONTINUATION(*s))
 263             return 0;
 264         uv = UTF8_ACCUMULATE(uv, *s);
 265         if (uv < ouv)
 266             return 0;
 267         ouv = uv;
 268         s++;
 269     }
 270
 271     if ((STRLEN)UNISKIP(uv) < len)
 272         return 0;
 273
 274     return len;
 275 }
 276
 277 /*
 278 =for apidoc is_utf8_char
 279
 280 Tests if some arbitrary number of bytes begins in a valid UTF-8
 281 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
 282 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
 283 character will be returned if it is valid, otherwise 0.
 284
 285 =cut */
 286 STRLEN
 287 Perl_is_utf8_char(const U8 *s)
 288 {
 289     const STRLEN len = UTF8SKIP(s);
 290
 291     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 292 #ifdef IS_UTF8_CHAR
 293     if (IS_UTF8_CHAR_FAST(len))
 294         return IS_UTF8_CHAR(s, len) ? len : 0;
 295 #endif /* #ifdef IS_UTF8_CHAR */
 296     return is_utf8_char_slow(s, len);
 297 }
 298
 299
 300 /*
 301 =for apidoc is_utf8_string
 302
 303 Returns true if first C<len> bytes of the given string form a valid
 304 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 305 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 306 because a valid ASCII string is a valid UTF-8 string.
 307
 308 See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
 309
 310 =cut
 311 */
 312
 313 bool
 314 Perl_is_utf8_string(const U8 *s, STRLEN len)
 315 {
 316     const U8* const send = s + (len ? len : strlen((const char *)s));
 317     const U8* x = s;
 318
 319     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 320
 321     while (x < send) {
 322         STRLEN c;
 323          /* Inline the easy bits of is_utf8_char() here for speed... */
 324          if (UTF8_IS_INVARIANT(*x))
 325               c = 1;
 326          else if (!UTF8_IS_START(*x))
 327              goto out;
 328          else {
 329               /* ... and call is_utf8_char() only if really needed. */
 330 #ifdef IS_UTF8_CHAR
 331              c = UTF8SKIP(x);
 332              if (IS_UTF8_CHAR_FAST(c)) {
 333                  if (!IS_UTF8_CHAR(x, c))
 334                      c = 0;
 335              }
 336              else
 337                 c = is_utf8_char_slow(x, c);
 338 #else
 339              c = is_utf8_char(x);
 340 #endif /* #ifdef IS_UTF8_CHAR */
 341               if (!c)
 342                   goto out;
 343          }
 344         x += c;
 345     }
 346
 347  out:
 348     if (x != send)
 349         return FALSE;
 350
 351     return TRUE;
 352 }
 353
 354 /*
 355 Implemented as a macro in utf8.h
 356
 357 =for apidoc is_utf8_string_loc
 358
 359 Like is_utf8_string() but stores the location of the failure (in the
 360 case of "utf8ness failure") or the location s+len (in the case of
 361 "utf8ness success") in the C<ep>.
 362
 363 See also is_utf8_string_loclen() and is_utf8_string().
 364
 365 =for apidoc is_utf8_string_loclen
 366
 367 Like is_utf8_string() but stores the location of the failure (in the
 368 case of "utf8ness failure") or the location s+len (in the case of
 369 "utf8ness success") in the C<ep>, and the number of UTF-8
 370 encoded characters in the C<el>.
 371
 372 See also is_utf8_string_loc() and is_utf8_string().
 373
 374 =cut
 375 */
 376
 377 bool
 378 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 379 {
 380     const U8* const send = s + (len ? len : strlen((const char *)s));
 381     const U8* x = s;
 382     STRLEN c;
 383     STRLEN outlen = 0;
 384
 385     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 386
 387     while (x < send) {
 388          /* Inline the easy bits of is_utf8_char() here for speed... */
 389          if (UTF8_IS_INVARIANT(*x))
 390              c = 1;
 391          else if (!UTF8_IS_START(*x))
 392              goto out;
 393          else {
 394              /* ... and call is_utf8_char() only if really needed. */
 395 #ifdef IS_UTF8_CHAR
 396              c = UTF8SKIP(x);
 397              if (IS_UTF8_CHAR_FAST(c)) {
 398                  if (!IS_UTF8_CHAR(x, c))
 399                      c = 0;
 400              } else
 401                  c = is_utf8_char_slow(x, c);
 402 #else
 403              c = is_utf8_char(x);
 404 #endif /* #ifdef IS_UTF8_CHAR */
 405              if (!c)
 406                  goto out;
 407          }
 408          x += c;
 409          outlen++;
 410     }
 411
 412  out:
 413     if (el)
 414         *el = outlen;
 415
 416     if (ep)
 417         *ep = x;
 418     return (x == send);
 419 }
 420
 421 /*
 422
 423 =for apidoc utf8n_to_uvuni
 424
 425 Bottom level UTF-8 decode routine.
 426 Returns the Unicode code point value of the first character in the string C<s>
 427 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 428 C<retlen> will be set to the length, in bytes, of that character.
 429
 430 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 431 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 432 it is assumed that the caller will raise a warning, and this function
 433 will silently just set C<retlen> to C<-1> and return zero.  If the
 434 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 435 malformations will be given, C<retlen> will be set to the expected
 436 length of the UTF-8 character in bytes, and zero will be returned.
 437
 438 The C<flags> can also contain various flags to allow deviations from
 439 the strict UTF-8 encoding (see F<utf8.h>).
 440
 441 Most code should use utf8_to_uvchr() rather than call this directly.
 442
 443 =cut
 444 */
 445
 446 UV
 447 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 448 {
 449     dVAR;
 450     const U8 * const s0 = s;
 451     UV uv = *s, ouv = 0;
 452     STRLEN len = 1;
 453     const bool dowarn = ckWARN_d(WARN_UTF8);
 454     const UV startbyte = *s;
 455     STRLEN expectlen = 0;
 456     U32 warning = 0;
 457
 458     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 459
 460 /* This list is a superset of the UTF8_ALLOW_XXX. */
 461
 462 #define UTF8_WARN_EMPTY                          1
 463 #define UTF8_WARN_CONTINUATION                   2
 464 #define UTF8_WARN_NON_CONTINUATION               3
 465 #define UTF8_WARN_FE_FF                          4
 466 #define UTF8_WARN_SHORT                          5
 467 #define UTF8_WARN_OVERFLOW                       6
 468 #define UTF8_WARN_SURROGATE                      7
 469 #define UTF8_WARN_LONG                           8
 470 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 471
 472     if (curlen == 0 &&
 473         !(flags & UTF8_ALLOW_EMPTY)) {
 474         warning = UTF8_WARN_EMPTY;
 475         goto malformed;
 476     }
 477
 478     if (UTF8_IS_INVARIANT(uv)) {
 479         if (retlen)
 480             *retlen = 1;
 481         return (UV) (NATIVE_TO_UTF(*s));
 482     }
 483
 484     if (UTF8_IS_CONTINUATION(uv) &&
 485         !(flags & UTF8_ALLOW_CONTINUATION)) {
 486         warning = UTF8_WARN_CONTINUATION;
 487         goto malformed;
 488     }
 489
 490     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 491         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 492         warning = UTF8_WARN_NON_CONTINUATION;
 493         goto malformed;
 494     }
 495
 496 #ifdef EBCDIC
 497     uv = NATIVE_TO_UTF(uv);
 498 #else
 499     if ((uv == 0xfe || uv == 0xff) &&
 500         !(flags & UTF8_ALLOW_FE_FF)) {
 501         warning = UTF8_WARN_FE_FF;
 502         goto malformed;
 503     }
 504 #endif
 505
 506     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 507     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 508     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 509     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 510 #ifdef EBCDIC
 511     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 512     else                        { len =  7; uv &= 0x01; }
 513 #else
 514     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 515     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 516     else                        { len = 13; uv = 0; } /* whoa! */
 517 #endif
 518
 519     if (retlen)
 520         *retlen = len;
 521
 522     expectlen = len;
 523
 524     if ((curlen < expectlen) &&
 525         !(flags & UTF8_ALLOW_SHORT)) {
 526         warning = UTF8_WARN_SHORT;
 527         goto malformed;
 528     }
 529
 530     len--;
 531     s++;
 532     ouv = uv;
 533
 534     while (len--) {
 535         if (!UTF8_IS_CONTINUATION(*s) &&
 536             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 537             s--;
 538             warning = UTF8_WARN_NON_CONTINUATION;
 539             goto malformed;
 540         }
 541         else
 542             uv = UTF8_ACCUMULATE(uv, *s);
 543         if (!(uv > ouv)) {
 544             /* These cannot be allowed. */
 545             if (uv == ouv) {
 546                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 547                     warning = UTF8_WARN_LONG;
 548                     goto malformed;
 549                 }
 550             }
 551             else { /* uv < ouv */
 552                 /* This cannot be allowed. */
 553                 warning = UTF8_WARN_OVERFLOW;
 554                 goto malformed;
 555             }
 556         }
 557         s++;
 558         ouv = uv;
 559     }
 560
 561     if (UNICODE_IS_SURROGATE(uv) &&
 562         !(flags & UTF8_ALLOW_SURROGATE)) {
 563         warning = UTF8_WARN_SURROGATE;
 564         goto malformed;
 565     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 566                !(flags & UTF8_ALLOW_LONG)) {
 567         warning = UTF8_WARN_LONG;
 568         goto malformed;
 569     } else if (UNICODE_IS_ILLEGAL(uv) &&
 570                !(flags & UTF8_ALLOW_FFFF)) {
 571         warning = UTF8_WARN_FFFF;
 572         goto malformed;
 573     }
 574
 575     return uv;
 576
 577 malformed:
 578
 579     if (flags & UTF8_CHECK_ONLY) {
 580         if (retlen)
 581             *retlen = ((STRLEN) -1);
 582         return 0;
 583     }
 584
 585     if (dowarn) {
 586         SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
 587
 588         switch (warning) {
 589         case 0: /* Intentionally empty. */ break;
 590         case UTF8_WARN_EMPTY:
 591             sv_catpvs(sv, "(empty string)");
 592             break;
 593         case UTF8_WARN_CONTINUATION:
 594             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 595             break;
 596         case UTF8_WARN_NON_CONTINUATION:
 597             if (s == s0)
 598                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 599                            (UV)s[1], startbyte);
 600             else {
 601                 const int len = (int)(s-s0);
 602                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 603                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 604             }
 605
 606             break;
 607         case UTF8_WARN_FE_FF:
 608             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 609             break;
 610         case UTF8_WARN_SHORT:
 611             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 612                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 613             expectlen = curlen;         /* distance for caller to skip */
 614             break;
 615         case UTF8_WARN_OVERFLOW:
 616             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 617                            ouv, *s, startbyte);
 618             break;
 619         case UTF8_WARN_SURROGATE:
 620             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 621             break;
 622         case UTF8_WARN_LONG:
 623             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 624                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 625             break;
 626         case UTF8_WARN_FFFF:
 627             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 628             break;
 629         default:
 630             sv_catpvs(sv, "(unknown reason)");
 631             break;
 632         }
 633
 634         if (warning) {
 635             const char * const s = SvPVX_const(sv);
 636
 637             if (PL_op)
 638                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 639                             "%s in %s", s,  OP_DESC(PL_op));
 640             else
 641                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 642         }
 643     }
 644
 645     if (retlen)
 646         *retlen = expectlen ? expectlen : len;
 647
 648     return 0;
 649 }
 650
 651 /*
 652 =for apidoc utf8_to_uvchr
 653
 654 Returns the native character value of the first character in the string C<s>
 655 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 656 length, in bytes, of that character.
 657
 658 If C<s> does not point to a well-formed UTF-8 character, zero is
 659 returned and retlen is set, if possible, to -1.
 660
 661 =cut
 662 */
 663
 664 UV
 665 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 666 {
 667     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 668
 669     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 670                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 671 }
 672
 673 /*
 674 =for apidoc utf8_to_uvuni
 675
 676 Returns the Unicode code point of the first character in the string C<s>
 677 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 678 length, in bytes, of that character.
 679
 680 This function should only be used when the returned UV is considered
 681 an index into the Unicode semantic tables (e.g. swashes).
 682
 683 If C<s> does not point to a well-formed UTF-8 character, zero is
 684 returned and retlen is set, if possible, to -1.
 685
 686 =cut
 687 */
 688
 689 UV
 690 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 691 {
 692     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 693
 694     /* Call the low level routine asking for checks */
 695     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 696                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 697 }
 698
 699 /*
 700 =for apidoc utf8_length
 701
 702 Return the length of the UTF-8 char encoded string C<s> in characters.
 703 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 704 up past C<e>, croaks.
 705
 706 =cut
 707 */
 708
 709 STRLEN
 710 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 711 {
 712     dVAR;
 713     STRLEN len = 0;
 714
 715     PERL_ARGS_ASSERT_UTF8_LENGTH;
 716
 717     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 718      * the bitops (especially ~) can create illegal UTF-8.
 719      * In other words: in Perl UTF-8 is not just for Unicode. */
 720
 721     if (e < s)
 722         goto warn_and_return;
 723     while (s < e) {
 724         if (!UTF8_IS_INVARIANT(*s))
 725             s += UTF8SKIP(s);
 726         else
 727             s++;
 728         len++;
 729     }
 730
 731     if (e != s) {
 732         len--;
 733         warn_and_return:
 734         if (PL_op)
 735             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
 736                              "%s in %s", unees, OP_DESC(PL_op));
 737         else
 738             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
 739     }
 740
 741     return len;
 742 }
 743
 744 /*
 745 =for apidoc utf8_distance
 746
 747 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 748 and C<b>.
 749
 750 WARNING: use only if you *know* that the pointers point inside the
 751 same UTF-8 buffer.
 752
 753 =cut
 754 */
 755
 756 IV
 757 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 758 {
 759     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 760
 761     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 762 }
 763
 764 /*
 765 =for apidoc utf8_hop
 766
 767 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 768 forward or backward.
 769
 770 WARNING: do not use the following unless you *know* C<off> is within
 771 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 772 on the first byte of character or just after the last byte of a character.
 773
 774 =cut
 775 */
 776
 777 U8 *
 778 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 779 {
 780     PERL_ARGS_ASSERT_UTF8_HOP;
 781
 782     PERL_UNUSED_CONTEXT;
 783     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 784      * the bitops (especially ~) can create illegal UTF-8.
 785      * In other words: in Perl UTF-8 is not just for Unicode. */
 786
 787     if (off >= 0) {
 788         while (off--)
 789             s += UTF8SKIP(s);
 790     }
 791     else {
 792         while (off++) {
 793             s--;
 794             while (UTF8_IS_CONTINUATION(*s))
 795                 s--;
 796         }
 797     }
 798     return (U8 *)s;
 799 }
 800
 801 /*
 802 =for apidoc utf8_to_bytes
 803
 804 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 805 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 806 updates len to contain the new length.
 807 Returns zero on failure, setting C<len> to -1.
 808
 809 If you need a copy of the string, see C<bytes_from_utf8>.
 810
 811 =cut
 812 */
 813
 814 U8 *
 815 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 816 {
 817     U8 * const save = s;
 818     U8 * const send = s + *len;
 819     U8 *d;
 820
 821     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
 822
 823     /* ensure valid UTF-8 and chars < 256 before updating string */
 824     while (s < send) {
 825         U8 c = *s++;
 826
 827         if (!UTF8_IS_INVARIANT(c) &&
 828             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 829              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 830             *len = ((STRLEN) -1);
 831             return 0;
 832         }
 833     }
 834
 835     d = s = save;
 836     while (s < send) {
 837         STRLEN ulen;
 838         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 839         s += ulen;
 840     }
 841     *d = '\0';
 842     *len = d - save;
 843     return save;
 844 }
 845
 846 /*
 847 =for apidoc bytes_from_utf8
 848
 849 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 850 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 851 the newly-created string, and updates C<len> to contain the new
 852 length.  Returns the original string if no conversion occurs, C<len>
 853 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 854 0 if C<s> is converted or consisted entirely of characters that are invariant
 855 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
 856
 857 =cut
 858 */
 859
 860 U8 *
 861 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 862 {
 863     U8 *d;
 864     const U8 *start = s;
 865     const U8 *send;
 866     I32 count = 0;
 867
 868     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
 869
 870     PERL_UNUSED_CONTEXT;
 871     if (!*is_utf8)
 872         return (U8 *)start;
 873
 874     /* ensure valid UTF-8 and chars < 256 before converting string */
 875     for (send = s + *len; s < send;) {
 876         U8 c = *s++;
 877         if (!UTF8_IS_INVARIANT(c)) {
 878             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 879                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 880                 count++;
 881             else
 882                 return (U8 *)start;
 883         }
 884     }
 885
 886     *is_utf8 = FALSE;
 887
 888     Newx(d, (*len) - count + 1, U8);
 889     s = start; start = d;
 890     while (s < send) {
 891         U8 c = *s++;
 892         if (!UTF8_IS_INVARIANT(c)) {
 893             /* Then it is two-byte encoded */
 894             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 895             c = ASCII_TO_NATIVE(c);
 896         }
 897         *d++ = c;
 898     }
 899     *d = '\0';
 900     *len = d - start;
 901     return (U8 *)start;
 902 }
 903
 904 /*
 905 =for apidoc bytes_to_utf8
 906
 907 Converts a string C<s> of length C<len> from the native encoding into UTF-8.
 908 Returns a pointer to the newly-created string, and sets C<len> to
 909 reflect the new length.
 910
 911 A NUL character will be written after the end of the string.
 912
 913 If you want to convert to UTF-8 from encodings other than
 914 the native (Latin1 or EBCDIC),
 915 see sv_recode_to_utf8().
 916
 917 =cut
 918 */
 919
 920 U8*
 921 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 922 {
 923     const U8 * const send = s + (*len);
 924     U8 *d;
 925     U8 *dst;
 926
 927     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
 928     PERL_UNUSED_CONTEXT;
 929
 930     Newx(d, (*len) * 2 + 1, U8);
 931     dst = d;
 932
 933     while (s < send) {
 934         const UV uv = NATIVE_TO_ASCII(*s++);
 935         if (UNI_IS_INVARIANT(uv))
 936             *d++ = (U8)UTF_TO_NATIVE(uv);
 937         else {
 938             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 939             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 940         }
 941     }
 942     *d = '\0';
 943     *len = d-dst;
 944     return dst;
 945 }
 946
 947 /*
 948  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 949  *
 950  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 951  * We optimize for native, for obvious reasons. */
 952
 953 U8*
 954 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 955 {
 956     U8* pend;
 957     U8* dstart = d;
 958
 959     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
 960
 961     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 962          d[0] = 0;
 963          *newlen = 1;
 964          return d;
 965     }
 966
 967     if (bytelen & 1)
 968         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
 969
 970     pend = p + bytelen;
 971
 972     while (p < pend) {
 973         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 974         p += 2;
 975         if (uv < 0x80) {
 976 #ifdef EBCDIC
 977             *d++ = UNI_TO_NATIVE(uv);
 978 #else
 979             *d++ = (U8)uv;
 980 #endif
 981             continue;
 982         }
 983         if (uv < 0x800) {
 984             *d++ = (U8)(( uv >>  6)         | 0xc0);
 985             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 986             continue;
 987         }
 988         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 989             if (p >= pend) {
 990                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 991             } else {
 992                 UV low = (p[0] << 8) + p[1];
 993                 p += 2;
 994                 if (low < 0xdc00 || low >= 0xdfff)
 995                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 996                 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 997             }
 998         }
 999         if (uv < 0x10000) {
1000             *d++ = (U8)(( uv >> 12)         | 0xe0);
1001             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1002             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1003             continue;
1004         }
1005         else {
1006             *d++ = (U8)(( uv >> 18)         | 0xf0);
1007             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1008             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1009             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1010             continue;
1011         }
1012     }
1013     *newlen = d - dstart;
1014     return d;
1015 }
1016
1017 /* Note: this one is slightly destructive of the source. */
1018
1019 U8*
1020 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1021 {
1022     U8* s = (U8*)p;
1023     U8* const send = s + bytelen;
1024
1025     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1026
1027     if (bytelen & 1)
1028         Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1029                    (UV)bytelen);
1030
1031     while (s < send) {
1032         const U8 tmp = s[0];
1033         s[0] = s[1];
1034         s[1] = tmp;
1035         s += 2;
1036     }
1037     return utf16_to_utf8(p, d, bytelen, newlen);
1038 }
1039
1040 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1041
1042 bool
1043 Perl_is_uni_alnum(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_alnum(tmpbuf);
1048 }
1049
1050 bool
1051 Perl_is_uni_idfirst(pTHX_ UV c)
1052 {
1053     U8 tmpbuf[UTF8_MAXBYTES+1];
1054     uvchr_to_utf8(tmpbuf, c);
1055     return is_utf8_idfirst(tmpbuf);
1056 }
1057
1058 bool
1059 Perl_is_uni_alpha(pTHX_ UV c)
1060 {
1061     U8 tmpbuf[UTF8_MAXBYTES+1];
1062     uvchr_to_utf8(tmpbuf, c);
1063     return is_utf8_alpha(tmpbuf);
1064 }
1065
1066 bool
1067 Perl_is_uni_ascii(pTHX_ UV c)
1068 {
1069     U8 tmpbuf[UTF8_MAXBYTES+1];
1070     uvchr_to_utf8(tmpbuf, c);
1071     return is_utf8_ascii(tmpbuf);
1072 }
1073
1074 bool
1075 Perl_is_uni_space(pTHX_ UV c)
1076 {
1077     U8 tmpbuf[UTF8_MAXBYTES+1];
1078     uvchr_to_utf8(tmpbuf, c);
1079     return is_utf8_space(tmpbuf);
1080 }
1081
1082 bool
1083 Perl_is_uni_digit(pTHX_ UV c)
1084 {
1085     U8 tmpbuf[UTF8_MAXBYTES+1];
1086     uvchr_to_utf8(tmpbuf, c);
1087     return is_utf8_digit(tmpbuf);
1088 }
1089
1090 bool
1091 Perl_is_uni_upper(pTHX_ UV c)
1092 {
1093     U8 tmpbuf[UTF8_MAXBYTES+1];
1094     uvchr_to_utf8(tmpbuf, c);
1095     return is_utf8_upper(tmpbuf);
1096 }
1097
1098 bool
1099 Perl_is_uni_lower(pTHX_ UV c)
1100 {
1101     U8 tmpbuf[UTF8_MAXBYTES+1];
1102     uvchr_to_utf8(tmpbuf, c);
1103     return is_utf8_lower(tmpbuf);
1104 }
1105
1106 bool
1107 Perl_is_uni_cntrl(pTHX_ UV c)
1108 {
1109     U8 tmpbuf[UTF8_MAXBYTES+1];
1110     uvchr_to_utf8(tmpbuf, c);
1111     return is_utf8_cntrl(tmpbuf);
1112 }
1113
1114 bool
1115 Perl_is_uni_graph(pTHX_ UV c)
1116 {
1117     U8 tmpbuf[UTF8_MAXBYTES+1];
1118     uvchr_to_utf8(tmpbuf, c);
1119     return is_utf8_graph(tmpbuf);
1120 }
1121
1122 bool
1123 Perl_is_uni_print(pTHX_ UV c)
1124 {
1125     U8 tmpbuf[UTF8_MAXBYTES+1];
1126     uvchr_to_utf8(tmpbuf, c);
1127     return is_utf8_print(tmpbuf);
1128 }
1129
1130 bool
1131 Perl_is_uni_punct(pTHX_ UV c)
1132 {
1133     U8 tmpbuf[UTF8_MAXBYTES+1];
1134     uvchr_to_utf8(tmpbuf, c);
1135     return is_utf8_punct(tmpbuf);
1136 }
1137
1138 bool
1139 Perl_is_uni_xdigit(pTHX_ UV c)
1140 {
1141     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1142     uvchr_to_utf8(tmpbuf, c);
1143     return is_utf8_xdigit(tmpbuf);
1144 }
1145
1146 UV
1147 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1148 {
1149     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1150
1151     uvchr_to_utf8(p, c);
1152     return to_utf8_upper(p, p, lenp);
1153 }
1154
1155 UV
1156 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1157 {
1158     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1159
1160     uvchr_to_utf8(p, c);
1161     return to_utf8_title(p, p, lenp);
1162 }
1163
1164 UV
1165 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1166 {
1167     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1168
1169     uvchr_to_utf8(p, c);
1170     return to_utf8_lower(p, p, lenp);
1171 }
1172
1173 UV
1174 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1175 {
1176     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1177
1178     uvchr_to_utf8(p, c);
1179     return to_utf8_fold(p, p, lenp);
1180 }
1181
1182 /* for now these all assume no locale info available for Unicode > 255 */
1183
1184 bool
1185 Perl_is_uni_alnum_lc(pTHX_ UV c)
1186 {
1187     return is_uni_alnum(c);     /* XXX no locale support yet */
1188 }
1189
1190 bool
1191 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1192 {
1193     return is_uni_idfirst(c);   /* XXX no locale support yet */
1194 }
1195
1196 bool
1197 Perl_is_uni_alpha_lc(pTHX_ UV c)
1198 {
1199     return is_uni_alpha(c);     /* XXX no locale support yet */
1200 }
1201
1202 bool
1203 Perl_is_uni_ascii_lc(pTHX_ UV c)
1204 {
1205     return is_uni_ascii(c);     /* XXX no locale support yet */
1206 }
1207
1208 bool
1209 Perl_is_uni_space_lc(pTHX_ UV c)
1210 {
1211     return is_uni_space(c);     /* XXX no locale support yet */
1212 }
1213
1214 bool
1215 Perl_is_uni_digit_lc(pTHX_ UV c)
1216 {
1217     return is_uni_digit(c);     /* XXX no locale support yet */
1218 }
1219
1220 bool
1221 Perl_is_uni_upper_lc(pTHX_ UV c)
1222 {
1223     return is_uni_upper(c);     /* XXX no locale support yet */
1224 }
1225
1226 bool
1227 Perl_is_uni_lower_lc(pTHX_ UV c)
1228 {
1229     return is_uni_lower(c);     /* XXX no locale support yet */
1230 }
1231
1232 bool
1233 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1234 {
1235     return is_uni_cntrl(c);     /* XXX no locale support yet */
1236 }
1237
1238 bool
1239 Perl_is_uni_graph_lc(pTHX_ UV c)
1240 {
1241     return is_uni_graph(c);     /* XXX no locale support yet */
1242 }
1243
1244 bool
1245 Perl_is_uni_print_lc(pTHX_ UV c)
1246 {
1247     return is_uni_print(c);     /* XXX no locale support yet */
1248 }
1249
1250 bool
1251 Perl_is_uni_punct_lc(pTHX_ UV c)
1252 {
1253     return is_uni_punct(c);     /* XXX no locale support yet */
1254 }
1255
1256 bool
1257 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1258 {
1259     return is_uni_xdigit(c);    /* XXX no locale support yet */
1260 }
1261
1262 U32
1263 Perl_to_uni_upper_lc(pTHX_ U32 c)
1264 {
1265     /* XXX returns only the first character -- do not use XXX */
1266     /* XXX no locale support yet */
1267     STRLEN len;
1268     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1269     return (U32)to_uni_upper(c, tmpbuf, &len);
1270 }
1271
1272 U32
1273 Perl_to_uni_title_lc(pTHX_ U32 c)
1274 {
1275     /* XXX returns only the first character XXX -- do not use XXX */
1276     /* XXX no locale support yet */
1277     STRLEN len;
1278     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1279     return (U32)to_uni_title(c, tmpbuf, &len);
1280 }
1281
1282 U32
1283 Perl_to_uni_lower_lc(pTHX_ U32 c)
1284 {
1285     /* XXX returns only the first character -- do not use XXX */
1286     /* XXX no locale support yet */
1287     STRLEN len;
1288     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1289     return (U32)to_uni_lower(c, tmpbuf, &len);
1290 }
1291
1292 static bool
1293 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1294                  const char *const swashname)
1295 {
1296     dVAR;
1297
1298     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1299
1300     if (!is_utf8_char(p))
1301         return FALSE;
1302     if (!*swash)
1303         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1304     return swash_fetch(*swash, p, TRUE) != 0;
1305 }
1306
1307 bool
1308 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1309 {
1310     dVAR;
1311
1312     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1313
1314     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1315      * descendant of isalnum(3), in other words, it doesn't
1316      * contain the '_'. --jhi */
1317     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1318 }
1319
1320 bool
1321 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1322 {
1323     dVAR;
1324
1325     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1326
1327     if (*p == '_')
1328         return TRUE;
1329     /* is_utf8_idstart would be more logical. */
1330     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1331 }
1332
1333 bool
1334 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1335 {
1336     dVAR;
1337
1338     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1339
1340     if (*p == '_')
1341         return TRUE;
1342     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1343 }
1344
1345 bool
1346 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1347 {
1348     dVAR;
1349
1350     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1351
1352     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1353 }
1354
1355 bool
1356 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1357 {
1358     dVAR;
1359
1360     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1361
1362     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1363 }
1364
1365 bool
1366 Perl_is_utf8_space(pTHX_ const U8 *p)
1367 {
1368     dVAR;
1369
1370     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1371
1372     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1373 }
1374
1375 bool
1376 Perl_is_utf8_digit(pTHX_ const U8 *p)
1377 {
1378     dVAR;
1379
1380     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1381
1382     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1383 }
1384
1385 bool
1386 Perl_is_utf8_upper(pTHX_ const U8 *p)
1387 {
1388     dVAR;
1389
1390     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1391
1392     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1393 }
1394
1395 bool
1396 Perl_is_utf8_lower(pTHX_ const U8 *p)
1397 {
1398     dVAR;
1399
1400     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1401
1402     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1403 }
1404
1405 bool
1406 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1407 {
1408     dVAR;
1409
1410     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1411
1412     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1413 }
1414
1415 bool
1416 Perl_is_utf8_graph(pTHX_ const U8 *p)
1417 {
1418     dVAR;
1419
1420     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1421
1422     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1423 }
1424
1425 bool
1426 Perl_is_utf8_print(pTHX_ const U8 *p)
1427 {
1428     dVAR;
1429
1430     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1431
1432     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1433 }
1434
1435 bool
1436 Perl_is_utf8_punct(pTHX_ const U8 *p)
1437 {
1438     dVAR;
1439
1440     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1441
1442     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1443 }
1444
1445 bool
1446 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1447 {
1448     dVAR;
1449
1450     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1451
1452     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1453 }
1454
1455 bool
1456 Perl_is_utf8_mark(pTHX_ const U8 *p)
1457 {
1458     dVAR;
1459
1460     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1461
1462     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1463 }
1464
1465 /*
1466 =for apidoc to_utf8_case
1467
1468 The "p" contains the pointer to the UTF-8 string encoding
1469 the character that is being converted.
1470
1471 The "ustrp" is a pointer to the character buffer to put the
1472 conversion result to.  The "lenp" is a pointer to the length
1473 of the result.
1474
1475 The "swashp" is a pointer to the swash to use.
1476
1477 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1478 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1479 but not always, a multicharacter mapping), is tried first.
1480
1481 The "special" is a string like "utf8::ToSpecLower", which means the
1482 hash %utf8::ToSpecLower.  The access to the hash is through
1483 Perl_to_utf8_case().
1484
1485 The "normal" is a string like "ToLower" which means the swash
1486 %utf8::ToLower.
1487
1488 =cut */
1489
1490 UV
1491 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1492                         SV **swashp, const char *normal, const char *special)
1493 {
1494     dVAR;
1495     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1496     STRLEN len = 0;
1497     const UV uv0 = utf8_to_uvchr(p, NULL);
1498     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1499      * are necessary in EBCDIC, they are redundant no-ops
1500      * in ASCII-ish platforms, and hopefully optimized away. */
1501     const UV uv1 = NATIVE_TO_UNI(uv0);
1502
1503     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1504
1505     uvuni_to_utf8(tmpbuf, uv1);
1506
1507     if (!*swashp) /* load on-demand */
1508          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1509
1510     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1511     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1512          /* It might be "special" (sometimes, but not always,
1513           * a multicharacter mapping) */
1514          HV * const hv = get_hv(special, 0);
1515          SV **svp;
1516
1517          if (hv &&
1518              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1519              (*svp)) {
1520              const char *s;
1521
1522               s = SvPV_const(*svp, len);
1523               if (len == 1)
1524                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1525               else {
1526 #ifdef EBCDIC
1527                    /* If we have EBCDIC we need to remap the characters
1528                     * since any characters in the low 256 are Unicode
1529                     * code points, not EBCDIC. */
1530                    U8 *t = (U8*)s, *tend = t + len, *d;
1531
1532                    d = tmpbuf;
1533                    if (SvUTF8(*svp)) {
1534                         STRLEN tlen = 0;
1535
1536                         while (t < tend) {
1537                              const UV c = utf8_to_uvchr(t, &tlen);
1538                              if (tlen > 0) {
1539                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1540                                   t += tlen;
1541                              }
1542                              else
1543                                   break;
1544                         }
1545                    }
1546                    else {
1547                         while (t < tend) {
1548                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1549                              t++;
1550                         }
1551                    }
1552                    len = d - tmpbuf;
1553                    Copy(tmpbuf, ustrp, len, U8);
1554 #else
1555                    Copy(s, ustrp, len, U8);
1556 #endif
1557               }
1558          }
1559     }
1560
1561     if (!len && *swashp) {
1562         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1563
1564          if (uv2) {
1565               /* It was "normal" (a single character mapping). */
1566               const UV uv3 = UNI_TO_NATIVE(uv2);
1567               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1568          }
1569     }
1570
1571     if (!len) /* Neither: just copy. */
1572          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1573
1574     if (lenp)
1575          *lenp = len;
1576
1577     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1578 }
1579
1580 /*
1581 =for apidoc to_utf8_upper
1582
1583 Convert the UTF-8 encoded character at p to its uppercase version and
1584 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1585 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1586 the uppercase version may be longer than the original character.
1587
1588 The first character of the uppercased version is returned
1589 (but note, as explained above, that there may be more.)
1590
1591 =cut */
1592
1593 UV
1594 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1595 {
1596     dVAR;
1597
1598     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1599
1600     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1601                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1602 }
1603
1604 /*
1605 =for apidoc to_utf8_title
1606
1607 Convert the UTF-8 encoded character at p to its titlecase version and
1608 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1609 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1610 titlecase version may be longer than the original character.
1611
1612 The first character of the titlecased version is returned
1613 (but note, as explained above, that there may be more.)
1614
1615 =cut */
1616
1617 UV
1618 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1619 {
1620     dVAR;
1621
1622     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1623
1624     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1625                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1626 }
1627
1628 /*
1629 =for apidoc to_utf8_lower
1630
1631 Convert the UTF-8 encoded character at p to its lowercase version and
1632 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1633 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1634 lowercase version may be longer than the original character.
1635
1636 The first character of the lowercased version is returned
1637 (but note, as explained above, that there may be more.)
1638
1639 =cut */
1640
1641 UV
1642 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1643 {
1644     dVAR;
1645
1646     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1647
1648     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1649                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1650 }
1651
1652 /*
1653 =for apidoc to_utf8_fold
1654
1655 Convert the UTF-8 encoded character at p to its foldcase version and
1656 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1657 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1658 foldcase version may be longer than the original character (up to
1659 three characters).
1660
1661 The first character of the foldcased version is returned
1662 (but note, as explained above, that there may be more.)
1663
1664 =cut */
1665
1666 UV
1667 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1668 {
1669     dVAR;
1670
1671     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1672
1673     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1674                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1675 }
1676
1677 /* Note:
1678  * A "swash" is a swatch hash.
1679  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1680  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1681  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1682  */
1683 SV*
1684 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1685 {
1686     dVAR;
1687     SV* retval;
1688     dSP;
1689     const size_t pkg_len = strlen(pkg);
1690     const size_t name_len = strlen(name);
1691     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1692     SV* errsv_save;
1693
1694     PERL_ARGS_ASSERT_SWASH_INIT;
1695
1696     PUSHSTACKi(PERLSI_MAGIC);
1697     ENTER;
1698     SAVEI32(PL_hints);
1699     PL_hints = 0;
1700     save_re_context();
1701     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1702         ENTER;
1703         errsv_save = newSVsv(ERRSV);
1704         /* It is assumed that callers of this routine are not passing in any
1705            user derived data.  */
1706         /* Need to do this after save_re_context() as it will set PL_tainted to
1707            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1708            Even line to create errsv_save can turn on PL_tainted.  */
1709         SAVEBOOL(PL_tainted);
1710         PL_tainted = 0;
1711         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1712                          NULL);
1713         if (!SvTRUE(ERRSV))
1714             sv_setsv(ERRSV, errsv_save);
1715         SvREFCNT_dec(errsv_save);
1716         LEAVE;
1717     }
1718     SPAGAIN;
1719     PUSHMARK(SP);
1720     EXTEND(SP,5);
1721     mPUSHp(pkg, pkg_len);
1722     mPUSHp(name, name_len);
1723     PUSHs(listsv);
1724     mPUSHi(minbits);
1725     mPUSHi(none);
1726     PUTBACK;
1727     errsv_save = newSVsv(ERRSV);
1728     if (call_method("SWASHNEW", G_SCALAR))
1729         retval = newSVsv(*PL_stack_sp--);
1730     else
1731         retval = &PL_sv_undef;
1732     if (!SvTRUE(ERRSV))
1733         sv_setsv(ERRSV, errsv_save);
1734     SvREFCNT_dec(errsv_save);
1735     LEAVE;
1736     POPSTACK;
1737     if (IN_PERL_COMPILETIME) {
1738         CopHINTS_set(PL_curcop, PL_hints);
1739     }
1740     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1741         if (SvPOK(retval))
1742             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1743                        SVfARG(retval));
1744         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1745     }
1746     return retval;
1747 }
1748
1749
1750 /* This API is wrong for special case conversions since we may need to
1751  * return several Unicode characters for a single Unicode character
1752  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1753  * the lower-level routine, and it is similarly broken for returning
1754  * multiple values.  --jhi */
1755 /* Now SWASHGET is recasted into S_swash_get in this file. */
1756
1757 /* Note:
1758  * Returns the value of property/mapping C<swash> for the first character
1759  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1760  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1761  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1762  */
1763 UV
1764 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1765 {
1766     dVAR;
1767     HV *const hv = MUTABLE_HV(SvRV(swash));
1768     U32 klen;
1769     U32 off;
1770     STRLEN slen;
1771     STRLEN needents;
1772     const U8 *tmps = NULL;
1773     U32 bit;
1774     SV *swatch;
1775     U8 tmputf8[2];
1776     const UV c = NATIVE_TO_ASCII(*ptr);
1777
1778     PERL_ARGS_ASSERT_SWASH_FETCH;
1779
1780     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1781         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1782         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1783         ptr = tmputf8;
1784     }
1785     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1786      * then the "swatch" is a vec() for al the chars which start
1787      * with 0xAA..0xYY
1788      * So the key in the hash (klen) is length of encoded char -1
1789      */
1790     klen = UTF8SKIP(ptr) - 1;
1791     off  = ptr[klen];
1792
1793     if (klen == 0) {
1794       /* If char in invariant then swatch is for all the invariant chars
1795        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1796        */
1797         needents = UTF_CONTINUATION_MARK;
1798         off      = NATIVE_TO_UTF(ptr[klen]);
1799     }
1800     else {
1801       /* If char is encoded then swatch is for the prefix */
1802         needents = (1 << UTF_ACCUMULATION_SHIFT);
1803         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1804     }
1805
1806     /*
1807      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1808      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1809      * it's nothing to sniff at.)  Pity we usually come through at least
1810      * two function calls to get here...
1811      *
1812      * NB: this code assumes that swatches are never modified, once generated!
1813      */
1814
1815     if (hv   == PL_last_swash_hv &&
1816         klen == PL_last_swash_klen &&
1817         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1818     {
1819         tmps = PL_last_swash_tmps;
1820         slen = PL_last_swash_slen;
1821     }
1822     else {
1823         /* Try our second-level swatch cache, kept in a hash. */
1824         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1825
1826         /* If not cached, generate it via swash_get */
1827         if (!svp || !SvPOK(*svp)
1828                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1829             /* We use utf8n_to_uvuni() as we want an index into
1830                Unicode tables, not a native character number.
1831              */
1832             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1833                                            ckWARN(WARN_UTF8) ?
1834                                            0 : UTF8_ALLOW_ANY);
1835             swatch = swash_get(swash,
1836                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1837                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1838                                 needents);
1839
1840             if (IN_PERL_COMPILETIME)
1841                 CopHINTS_set(PL_curcop, PL_hints);
1842
1843             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1844
1845             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1846                      || (slen << 3) < needents)
1847                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1848         }
1849
1850         PL_last_swash_hv = hv;
1851         assert(klen <= sizeof(PL_last_swash_key));
1852         PL_last_swash_klen = (U8)klen;
1853         /* FIXME change interpvar.h?  */
1854         PL_last_swash_tmps = (U8 *) tmps;
1855         PL_last_swash_slen = slen;
1856         if (klen)
1857             Copy(ptr, PL_last_swash_key, klen, U8);
1858     }
1859
1860     switch ((int)((slen << 3) / needents)) {
1861     case 1:
1862         bit = 1 << (off & 7);
1863         off >>= 3;
1864         return (tmps[off] & bit) != 0;
1865     case 8:
1866         return tmps[off];
1867     case 16:
1868         off <<= 1;
1869         return (tmps[off] << 8) + tmps[off + 1] ;
1870     case 32:
1871         off <<= 2;
1872         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1873     }
1874     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1875     NORETURN_FUNCTION_END;
1876 }
1877
1878 /* Note:
1879  * Returns a swatch (a bit vector string) for a code point sequence
1880  * that starts from the value C<start> and comprises the number C<span>.
1881  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1882  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1883  */
1884 STATIC SV*
1885 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1886 {
1887     SV *swatch;
1888     U8 *l, *lend, *x, *xend, *s;
1889     STRLEN lcur, xcur, scur;
1890     HV *const hv = MUTABLE_HV(SvRV(swash));
1891     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1892     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1893     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1894     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1895     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1896     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1897     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1898     const STRLEN bits  = SvUV(*bitssvp);
1899     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1900     const UV     none  = SvUV(*nonesvp);
1901     const UV     end   = start + span;
1902
1903     PERL_ARGS_ASSERT_SWASH_GET;
1904
1905     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1906         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1907                                                  (UV)bits);
1908     }
1909
1910     /* create and initialize $swatch */
1911     scur   = octets ? (span * octets) : (span + 7) / 8;
1912     swatch = newSV(scur);
1913     SvPOK_on(swatch);
1914     s = (U8*)SvPVX(swatch);
1915     if (octets && none) {
1916         const U8* const e = s + scur;
1917         while (s < e) {
1918             if (bits == 8)
1919                 *s++ = (U8)(none & 0xff);
1920             else if (bits == 16) {
1921                 *s++ = (U8)((none >>  8) & 0xff);
1922                 *s++ = (U8)( none        & 0xff);
1923             }
1924             else if (bits == 32) {
1925                 *s++ = (U8)((none >> 24) & 0xff);
1926                 *s++ = (U8)((none >> 16) & 0xff);
1927                 *s++ = (U8)((none >>  8) & 0xff);
1928                 *s++ = (U8)( none        & 0xff);
1929             }
1930         }
1931         *s = '\0';
1932     }
1933     else {
1934         (void)memzero((U8*)s, scur + 1);
1935     }
1936     SvCUR_set(swatch, scur);
1937     s = (U8*)SvPVX(swatch);
1938
1939     /* read $swash->{LIST} */
1940     l = (U8*)SvPV(*listsvp, lcur);
1941     lend = l + lcur;
1942     while (l < lend) {
1943         UV min, max, val;
1944         STRLEN numlen;
1945         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1946
1947         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1948
1949         numlen = lend - l;
1950         min = grok_hex((char *)l, &numlen, &flags, NULL);
1951         if (numlen)
1952             l += numlen;
1953         else if (nl) {
1954             l = nl + 1; /* 1 is length of "\n" */
1955             continue;
1956         }
1957         else {
1958             l = lend; /* to LIST's end at which \n is not found */
1959             break;
1960         }
1961
1962         if (isBLANK(*l)) {
1963             ++l;
1964             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1965             numlen = lend - l;
1966             max = grok_hex((char *)l, &numlen, &flags, NULL);
1967             if (numlen)
1968                 l += numlen;
1969             else
1970                 max = min;
1971
1972             if (octets) {
1973                 if (isBLANK(*l)) {
1974                     ++l;
1975                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1976                             PERL_SCAN_DISALLOW_PREFIX;
1977                     numlen = lend - l;
1978                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1979                     if (numlen)
1980                         l += numlen;
1981                     else
1982                         val = 0;
1983                 }
1984                 else {
1985                     val = 0;
1986                     if (typeto) {
1987                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1988                                          typestr, l);
1989                     }
1990                 }
1991             }
1992             else
1993                 val = 0; /* bits == 1, then val should be ignored */
1994         }
1995         else {
1996             max = min;
1997             if (octets) {
1998                 val = 0;
1999                 if (typeto) {
2000                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2001                 }
2002             }
2003             else
2004                 val = 0; /* bits == 1, then val should be ignored */
2005         }
2006
2007         if (nl)
2008             l = nl + 1;
2009         else
2010             l = lend;
2011
2012         if (max < start)
2013             continue;
2014
2015         if (octets) {
2016             UV key;
2017             if (min < start) {
2018                 if (!none || val < none) {
2019                     val += start - min;
2020                 }
2021                 min = start;
2022             }
2023             for (key = min; key <= max; key++) {
2024                 STRLEN offset;
2025                 if (key >= end)
2026                     goto go_out_list;
2027                 /* offset must be non-negative (start <= min <= key < end) */
2028                 offset = octets * (key - start);
2029                 if (bits == 8)
2030                     s[offset] = (U8)(val & 0xff);
2031                 else if (bits == 16) {
2032                     s[offset    ] = (U8)((val >>  8) & 0xff);
2033                     s[offset + 1] = (U8)( val        & 0xff);
2034                 }
2035                 else if (bits == 32) {
2036                     s[offset    ] = (U8)((val >> 24) & 0xff);
2037                     s[offset + 1] = (U8)((val >> 16) & 0xff);
2038                     s[offset + 2] = (U8)((val >>  8) & 0xff);
2039                     s[offset + 3] = (U8)( val        & 0xff);
2040                 }
2041
2042                 if (!none || val < none)
2043                     ++val;
2044             }
2045         }
2046         else { /* bits == 1, then val should be ignored */
2047             UV key;
2048             if (min < start)
2049                 min = start;
2050             for (key = min; key <= max; key++) {
2051                 const STRLEN offset = (STRLEN)(key - start);
2052                 if (key >= end)
2053                     goto go_out_list;
2054                 s[offset >> 3] |= 1 << (offset & 7);
2055             }
2056         }
2057     } /* while */
2058   go_out_list:
2059
2060     /* read $swash->{EXTRAS} */
2061     x = (U8*)SvPV(*extssvp, xcur);
2062     xend = x + xcur;
2063     while (x < xend) {
2064         STRLEN namelen;
2065         U8 *namestr;
2066         SV** othersvp;
2067         HV* otherhv;
2068         STRLEN otherbits;
2069         SV **otherbitssvp, *other;
2070         U8 *s, *o, *nl;
2071         STRLEN slen, olen;
2072
2073         const U8 opc = *x++;
2074         if (opc == '\n')
2075             continue;
2076
2077         nl = (U8*)memchr(x, '\n', xend - x);
2078
2079         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2080             if (nl) {
2081                 x = nl + 1; /* 1 is length of "\n" */
2082                 continue;
2083             }
2084             else {
2085                 x = xend; /* to EXTRAS' end at which \n is not found */
2086                 break;
2087             }
2088         }
2089
2090         namestr = x;
2091         if (nl) {
2092             namelen = nl - namestr;
2093             x = nl + 1;
2094         }
2095         else {
2096             namelen = xend - namestr;
2097             x = xend;
2098         }
2099
2100         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2101         otherhv = MUTABLE_HV(SvRV(*othersvp));
2102         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2103         otherbits = (STRLEN)SvUV(*otherbitssvp);
2104         if (bits < otherbits)
2105             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2106
2107         /* The "other" swatch must be destroyed after. */
2108         other = swash_get(*othersvp, start, span);
2109         o = (U8*)SvPV(other, olen);
2110
2111         if (!olen)
2112             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2113
2114         s = (U8*)SvPV(swatch, slen);
2115         if (bits == 1 && otherbits == 1) {
2116             if (slen != olen)
2117                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2118
2119             switch (opc) {
2120             case '+':
2121                 while (slen--)
2122                     *s++ |= *o++;
2123                 break;
2124             case '!':
2125                 while (slen--)
2126                     *s++ |= ~*o++;
2127                 break;
2128             case '-':
2129                 while (slen--)
2130                     *s++ &= ~*o++;
2131                 break;
2132             case '&':
2133                 while (slen--)
2134                     *s++ &= *o++;
2135                 break;
2136             default:
2137                 break;
2138             }
2139         }
2140         else {
2141             STRLEN otheroctets = otherbits >> 3;
2142             STRLEN offset = 0;
2143             U8* const send = s + slen;
2144
2145             while (s < send) {
2146                 UV otherval = 0;
2147
2148                 if (otherbits == 1) {
2149                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2150                     ++offset;
2151                 }
2152                 else {
2153                     STRLEN vlen = otheroctets;
2154                     otherval = *o++;
2155                     while (--vlen) {
2156                         otherval <<= 8;
2157                         otherval |= *o++;
2158                     }
2159                 }
2160
2161                 if (opc == '+' && otherval)
2162                     NOOP;   /* replace with otherval */
2163                 else if (opc == '!' && !otherval)
2164                     otherval = 1;
2165                 else if (opc == '-' && otherval)
2166                     otherval = 0;
2167                 else if (opc == '&' && !otherval)
2168                     otherval = 0;
2169                 else {
2170                     s += octets; /* no replacement */
2171                     continue;
2172                 }
2173
2174                 if (bits == 8)
2175                     *s++ = (U8)( otherval & 0xff);
2176                 else if (bits == 16) {
2177                     *s++ = (U8)((otherval >>  8) & 0xff);
2178                     *s++ = (U8)( otherval        & 0xff);
2179                 }
2180                 else if (bits == 32) {
2181                     *s++ = (U8)((otherval >> 24) & 0xff);
2182                     *s++ = (U8)((otherval >> 16) & 0xff);
2183                     *s++ = (U8)((otherval >>  8) & 0xff);
2184                     *s++ = (U8)( otherval        & 0xff);
2185                 }
2186             }
2187         }
2188         sv_free(other); /* through with it! */
2189     } /* while */
2190     return swatch;
2191 }
2192
2193 /*
2194 =for apidoc uvchr_to_utf8
2195
2196 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2197 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2198 bytes available. The return value is the pointer to the byte after the
2199 end of the new character. In other words,
2200
2201     d = uvchr_to_utf8(d, uv);
2202
2203 is the recommended wide native character-aware way of saying
2204
2205     *(d++) = uv;
2206
2207 =cut
2208 */
2209
2210 /* On ASCII machines this is normally a macro but we want a
2211    real function in case XS code wants it
2212 */
2213 U8 *
2214 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2215 {
2216     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2217
2218     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2219 }
2220
2221 U8 *
2222 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2223 {
2224     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2225
2226     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2227 }
2228
2229 /*
2230 =for apidoc utf8n_to_uvchr
2231 flags
2232
2233 Returns the native character value of the first character in the string
2234 C<s>
2235 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2236 length, in bytes, of that character.
2237
2238 Allows length and flags to be passed to low level routine.
2239
2240 =cut
2241 */
2242 /* On ASCII machines this is normally a macro but we want
2243    a real function in case XS code wants it
2244 */
2245 UV
2246 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2247 U32 flags)
2248 {
2249     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2250
2251     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2252
2253     return UNI_TO_NATIVE(uv);
2254 }
2255
2256 /*
2257 =for apidoc pv_uni_display
2258
2259 Build to the scalar dsv a displayable version of the string spv,
2260 length len, the displayable version being at most pvlim bytes long
2261 (if longer, the rest is truncated and "..." will be appended).
2262
2263 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2264 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2265 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2266 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2267 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2268 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2269
2270 The pointer to the PV of the dsv is returned.
2271
2272 =cut */
2273 char *
2274 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2275 {
2276     int truncated = 0;
2277     const char *s, *e;
2278
2279     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2280
2281     sv_setpvs(dsv, "");
2282     SvUTF8_off(dsv);
2283     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2284          UV u;
2285           /* This serves double duty as a flag and a character to print after
2286              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2287           */
2288          char ok = 0;
2289
2290          if (pvlim && SvCUR(dsv) >= pvlim) {
2291               truncated++;
2292               break;
2293          }
2294          u = utf8_to_uvchr((U8*)s, 0);
2295          if (u < 256) {
2296              const unsigned char c = (unsigned char)u & 0xFF;
2297              if (flags & UNI_DISPLAY_BACKSLASH) {
2298                  switch (c) {
2299                  case '\n':
2300                      ok = 'n'; break;
2301                  case '\r':
2302                      ok = 'r'; break;
2303                  case '\t':
2304                      ok = 't'; break;
2305                  case '\f':
2306                      ok = 'f'; break;
2307                  case '\a':
2308                      ok = 'a'; break;
2309                  case '\\':
2310                      ok = '\\'; break;
2311                  default: break;
2312                  }
2313                  if (ok) {
2314                      const char string = ok;
2315                      sv_catpvs(dsv, "\\");
2316                      sv_catpvn(dsv, &string, 1);
2317                  }
2318              }
2319              /* isPRINT() is the locale-blind version. */
2320              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2321                  const char string = c;
2322                  sv_catpvn(dsv, &string, 1);
2323                  ok = 1;
2324              }
2325          }
2326          if (!ok)
2327              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2328     }
2329     if (truncated)
2330          sv_catpvs(dsv, "...");
2331
2332     return SvPVX(dsv);
2333 }
2334
2335 /*
2336 =for apidoc sv_uni_display
2337
2338 Build to the scalar dsv a displayable version of the scalar sv,
2339 the displayable version being at most pvlim bytes long
2340 (if longer, the rest is truncated and "..." will be appended).
2341
2342 The flags argument is as in pv_uni_display().
2343
2344 The pointer to the PV of the dsv is returned.
2345
2346 =cut
2347 */
2348 char *
2349 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2350 {
2351     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2352
2353      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2354                                 SvCUR(ssv), pvlim, flags);
2355 }
2356
2357 /*
2358 =for apidoc ibcmp_utf8
2359
2360 Return true if the strings s1 and s2 differ case-insensitively, false
2361 if not (if they are equal case-insensitively).  If u1 is true, the
2362 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2363 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2364 are false, the respective string is assumed to be in native 8-bit
2365 encoding.
2366
2367 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2368 in there (they will point at the beginning of the I<next> character).
2369 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2370 pointers beyond which scanning will not continue under any
2371 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2372 s2+l2 will be used as goal end pointers that will also stop the scan,
2373 and which qualify towards defining a successful match: all the scans
2374 that define an explicit length must reach their goal pointers for
2375 a match to succeed).
2376
2377 For case-insensitiveness, the "casefolding" of Unicode is used
2378 instead of upper/lowercasing both the characters, see
2379 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2380
2381 =cut */
2382 I32
2383 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2384 {
2385      dVAR;
2386      register const U8 *p1  = (const U8*)s1;
2387      register const U8 *p2  = (const U8*)s2;
2388      register const U8 *f1 = NULL;
2389      register const U8 *f2 = NULL;
2390      register U8 *e1 = NULL;
2391      register U8 *q1 = NULL;
2392      register U8 *e2 = NULL;
2393      register U8 *q2 = NULL;
2394      STRLEN n1 = 0, n2 = 0;
2395      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2396      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2397      U8 natbuf[1+1];
2398      STRLEN foldlen1, foldlen2;
2399      bool match;
2400
2401      PERL_ARGS_ASSERT_IBCMP_UTF8;
2402
2403      if (pe1)
2404           e1 = *(U8**)pe1;
2405      /* assert(e1 || l1); */
2406      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2407           f1 = (const U8*)s1 + l1;
2408      if (pe2)
2409           e2 = *(U8**)pe2;
2410      /* assert(e2 || l2); */
2411      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2412           f2 = (const U8*)s2 + l2;
2413
2414      /* This shouldn't happen. However, putting an assert() there makes some
2415       * tests fail. */
2416      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2417      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2418           return 1; /* mismatch; possible infinite loop or false positive */
2419
2420      if (!u1 || !u2)
2421           natbuf[1] = 0; /* Need to terminate the buffer. */
2422
2423      while ((e1 == 0 || p1 < e1) &&
2424             (f1 == 0 || p1 < f1) &&
2425             (e2 == 0 || p2 < e2) &&
2426             (f2 == 0 || p2 < f2)) {
2427           if (n1 == 0) {
2428                if (u1)
2429                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2430                else {
2431                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2432                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2433                }
2434                q1 = foldbuf1;
2435                n1 = foldlen1;
2436           }
2437           if (n2 == 0) {
2438                if (u2)
2439                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2440                else {
2441                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2442                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2443                }
2444                q2 = foldbuf2;
2445                n2 = foldlen2;
2446           }
2447           while (n1 && n2) {
2448                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2449                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2450                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2451                    return 1; /* mismatch */
2452                n1 -= UTF8SKIP(q1);
2453                q1 += UTF8SKIP(q1);
2454                n2 -= UTF8SKIP(q2);
2455                q2 += UTF8SKIP(q2);
2456           }
2457           if (n1 == 0)
2458                p1 += u1 ? UTF8SKIP(p1) : 1;
2459           if (n2 == 0)
2460                p2 += u2 ? UTF8SKIP(p2) : 1;
2461
2462      }
2463
2464      /* A match is defined by all the scans that specified
2465       * an explicit length reaching their final goals. */
2466      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2467
2468      if (match) {
2469           if (pe1)
2470                *pe1 = (char*)p1;
2471           if (pe2)
2472                *pe2 = (char*)p2;
2473      }
2474
2475      return match ? 0 : 1; /* 0 match, 1 mismatch */
2476 }
2477
2478 /*
2479  * Local variables:
2480  * c-indentation-style: bsd
2481  * c-basic-offset: 4
2482  * indent-tabs-mode: t
2483  * End:
2484  *
2485  * ex: set ts=8 sts=4 sw=4 noet:
2486  */