utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34
  35 #ifndef EBCDIC
   36 /* Separate prototypes are needed because on ASCII systems these are
   37  * usually macros, but they are still compiled as code, too. */
  38 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  39 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  40 #endif
  41
  42 static const char unees[] =
  43     "Malformed UTF-8 character (unexpected end of string)";
  44
  45 /*
  46 =head1 Unicode Support
  47
  48 This file contains various utility functions for manipulating UTF8-encoded
  49 strings. For the uninitiated, this is a method of representing arbitrary
  50 Unicode characters as a variable number of bytes, in such a way that
  51 characters in the ASCII range are unmodified, and a zero byte never appears
  52 within non-zero characters.
  53
  54 =cut
  55 */
  56
  57 /*
  58 =for apidoc is_ascii_string
  59
  60 Returns true if first C<len> bytes of the given string are ASCII (i.e. none
  61 of them even raise the question of UTF-8-ness).
  62
  63 See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  64
  65 =cut
  66 */
  67
  68 bool
  69 Perl_is_ascii_string(const U8 *s, STRLEN len)
  70 {
  71     const U8* const send = s + (len ? len : strlen((const char *)s));
  72     const U8* x = s;
  73
  74     PERL_ARGS_ASSERT_IS_ASCII_STRING;
  75
  76     for (; x < send; ++x) {
  77         if (!UTF8_IS_INVARIANT(*x))
  78             break;
  79     }
  80
  81     return x == send;
  82 }
  83
  84 /*
  85 =for apidoc uvuni_to_utf8_flags
  86
  87 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
   88 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
  89 bytes available. The return value is the pointer to the byte after the
  90 end of the new character. In other words,
  91
  92     d = uvuni_to_utf8_flags(d, uv, flags);
  93
  94 or, in most cases,
  95
  96     d = uvuni_to_utf8(d, uv);
  97
  98 (which is equivalent to)
  99
 100     d = uvuni_to_utf8_flags(d, uv, 0);
 101
 102 is the recommended Unicode-aware way of saying
 103
 104     *(d++) = uv;
 105
 106 =cut
 107 */
 108
 109 U8 *
 110 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 111 {
 112     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
 113
 114     if (ckWARN(WARN_UTF8)) {
 115          if (UNICODE_IS_SURROGATE(uv) &&
 116              !(flags & UNICODE_ALLOW_SURROGATE))
 117               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
 118          else if (
 119                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
 120                     !(flags & UNICODE_ALLOW_FDD0))
 121                    ||
 122                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
 123                     !(flags & UNICODE_ALLOW_FFFF))) &&
 124                   /* UNICODE_ALLOW_SUPER includes
 125                    * FFFEs and FFFFs beyond 0x10FFFF. */
 126                   ((uv <= PERL_UNICODE_MAX) ||
 127                    !(flags & UNICODE_ALLOW_SUPER))
 128                   )
 129               Perl_warner(aTHX_ packWARN(WARN_UTF8),
 130                          "Unicode character 0x%04"UVxf" is illegal", uv);
 131     }
 132     if (UNI_IS_INVARIANT(uv)) {
 133         *d++ = (U8)UTF_TO_NATIVE(uv);
 134         return d;
 135     }
 136 #if defined(EBCDIC)
 137     else {
 138         STRLEN len  = UNISKIP(uv);
 139         U8 *p = d+len-1;
 140         while (p > d) {
 141             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 142             uv >>= UTF_ACCUMULATION_SHIFT;
 143         }
 144         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 145         return d+len;
 146     }
 147 #else /* Non loop style */
 148     if (uv < 0x800) {
 149         *d++ = (U8)(( uv >>  6)         | 0xc0);
 150         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 151         return d;
 152     }
 153     if (uv < 0x10000) {
 154         *d++ = (U8)(( uv >> 12)         | 0xe0);
 155         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 156         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 157         return d;
 158     }
 159     if (uv < 0x200000) {
 160         *d++ = (U8)(( uv >> 18)         | 0xf0);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166     if (uv < 0x4000000) {
 167         *d++ = (U8)(( uv >> 24)         | 0xf8);
 168         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 169         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 170         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 171         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 172         return d;
 173     }
 174     if (uv < 0x80000000) {
 175         *d++ = (U8)(( uv >> 30)         | 0xfc);
 176         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 177         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 178         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 179         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 180         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 181         return d;
 182     }
 183 #ifdef HAS_QUAD
 184     if (uv < UTF8_QUAD_MAX)
 185 #endif
 186     {
 187         *d++ =                            0xfe; /* Can't match U+FEFF! */
 188         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 189         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 190         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 191         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 192         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 193         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 194         return d;
 195     }
 196 #ifdef HAS_QUAD
 197     {
 198         *d++ =                            0xff;         /* Can't match U+FFFE! */
 199         *d++ =                            0x80;         /* 6 Reserved bits */
 200         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 201         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 202         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 203         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 204         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 205         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 206         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 207         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 208         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 209         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 210         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 211         return d;
 212     }
 213 #endif
 214 #endif /* Loop style */
 215 }
 216
 217 /*
 218
 219 Tests if some arbitrary number of bytes begins in a valid UTF-8
 220 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 221 UTF-8 character.  The actual number of bytes in the UTF-8 character
 222 will be returned if it is valid, otherwise 0.
 223
 224 This is the "slow" version as opposed to the "fast" version which is
 225 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 226 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 227 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 228 you should use the _slow().  In practice this means that the _slow()
 229 will be used very rarely, since the maximum Unicode code point (as of
 230 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 231 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 232 five bytes or more.
 233
 234 =cut */
 235 STATIC STRLEN
 236 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 237 {
 238     U8 u = *s;
 239     STRLEN slen;
 240     UV uv, ouv;
 241
 242     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 243
 244     if (UTF8_IS_INVARIANT(u))
 245         return 1;
 246
 247     if (!UTF8_IS_START(u))
 248         return 0;
 249
 250     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 251         return 0;
 252
 253     slen = len - 1;
 254     s++;
 255 #ifdef EBCDIC
 256     u = NATIVE_TO_UTF(u);
 257 #endif
 258     u &= UTF_START_MASK(len);
 259     uv  = u;
 260     ouv = uv;
 261     while (slen--) {
 262         if (!UTF8_IS_CONTINUATION(*s))
 263             return 0;
 264         uv = UTF8_ACCUMULATE(uv, *s);
 265         if (uv < ouv)
 266             return 0;
 267         ouv = uv;
 268         s++;
 269     }
 270
 271     if ((STRLEN)UNISKIP(uv) < len)
 272         return 0;
 273
 274     return len;
 275 }
 276
 277 /*
 278 =for apidoc is_utf8_char
 279
 280 Tests if some arbitrary number of bytes begins in a valid UTF-8
 281 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
 282 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
 283 character will be returned if it is valid, otherwise 0.
 284
 285 =cut */
 286 STRLEN
 287 Perl_is_utf8_char(const U8 *s)
 288 {
 289     const STRLEN len = UTF8SKIP(s);
 290
 291     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 292 #ifdef IS_UTF8_CHAR
 293     if (IS_UTF8_CHAR_FAST(len))
 294         return IS_UTF8_CHAR(s, len) ? len : 0;
 295 #endif /* #ifdef IS_UTF8_CHAR */
 296     return is_utf8_char_slow(s, len);
 297 }
 298
 299
 300 /*
 301 =for apidoc is_utf8_string
 302
 303 Returns true if first C<len> bytes of the given string form a valid
 304 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 305 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 306 because a valid ASCII string is a valid UTF-8 string.
 307
 308 See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
 309
 310 =cut
 311 */
 312
 313 bool
 314 Perl_is_utf8_string(const U8 *s, STRLEN len)
 315 {
 316     const U8* const send = s + (len ? len : strlen((const char *)s));
 317     const U8* x = s;
 318
 319     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 320
 321     while (x < send) {
 322         STRLEN c;
 323          /* Inline the easy bits of is_utf8_char() here for speed... */
 324          if (UTF8_IS_INVARIANT(*x))
 325               c = 1;
 326          else if (!UTF8_IS_START(*x))
 327              goto out;
 328          else {
 329               /* ... and call is_utf8_char() only if really needed. */
 330 #ifdef IS_UTF8_CHAR
 331              c = UTF8SKIP(x);
 332              if (IS_UTF8_CHAR_FAST(c)) {
 333                  if (!IS_UTF8_CHAR(x, c))
 334                      c = 0;
 335              }
 336              else
 337                 c = is_utf8_char_slow(x, c);
 338 #else
 339              c = is_utf8_char(x);
 340 #endif /* #ifdef IS_UTF8_CHAR */
 341               if (!c)
 342                   goto out;
 343          }
 344         x += c;
 345     }
 346
 347  out:
 348     if (x != send)
 349         return FALSE;
 350
 351     return TRUE;
 352 }
 353
 354 /*
 355 Implemented as a macro in utf8.h
 356
 357 =for apidoc is_utf8_string_loc
 358
 359 Like is_utf8_string() but stores the location of the failure (in the
 360 case of "utf8ness failure") or the location s+len (in the case of
 361 "utf8ness success") in the C<ep>.
 362
 363 See also is_utf8_string_loclen() and is_utf8_string().
 364
 365 =for apidoc is_utf8_string_loclen
 366
 367 Like is_utf8_string() but stores the location of the failure (in the
 368 case of "utf8ness failure") or the location s+len (in the case of
 369 "utf8ness success") in the C<ep>, and the number of UTF-8
 370 encoded characters in the C<el>.
 371
 372 See also is_utf8_string_loc() and is_utf8_string().
 373
 374 =cut
 375 */
 376
 377 bool
 378 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 379 {
 380     const U8* const send = s + (len ? len : strlen((const char *)s));
 381     const U8* x = s;
 382     STRLEN c;
 383     STRLEN outlen = 0;
 384
 385     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 386
 387     while (x < send) {
 388          /* Inline the easy bits of is_utf8_char() here for speed... */
 389          if (UTF8_IS_INVARIANT(*x))
 390              c = 1;
 391          else if (!UTF8_IS_START(*x))
 392              goto out;
 393          else {
 394              /* ... and call is_utf8_char() only if really needed. */
 395 #ifdef IS_UTF8_CHAR
 396              c = UTF8SKIP(x);
 397              if (IS_UTF8_CHAR_FAST(c)) {
 398                  if (!IS_UTF8_CHAR(x, c))
 399                      c = 0;
 400              } else
 401                  c = is_utf8_char_slow(x, c);
 402 #else
 403              c = is_utf8_char(x);
 404 #endif /* #ifdef IS_UTF8_CHAR */
 405              if (!c)
 406                  goto out;
 407          }
 408          x += c;
 409          outlen++;
 410     }
 411
 412  out:
 413     if (el)
 414         *el = outlen;
 415
 416     if (ep)
 417         *ep = x;
 418     return (x == send);
 419 }
 420
 421 /*
 422
 423 =for apidoc utf8n_to_uvuni
 424
 425 Bottom level UTF-8 decode routine.
 426 Returns the Unicode code point value of the first character in the string C<s>
 427 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 428 C<retlen> will be set to the length, in bytes, of that character.
 429
 430 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 431 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 432 it is assumed that the caller will raise a warning, and this function
 433 will silently just set C<retlen> to C<-1> and return zero.  If the
 434 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 435 malformations will be given, C<retlen> will be set to the expected
 436 length of the UTF-8 character in bytes, and zero will be returned.
 437
 438 The C<flags> can also contain various flags to allow deviations from
 439 the strict UTF-8 encoding (see F<utf8.h>).
 440
 441 Most code should use utf8_to_uvchr() rather than call this directly.
 442
 443 =cut
 444 */
 445
 446 UV
 447 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 448 {
 449     dVAR;
 450     const U8 * const s0 = s;
 451     UV uv = *s, ouv = 0;
 452     STRLEN len = 1;
 453     const bool dowarn = ckWARN_d(WARN_UTF8);
 454     const UV startbyte = *s;
 455     STRLEN expectlen = 0;
 456     U32 warning = 0;
 457
 458     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 459
 460 /* This list is a superset of the UTF8_ALLOW_XXX. */
 461
 462 #define UTF8_WARN_EMPTY                          1
 463 #define UTF8_WARN_CONTINUATION                   2
 464 #define UTF8_WARN_NON_CONTINUATION               3
 465 #define UTF8_WARN_FE_FF                          4
 466 #define UTF8_WARN_SHORT                          5
 467 #define UTF8_WARN_OVERFLOW                       6
 468 #define UTF8_WARN_SURROGATE                      7
 469 #define UTF8_WARN_LONG                           8
 470 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 471
 472     if (curlen == 0 &&
 473         !(flags & UTF8_ALLOW_EMPTY)) {
 474         warning = UTF8_WARN_EMPTY;
 475         goto malformed;
 476     }
 477
 478     if (UTF8_IS_INVARIANT(uv)) {
 479         if (retlen)
 480             *retlen = 1;
 481         return (UV) (NATIVE_TO_UTF(*s));
 482     }
 483
 484     if (UTF8_IS_CONTINUATION(uv) &&
 485         !(flags & UTF8_ALLOW_CONTINUATION)) {
 486         warning = UTF8_WARN_CONTINUATION;
 487         goto malformed;
 488     }
 489
 490     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 491         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 492         warning = UTF8_WARN_NON_CONTINUATION;
 493         goto malformed;
 494     }
 495
 496 #ifdef EBCDIC
 497     uv = NATIVE_TO_UTF(uv);
 498 #else
 499     if ((uv == 0xfe || uv == 0xff) &&
 500         !(flags & UTF8_ALLOW_FE_FF)) {
 501         warning = UTF8_WARN_FE_FF;
 502         goto malformed;
 503     }
 504 #endif
 505
 506     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 507     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 508     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 509     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 510 #ifdef EBCDIC
 511     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 512     else                        { len =  7; uv &= 0x01; }
 513 #else
 514     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 515     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 516     else                        { len = 13; uv = 0; } /* whoa! */
 517 #endif
 518
 519     if (retlen)
 520         *retlen = len;
 521
 522     expectlen = len;
 523
 524     if ((curlen < expectlen) &&
 525         !(flags & UTF8_ALLOW_SHORT)) {
 526         warning = UTF8_WARN_SHORT;
 527         goto malformed;
 528     }
 529
 530     len--;
 531     s++;
 532     ouv = uv;
 533
 534     while (len--) {
 535         if (!UTF8_IS_CONTINUATION(*s) &&
 536             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 537             s--;
 538             warning = UTF8_WARN_NON_CONTINUATION;
 539             goto malformed;
 540         }
 541         else
 542             uv = UTF8_ACCUMULATE(uv, *s);
 543         if (!(uv > ouv)) {
 544             /* These cannot be allowed. */
 545             if (uv == ouv) {
 546                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 547                     warning = UTF8_WARN_LONG;
 548                     goto malformed;
 549                 }
 550             }
 551             else { /* uv < ouv */
 552                 /* This cannot be allowed. */
 553                 warning = UTF8_WARN_OVERFLOW;
 554                 goto malformed;
 555             }
 556         }
 557         s++;
 558         ouv = uv;
 559     }
 560
 561     if (UNICODE_IS_SURROGATE(uv) &&
 562         !(flags & UTF8_ALLOW_SURROGATE)) {
 563         warning = UTF8_WARN_SURROGATE;
 564         goto malformed;
 565     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 566                !(flags & UTF8_ALLOW_LONG)) {
 567         warning = UTF8_WARN_LONG;
 568         goto malformed;
 569     } else if (UNICODE_IS_ILLEGAL(uv) &&
 570                !(flags & UTF8_ALLOW_FFFF)) {
 571         warning = UTF8_WARN_FFFF;
 572         goto malformed;
 573     }
 574
 575     return uv;
 576
 577 malformed:
 578
 579     if (flags & UTF8_CHECK_ONLY) {
 580         if (retlen)
 581             *retlen = ((STRLEN) -1);
 582         return 0;
 583     }
 584
 585     if (dowarn) {
 586         SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
 587
 588         switch (warning) {
 589         case 0: /* Intentionally empty. */ break;
 590         case UTF8_WARN_EMPTY:
 591             sv_catpvs(sv, "(empty string)");
 592             break;
 593         case UTF8_WARN_CONTINUATION:
 594             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 595             break;
 596         case UTF8_WARN_NON_CONTINUATION:
 597             if (s == s0)
 598                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 599                            (UV)s[1], startbyte);
 600             else {
 601                 const int len = (int)(s-s0);
 602                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 603                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 604             }
 605
 606             break;
 607         case UTF8_WARN_FE_FF:
 608             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 609             break;
 610         case UTF8_WARN_SHORT:
 611             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 612                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 613             expectlen = curlen;         /* distance for caller to skip */
 614             break;
 615         case UTF8_WARN_OVERFLOW:
 616             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 617                            ouv, *s, startbyte);
 618             break;
 619         case UTF8_WARN_SURROGATE:
 620             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 621             break;
 622         case UTF8_WARN_LONG:
 623             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 624                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 625             break;
 626         case UTF8_WARN_FFFF:
 627             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 628             break;
 629         default:
 630             sv_catpvs(sv, "(unknown reason)");
 631             break;
 632         }
 633
 634         if (warning) {
 635             const char * const s = SvPVX_const(sv);
 636
 637             if (PL_op)
 638                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 639                             "%s in %s", s,  OP_DESC(PL_op));
 640             else
 641                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 642         }
 643     }
 644
 645     if (retlen)
 646         *retlen = expectlen ? expectlen : len;
 647
 648     return 0;
 649 }
 650
 651 /*
 652 =for apidoc utf8_to_uvchr
 653
 654 Returns the native character value of the first character in the string C<s>
 655 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 656 length, in bytes, of that character.
 657
 658 If C<s> does not point to a well-formed UTF-8 character, zero is
 659 returned and retlen is set, if possible, to -1.
 660
 661 =cut
 662 */
 663
 664 UV
 665 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 666 {
 667     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 668
 669     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 670                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 671 }
 672
 673 /*
 674 =for apidoc utf8_to_uvuni
 675
 676 Returns the Unicode code point of the first character in the string C<s>
 677 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 678 length, in bytes, of that character.
 679
 680 This function should only be used when the returned UV is considered
 681 an index into the Unicode semantic tables (e.g. swashes).
 682
 683 If C<s> does not point to a well-formed UTF-8 character, zero is
 684 returned and retlen is set, if possible, to -1.
 685
 686 =cut
 687 */
 688
 689 UV
 690 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 691 {
 692     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 693
 694     /* Call the low level routine asking for checks */
 695     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 696                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 697 }
 698
 699 /*
 700 =for apidoc utf8_length
 701
 702 Return the length of the UTF-8 char encoded string C<s> in characters.
  703 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
  704 up past C<e>, warns (it does not croak) if UTF-8 warnings are enabled.
 705
 706 =cut
 707 */
 708
 709 STRLEN
 710 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 711 {
 712     dVAR;
 713     STRLEN len = 0;
 714
 715     PERL_ARGS_ASSERT_UTF8_LENGTH;
 716
 717     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 718      * the bitops (especially ~) can create illegal UTF-8.
 719      * In other words: in Perl UTF-8 is not just for Unicode. */
 720
 721     if (e < s)
 722         goto warn_and_return;
 723     while (s < e) {
 724         if (!UTF8_IS_INVARIANT(*s))
 725             s += UTF8SKIP(s);
 726         else
 727             s++;
 728         len++;
 729     }
 730
 731     if (e != s) {
 732         len--;
 733         warn_and_return:
 734         if (ckWARN_d(WARN_UTF8)) {
 735             if (PL_op)
 736                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 737                             "%s in %s", unees, OP_DESC(PL_op));
 738             else
 739                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 740         }
 741     }
 742
 743     return len;
 744 }
 745
 746 /*
 747 =for apidoc utf8_distance
 748
 749 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 750 and C<b>.
 751
 752 WARNING: use only if you *know* that the pointers point inside the
 753 same UTF-8 buffer.
 754
 755 =cut
 756 */
 757
 758 IV
 759 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 760 {
 761     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 762
 763     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 764 }
 765
 766 /*
 767 =for apidoc utf8_hop
 768
 769 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 770 forward or backward.
 771
 772 WARNING: do not use the following unless you *know* C<off> is within
 773 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 774 on the first byte of character or just after the last byte of a character.
 775
 776 =cut
 777 */
 778
 779 U8 *
 780 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 781 {
 782     PERL_ARGS_ASSERT_UTF8_HOP;
 783
 784     PERL_UNUSED_CONTEXT;
 785     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 786      * the bitops (especially ~) can create illegal UTF-8.
 787      * In other words: in Perl UTF-8 is not just for Unicode. */
 788
 789     if (off >= 0) {
 790         while (off--)
 791             s += UTF8SKIP(s);
 792     }
 793     else {
 794         while (off++) {
 795             s--;
 796             while (UTF8_IS_CONTINUATION(*s))
 797                 s--;
 798         }
 799     }
 800     return (U8 *)s;
 801 }
 802
 803 /*
 804 =for apidoc utf8_to_bytes
 805
 806 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 807 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 808 updates len to contain the new length.
 809 Returns zero on failure, setting C<len> to -1.
 810
 811 If you need a copy of the string, see C<bytes_from_utf8>.
 812
 813 =cut
 814 */
 815
 816 U8 *
 817 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 818 {
 819     U8 * const save = s;
 820     U8 * const send = s + *len;
 821     U8 *d;
 822
 823     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
 824
 825     /* ensure valid UTF-8 and chars < 256 before updating string */
 826     while (s < send) {
 827         U8 c = *s++;
 828
 829         if (!UTF8_IS_INVARIANT(c) &&
 830             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 831              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 832             *len = ((STRLEN) -1);
 833             return 0;
 834         }
 835     }
 836
 837     d = s = save;
 838     while (s < send) {
 839         STRLEN ulen;
 840         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 841         s += ulen;
 842     }
 843     *d = '\0';
 844     *len = d - save;
 845     return save;
 846 }
 847
 848 /*
 849 =for apidoc bytes_from_utf8
 850
 851 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 852 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 853 the newly-created string, and updates C<len> to contain the new
 854 length.  Returns the original string if no conversion occurs, C<len>
 855 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 856 0 if C<s> is converted or consisted entirely of characters that are invariant
 857 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
 858
 859 =cut
 860 */
 861
 862 U8 *
 863 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 864 {
 865     U8 *d;
 866     const U8 *start = s;
 867     const U8 *send;
 868     I32 count = 0;
 869
 870     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
 871
 872     PERL_UNUSED_CONTEXT;
 873     if (!*is_utf8)
 874         return (U8 *)start;
 875
 876     /* ensure valid UTF-8 and chars < 256 before converting string */
 877     for (send = s + *len; s < send;) {
 878         U8 c = *s++;
 879         if (!UTF8_IS_INVARIANT(c)) {
 880             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 881                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 882                 count++;
 883             else
 884                 return (U8 *)start;
 885         }
 886     }
 887
 888     *is_utf8 = FALSE;
 889
 890     Newx(d, (*len) - count + 1, U8);
 891     s = start; start = d;
 892     while (s < send) {
 893         U8 c = *s++;
 894         if (!UTF8_IS_INVARIANT(c)) {
 895             /* Then it is two-byte encoded */
 896             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 897             c = ASCII_TO_NATIVE(c);
 898         }
 899         *d++ = c;
 900     }
 901     *d = '\0';
 902     *len = d - start;
 903     return (U8 *)start;
 904 }
 905
 906 /*
 907 =for apidoc bytes_to_utf8
 908
 909 Converts a string C<s> of length C<len> from the native encoding into UTF-8.
 910 Returns a pointer to the newly-created string, and sets C<len> to
 911 reflect the new length.
 912
 913 A NUL character will be written after the end of the string.
 914
 915 If you want to convert to UTF-8 from encodings other than
 916 the native (Latin1 or EBCDIC),
 917 see sv_recode_to_utf8().
 918
 919 =cut
 920 */
 921
 922 U8*
 923 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 924 {
 925     const U8 * const send = s + (*len);
 926     U8 *d;
 927     U8 *dst;
 928
 929     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
 930     PERL_UNUSED_CONTEXT;
 931
 932     Newx(d, (*len) * 2 + 1, U8);
 933     dst = d;
 934
 935     while (s < send) {
 936         const UV uv = NATIVE_TO_ASCII(*s++);
 937         if (UNI_IS_INVARIANT(uv))
 938             *d++ = (U8)UTF_TO_NATIVE(uv);
 939         else {
 940             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 941             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 942         }
 943     }
 944     *d = '\0';
 945     *len = d-dst;
 946     return dst;
 947 }
 948
 949 /*
 950  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 951  *
 952  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 953  * We optimize for native, for obvious reasons. */
 954
 955 U8*
 956 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 957 {
 958     U8* pend;
 959     U8* dstart = d;
 960
 961     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
 962
 963     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 964          d[0] = 0;
 965          *newlen = 1;
 966          return d;
 967     }
 968
 969     if (bytelen & 1)
 970         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
 971
 972     pend = p + bytelen;
 973
 974     while (p < pend) {
 975         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 976         p += 2;
 977         if (uv < 0x80) {
 978 #ifdef EBCDIC
 979             *d++ = UNI_TO_NATIVE(uv);
 980 #else
 981             *d++ = (U8)uv;
 982 #endif
 983             continue;
 984         }
 985         if (uv < 0x800) {
 986             *d++ = (U8)(( uv >>  6)         | 0xc0);
 987             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 988             continue;
 989         }
 990         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 991             UV low = (p[0] << 8) + p[1];
 992             p += 2;
 993             if (low < 0xdc00 || low >= 0xdfff)
 994                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 995             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 996         }
 997         if (uv < 0x10000) {
 998             *d++ = (U8)(( uv >> 12)         | 0xe0);
 999             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1000             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1001             continue;
1002         }
1003         else {
1004             *d++ = (U8)(( uv >> 18)         | 0xf0);
1005             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1006             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1007             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1008             continue;
1009         }
1010     }
1011     *newlen = d - dstart;
1012     return d;
1013 }
1014
1015 /* Note: this one is slightly destructive of the source. */
1016
1017 U8*
1018 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1019 {
1020     U8* s = (U8*)p;
1021     U8* const send = s + bytelen;
1022
1023     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1024
1025     while (s < send) {
1026         const U8 tmp = s[0];
1027         s[0] = s[1];
1028         s[1] = tmp;
1029         s += 2;
1030     }
1031     return utf16_to_utf8(p, d, bytelen, newlen);
1032 }
1033
1034 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1035
1036 bool
1037 Perl_is_uni_alnum(pTHX_ UV c)
1038 {
1039     U8 tmpbuf[UTF8_MAXBYTES+1];
1040     uvchr_to_utf8(tmpbuf, c);
1041     return is_utf8_alnum(tmpbuf);
1042 }
1043
1044 bool
1045 Perl_is_uni_idfirst(pTHX_ UV c)
1046 {
1047     U8 tmpbuf[UTF8_MAXBYTES+1];
1048     uvchr_to_utf8(tmpbuf, c);
1049     return is_utf8_idfirst(tmpbuf);
1050 }
1051
1052 bool
1053 Perl_is_uni_alpha(pTHX_ UV c)
1054 {
1055     U8 tmpbuf[UTF8_MAXBYTES+1];
1056     uvchr_to_utf8(tmpbuf, c);
1057     return is_utf8_alpha(tmpbuf);
1058 }
1059
1060 bool
1061 Perl_is_uni_ascii(pTHX_ UV c)
1062 {
1063     U8 tmpbuf[UTF8_MAXBYTES+1];
1064     uvchr_to_utf8(tmpbuf, c);
1065     return is_utf8_ascii(tmpbuf);
1066 }
1067
1068 bool
1069 Perl_is_uni_space(pTHX_ UV c)
1070 {
1071     U8 tmpbuf[UTF8_MAXBYTES+1];
1072     uvchr_to_utf8(tmpbuf, c);
1073     return is_utf8_space(tmpbuf);
1074 }
1075
1076 bool
1077 Perl_is_uni_digit(pTHX_ UV c)
1078 {
1079     U8 tmpbuf[UTF8_MAXBYTES+1];
1080     uvchr_to_utf8(tmpbuf, c);
1081     return is_utf8_digit(tmpbuf);
1082 }
1083
1084 bool
1085 Perl_is_uni_upper(pTHX_ UV c)
1086 {
1087     U8 tmpbuf[UTF8_MAXBYTES+1];
1088     uvchr_to_utf8(tmpbuf, c);
1089     return is_utf8_upper(tmpbuf);
1090 }
1091
1092 bool
1093 Perl_is_uni_lower(pTHX_ UV c)
1094 {
1095     U8 tmpbuf[UTF8_MAXBYTES+1];
1096     uvchr_to_utf8(tmpbuf, c);
1097     return is_utf8_lower(tmpbuf);
1098 }
1099
1100 bool
1101 Perl_is_uni_cntrl(pTHX_ UV c)
1102 {
1103     U8 tmpbuf[UTF8_MAXBYTES+1];
1104     uvchr_to_utf8(tmpbuf, c);
1105     return is_utf8_cntrl(tmpbuf);
1106 }
1107
1108 bool
1109 Perl_is_uni_graph(pTHX_ UV c)
1110 {
1111     U8 tmpbuf[UTF8_MAXBYTES+1];
1112     uvchr_to_utf8(tmpbuf, c);
1113     return is_utf8_graph(tmpbuf);
1114 }
1115
1116 bool
1117 Perl_is_uni_print(pTHX_ UV c)
1118 {
1119     U8 tmpbuf[UTF8_MAXBYTES+1];
1120     uvchr_to_utf8(tmpbuf, c);
1121     return is_utf8_print(tmpbuf);
1122 }
1123
1124 bool
1125 Perl_is_uni_punct(pTHX_ UV c)
1126 {
1127     U8 tmpbuf[UTF8_MAXBYTES+1];
1128     uvchr_to_utf8(tmpbuf, c);
1129     return is_utf8_punct(tmpbuf);
1130 }
1131
1132 bool
1133 Perl_is_uni_xdigit(pTHX_ UV c)
1134 {
1135     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1136     uvchr_to_utf8(tmpbuf, c);
1137     return is_utf8_xdigit(tmpbuf);
1138 }
1139
1140 UV
1141 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1142 {
1143     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1144
1145     uvchr_to_utf8(p, c);
1146     return to_utf8_upper(p, p, lenp);
1147 }
1148
1149 UV
1150 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1151 {
1152     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1153
1154     uvchr_to_utf8(p, c);
1155     return to_utf8_title(p, p, lenp);
1156 }
1157
1158 UV
1159 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1160 {
1161     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1162
1163     uvchr_to_utf8(p, c);
1164     return to_utf8_lower(p, p, lenp);
1165 }
1166
1167 UV
1168 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1169 {
1170     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1171
1172     uvchr_to_utf8(p, c);
1173     return to_utf8_fold(p, p, lenp);
1174 }
1175
1176 /* for now these all assume no locale info available for Unicode > 255 */
1177
1178 bool
1179 Perl_is_uni_alnum_lc(pTHX_ UV c)
1180 {
1181     return is_uni_alnum(c);     /* XXX no locale support yet */
1182 }
1183
1184 bool
1185 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1186 {
1187     return is_uni_idfirst(c);   /* XXX no locale support yet */
1188 }
1189
1190 bool
1191 Perl_is_uni_alpha_lc(pTHX_ UV c)
1192 {
1193     return is_uni_alpha(c);     /* XXX no locale support yet */
1194 }
1195
1196 bool
1197 Perl_is_uni_ascii_lc(pTHX_ UV c)
1198 {
1199     return is_uni_ascii(c);     /* XXX no locale support yet */
1200 }
1201
1202 bool
1203 Perl_is_uni_space_lc(pTHX_ UV c)
1204 {
1205     return is_uni_space(c);     /* XXX no locale support yet */
1206 }
1207
1208 bool
1209 Perl_is_uni_digit_lc(pTHX_ UV c)
1210 {
1211     return is_uni_digit(c);     /* XXX no locale support yet */
1212 }
1213
1214 bool
1215 Perl_is_uni_upper_lc(pTHX_ UV c)
1216 {
1217     return is_uni_upper(c);     /* XXX no locale support yet */
1218 }
1219
1220 bool
1221 Perl_is_uni_lower_lc(pTHX_ UV c)
1222 {
1223     return is_uni_lower(c);     /* XXX no locale support yet */
1224 }
1225
1226 bool
1227 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1228 {
1229     return is_uni_cntrl(c);     /* XXX no locale support yet */
1230 }
1231
1232 bool
1233 Perl_is_uni_graph_lc(pTHX_ UV c)
1234 {
1235     return is_uni_graph(c);     /* XXX no locale support yet */
1236 }
1237
1238 bool
1239 Perl_is_uni_print_lc(pTHX_ UV c)
1240 {
1241     return is_uni_print(c);     /* XXX no locale support yet */
1242 }
1243
1244 bool
1245 Perl_is_uni_punct_lc(pTHX_ UV c)
1246 {
1247     return is_uni_punct(c);     /* XXX no locale support yet */
1248 }
1249
1250 bool
1251 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1252 {
1253     return is_uni_xdigit(c);    /* XXX no locale support yet */
1254 }
1255
1256 U32
1257 Perl_to_uni_upper_lc(pTHX_ U32 c)
1258 {
1259     /* XXX returns only the first character -- do not use XXX */
1260     /* XXX no locale support yet */
1261     STRLEN len;
1262     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1263     return (U32)to_uni_upper(c, tmpbuf, &len);
1264 }
1265
1266 U32
1267 Perl_to_uni_title_lc(pTHX_ U32 c)
1268 {
1269     /* XXX returns only the first character XXX -- do not use XXX */
1270     /* XXX no locale support yet */
1271     STRLEN len;
1272     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1273     return (U32)to_uni_title(c, tmpbuf, &len);
1274 }
1275
1276 U32
1277 Perl_to_uni_lower_lc(pTHX_ U32 c)
1278 {
1279     /* XXX returns only the first character -- do not use XXX */
1280     /* XXX no locale support yet */
1281     STRLEN len;
1282     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1283     return (U32)to_uni_lower(c, tmpbuf, &len);
1284 }
1285
1286 static bool
1287 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1288                  const char *const swashname)
1289 {
1290     dVAR;
1291
1292     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1293
1294     if (!is_utf8_char(p))
1295         return FALSE;
1296     if (!*swash)
1297         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1298     return swash_fetch(*swash, p, TRUE) != 0;
1299 }
1300
1301 bool
1302 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1303 {
1304     dVAR;
1305
1306     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1307
1308     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1309      * descendant of isalnum(3), in other words, it doesn't
1310      * contain the '_'. --jhi */
1311     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1312 }
1313
1314 bool
1315 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1316 {
1317     dVAR;
1318
1319     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1320
1321     if (*p == '_')
1322         return TRUE;
1323     /* is_utf8_idstart would be more logical. */
1324     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1325 }
1326
1327 bool
1328 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1329 {
1330     dVAR;
1331
1332     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1333
1334     if (*p == '_')
1335         return TRUE;
1336     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1337 }
1338
1339 bool
1340 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1341 {
1342     dVAR;
1343
1344     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1345
1346     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1347 }
1348
1349 bool
1350 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1351 {
1352     dVAR;
1353
1354     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1355
1356     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1357 }
1358
1359 bool
1360 Perl_is_utf8_space(pTHX_ const U8 *p)
1361 {
1362     dVAR;
1363
1364     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1365
1366     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1367 }
1368
1369 bool
1370 Perl_is_utf8_digit(pTHX_ const U8 *p)
1371 {
1372     dVAR;
1373
1374     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1375
1376     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1377 }
1378
1379 bool
1380 Perl_is_utf8_upper(pTHX_ const U8 *p)
1381 {
1382     dVAR;
1383
1384     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1385
1386     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1387 }
1388
1389 bool
1390 Perl_is_utf8_lower(pTHX_ const U8 *p)
1391 {
1392     dVAR;
1393
1394     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1395
1396     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1397 }
1398
1399 bool
1400 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1401 {
1402     dVAR;
1403
1404     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1405
1406     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1407 }
1408
1409 bool
1410 Perl_is_utf8_graph(pTHX_ const U8 *p)
1411 {
1412     dVAR;
1413
1414     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1415
1416     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1417 }
1418
1419 bool
1420 Perl_is_utf8_print(pTHX_ const U8 *p)
1421 {
1422     dVAR;
1423
1424     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1425
1426     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1427 }
1428
1429 bool
1430 Perl_is_utf8_punct(pTHX_ const U8 *p)
1431 {
1432     dVAR;
1433
1434     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1435
1436     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1437 }
1438
1439 bool
1440 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1441 {
1442     dVAR;
1443
1444     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1445
1446     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1447 }
1448
1449 bool
1450 Perl_is_utf8_mark(pTHX_ const U8 *p)
1451 {
1452     dVAR;
1453
1454     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1455
1456     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1457 }
1458
1459 /*
1460 =for apidoc to_utf8_case
1461
1462 The "p" contains the pointer to the UTF-8 string encoding
1463 the character that is being converted.
1464
1465 The "ustrp" is a pointer to the character buffer to put the
1466 conversion result to.  The "lenp" is a pointer to the length
1467 of the result.
1468
1469 The "swashp" is a pointer to the swash to use.
1470
Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
1472 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1473 but not always, a multicharacter mapping), is tried first.
1474
1475 The "special" is a string like "utf8::ToSpecLower", which means the
1476 hash %utf8::ToSpecLower.  The access to the hash is through
1477 Perl_to_utf8_case().
1478
1479 The "normal" is a string like "ToLower" which means the swash
1480 %utf8::ToLower.
1481
1482 =cut */
1483
1484 UV
1485 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1486                         SV **swashp, const char *normal, const char *special)
1487 {
1488     dVAR;
1489     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1490     STRLEN len = 0;
1491     const UV uv0 = utf8_to_uvchr(p, NULL);
1492     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1493      * are necessary in EBCDIC, they are redundant no-ops
1494      * in ASCII-ish platforms, and hopefully optimized away. */
1495     const UV uv1 = NATIVE_TO_UNI(uv0);
1496
1497     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1498
1499     uvuni_to_utf8(tmpbuf, uv1);
1500
1501     if (!*swashp) /* load on-demand */
1502          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1503
1504     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1505     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1506          /* It might be "special" (sometimes, but not always,
1507           * a multicharacter mapping) */
1508          HV * const hv = get_hv(special, 0);
1509          SV **svp;
1510
1511          if (hv &&
1512              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1513              (*svp)) {
1514              const char *s;
1515
1516               s = SvPV_const(*svp, len);
1517               if (len == 1)
1518                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1519               else {
1520 #ifdef EBCDIC
1521                    /* If we have EBCDIC we need to remap the characters
1522                     * since any characters in the low 256 are Unicode
1523                     * code points, not EBCDIC. */
1524                    U8 *t = (U8*)s, *tend = t + len, *d;
1525
1526                    d = tmpbuf;
1527                    if (SvUTF8(*svp)) {
1528                         STRLEN tlen = 0;
1529
1530                         while (t < tend) {
1531                              const UV c = utf8_to_uvchr(t, &tlen);
1532                              if (tlen > 0) {
1533                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1534                                   t += tlen;
1535                              }
1536                              else
1537                                   break;
1538                         }
1539                    }
1540                    else {
1541                         while (t < tend) {
1542                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1543                              t++;
1544                         }
1545                    }
1546                    len = d - tmpbuf;
1547                    Copy(tmpbuf, ustrp, len, U8);
1548 #else
1549                    Copy(s, ustrp, len, U8);
1550 #endif
1551               }
1552          }
1553     }
1554
1555     if (!len && *swashp) {
1556         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1557
1558          if (uv2) {
1559               /* It was "normal" (a single character mapping). */
1560               const UV uv3 = UNI_TO_NATIVE(uv2);
1561               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1562          }
1563     }
1564
1565     if (!len) /* Neither: just copy. */
1566          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1567
1568     if (lenp)
1569          *lenp = len;
1570
1571     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1572 }
1573
1574 /*
1575 =for apidoc to_utf8_upper
1576
1577 Convert the UTF-8 encoded character at p to its uppercase version and
1578 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1579 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1580 the uppercase version may be longer than the original character.
1581
1582 The first character of the uppercased version is returned
1583 (but note, as explained above, that there may be more.)
1584
1585 =cut */
1586
1587 UV
1588 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1589 {
1590     dVAR;
1591
1592     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1593
1594     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1595                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1596 }
1597
1598 /*
1599 =for apidoc to_utf8_title
1600
1601 Convert the UTF-8 encoded character at p to its titlecase version and
1602 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1603 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1604 titlecase version may be longer than the original character.
1605
1606 The first character of the titlecased version is returned
1607 (but note, as explained above, that there may be more.)
1608
1609 =cut */
1610
1611 UV
1612 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1613 {
1614     dVAR;
1615
1616     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1617
1618     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1619                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1620 }
1621
1622 /*
1623 =for apidoc to_utf8_lower
1624
1625 Convert the UTF-8 encoded character at p to its lowercase version and
1626 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1627 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1628 lowercase version may be longer than the original character.
1629
1630 The first character of the lowercased version is returned
1631 (but note, as explained above, that there may be more.)
1632
1633 =cut */
1634
1635 UV
1636 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1637 {
1638     dVAR;
1639
1640     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1641
1642     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1643                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1644 }
1645
1646 /*
1647 =for apidoc to_utf8_fold
1648
1649 Convert the UTF-8 encoded character at p to its foldcase version and
1650 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1651 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1652 foldcase version may be longer than the original character (up to
1653 three characters).
1654
1655 The first character of the foldcased version is returned
1656 (but note, as explained above, that there may be more.)
1657
1658 =cut */
1659
1660 UV
1661 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1662 {
1663     dVAR;
1664
1665     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1666
1667     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1668                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1669 }
1670
1671 /* Note:
1672  * A "swash" is a swatch hash.
1673  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1674  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1675  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1676  */
1677 SV*
1678 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1679 {
1680     dVAR;
1681     SV* retval;
1682     dSP;
1683     const size_t pkg_len = strlen(pkg);
1684     const size_t name_len = strlen(name);
1685     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1686     SV* errsv_save;
1687
1688     PERL_ARGS_ASSERT_SWASH_INIT;
1689
1690     PUSHSTACKi(PERLSI_MAGIC);
1691     ENTER;
1692     SAVEI32(PL_hints);
1693     PL_hints = 0;
1694     save_re_context();
1695     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1696         ENTER;
1697         errsv_save = newSVsv(ERRSV);
1698         /* It is assumed that callers of this routine are not passing in any
1699            user derived data.  */
1700         /* Need to do this after save_re_context() as it will set PL_tainted to
1701            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1702            Even line to create errsv_save can turn on PL_tainted.  */
1703         SAVEBOOL(PL_tainted);
1704         PL_tainted = 0;
1705         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1706                          NULL);
1707         if (!SvTRUE(ERRSV))
1708             sv_setsv(ERRSV, errsv_save);
1709         SvREFCNT_dec(errsv_save);
1710         LEAVE;
1711     }
1712     SPAGAIN;
1713     PUSHMARK(SP);
1714     EXTEND(SP,5);
1715     mPUSHp(pkg, pkg_len);
1716     mPUSHp(name, name_len);
1717     PUSHs(listsv);
1718     mPUSHi(minbits);
1719     mPUSHi(none);
1720     PUTBACK;
1721     errsv_save = newSVsv(ERRSV);
1722     if (call_method("SWASHNEW", G_SCALAR))
1723         retval = newSVsv(*PL_stack_sp--);
1724     else
1725         retval = &PL_sv_undef;
1726     if (!SvTRUE(ERRSV))
1727         sv_setsv(ERRSV, errsv_save);
1728     SvREFCNT_dec(errsv_save);
1729     LEAVE;
1730     POPSTACK;
1731     if (IN_PERL_COMPILETIME) {
1732         CopHINTS_set(PL_curcop, PL_hints);
1733     }
1734     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1735         if (SvPOK(retval))
1736             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1737                        SVfARG(retval));
1738         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1739     }
1740     return retval;
1741 }
1742
1743
1744 /* This API is wrong for special case conversions since we may need to
1745  * return several Unicode characters for a single Unicode character
1746  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1747  * the lower-level routine, and it is similarly broken for returning
1748  * multiple values.  --jhi */
/* Now SWASHGET is recast as S_swash_get in this file. */
1750
1751 /* Note:
1752  * Returns the value of property/mapping C<swash> for the first character
1753  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1754  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1755  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1756  */
1757 UV
1758 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1759 {
1760     dVAR;
1761     HV *const hv = MUTABLE_HV(SvRV(swash));
1762     U32 klen;
1763     U32 off;
1764     STRLEN slen;
1765     STRLEN needents;
1766     const U8 *tmps = NULL;
1767     U32 bit;
1768     SV *swatch;
1769     U8 tmputf8[2];
1770     const UV c = NATIVE_TO_ASCII(*ptr);
1771
1772     PERL_ARGS_ASSERT_SWASH_FETCH;
1773
1774     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1775         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1776         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1777         ptr = tmputf8;
1778     }
1779     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1780      * then the "swatch" is a vec() for al the chars which start
1781      * with 0xAA..0xYY
1782      * So the key in the hash (klen) is length of encoded char -1
1783      */
1784     klen = UTF8SKIP(ptr) - 1;
1785     off  = ptr[klen];
1786
1787     if (klen == 0) {
1788       /* If char in invariant then swatch is for all the invariant chars
1789        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1790        */
1791         needents = UTF_CONTINUATION_MARK;
1792         off      = NATIVE_TO_UTF(ptr[klen]);
1793     }
1794     else {
1795       /* If char is encoded then swatch is for the prefix */
1796         needents = (1 << UTF_ACCUMULATION_SHIFT);
1797         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1798     }
1799
1800     /*
1801      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1802      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1803      * it's nothing to sniff at.)  Pity we usually come through at least
1804      * two function calls to get here...
1805      *
1806      * NB: this code assumes that swatches are never modified, once generated!
1807      */
1808
1809     if (hv   == PL_last_swash_hv &&
1810         klen == PL_last_swash_klen &&
1811         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1812     {
1813         tmps = PL_last_swash_tmps;
1814         slen = PL_last_swash_slen;
1815     }
1816     else {
1817         /* Try our second-level swatch cache, kept in a hash. */
1818         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1819
1820         /* If not cached, generate it via swash_get */
1821         if (!svp || !SvPOK(*svp)
1822                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1823             /* We use utf8n_to_uvuni() as we want an index into
1824                Unicode tables, not a native character number.
1825              */
1826             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1827                                            ckWARN(WARN_UTF8) ?
1828                                            0 : UTF8_ALLOW_ANY);
1829             swatch = swash_get(swash,
1830                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1831                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1832                                 needents);
1833
1834             if (IN_PERL_COMPILETIME)
1835                 CopHINTS_set(PL_curcop, PL_hints);
1836
1837             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1838
1839             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1840                      || (slen << 3) < needents)
1841                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1842         }
1843
1844         PL_last_swash_hv = hv;
1845         assert(klen <= sizeof(PL_last_swash_key));
1846         PL_last_swash_klen = (U8)klen;
1847         /* FIXME change interpvar.h?  */
1848         PL_last_swash_tmps = (U8 *) tmps;
1849         PL_last_swash_slen = slen;
1850         if (klen)
1851             Copy(ptr, PL_last_swash_key, klen, U8);
1852     }
1853
1854     switch ((int)((slen << 3) / needents)) {
1855     case 1:
1856         bit = 1 << (off & 7);
1857         off >>= 3;
1858         return (tmps[off] & bit) != 0;
1859     case 8:
1860         return tmps[off];
1861     case 16:
1862         off <<= 1;
1863         return (tmps[off] << 8) + tmps[off + 1] ;
1864     case 32:
1865         off <<= 2;
1866         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1867     }
1868     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1869     NORETURN_FUNCTION_END;
1870 }
1871
1872 /* Note:
1873  * Returns a swatch (a bit vector string) for a code point sequence
1874  * that starts from the value C<start> and comprises the number C<span>.
1875  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1876  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1877  */
1878 STATIC SV*
1879 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1880 {
1881     SV *swatch;
1882     U8 *l, *lend, *x, *xend, *s;
1883     STRLEN lcur, xcur, scur;
1884     HV *const hv = MUTABLE_HV(SvRV(swash));
1885     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1886     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1887     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1888     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1889     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1890     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1891     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1892     const STRLEN bits  = SvUV(*bitssvp);
1893     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1894     const UV     none  = SvUV(*nonesvp);
1895     const UV     end   = start + span;
1896
1897     PERL_ARGS_ASSERT_SWASH_GET;
1898
1899     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1900         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1901                                                  (UV)bits);
1902     }
1903
1904     /* create and initialize $swatch */
1905     scur   = octets ? (span * octets) : (span + 7) / 8;
1906     swatch = newSV(scur);
1907     SvPOK_on(swatch);
1908     s = (U8*)SvPVX(swatch);
1909     if (octets && none) {
1910         const U8* const e = s + scur;
1911         while (s < e) {
1912             if (bits == 8)
1913                 *s++ = (U8)(none & 0xff);
1914             else if (bits == 16) {
1915                 *s++ = (U8)((none >>  8) & 0xff);
1916                 *s++ = (U8)( none        & 0xff);
1917             }
1918             else if (bits == 32) {
1919                 *s++ = (U8)((none >> 24) & 0xff);
1920                 *s++ = (U8)((none >> 16) & 0xff);
1921                 *s++ = (U8)((none >>  8) & 0xff);
1922                 *s++ = (U8)( none        & 0xff);
1923             }
1924         }
1925         *s = '\0';
1926     }
1927     else {
1928         (void)memzero((U8*)s, scur + 1);
1929     }
1930     SvCUR_set(swatch, scur);
1931     s = (U8*)SvPVX(swatch);
1932
1933     /* read $swash->{LIST} */
1934     l = (U8*)SvPV(*listsvp, lcur);
1935     lend = l + lcur;
1936     while (l < lend) {
1937         UV min, max, val;
1938         STRLEN numlen;
1939         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1940
1941         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1942
1943         numlen = lend - l;
1944         min = grok_hex((char *)l, &numlen, &flags, NULL);
1945         if (numlen)
1946             l += numlen;
1947         else if (nl) {
1948             l = nl + 1; /* 1 is length of "\n" */
1949             continue;
1950         }
1951         else {
1952             l = lend; /* to LIST's end at which \n is not found */
1953             break;
1954         }
1955
1956         if (isBLANK(*l)) {
1957             ++l;
1958             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1959             numlen = lend - l;
1960             max = grok_hex((char *)l, &numlen, &flags, NULL);
1961             if (numlen)
1962                 l += numlen;
1963             else
1964                 max = min;
1965
1966             if (octets) {
1967                 if (isBLANK(*l)) {
1968                     ++l;
1969                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1970                             PERL_SCAN_DISALLOW_PREFIX;
1971                     numlen = lend - l;
1972                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1973                     if (numlen)
1974                         l += numlen;
1975                     else
1976                         val = 0;
1977                 }
1978                 else {
1979                     val = 0;
1980                     if (typeto) {
1981                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1982                                          typestr, l);
1983                     }
1984                 }
1985             }
1986             else
1987                 val = 0; /* bits == 1, then val should be ignored */
1988         }
1989         else {
1990             max = min;
1991             if (octets) {
1992                 val = 0;
1993                 if (typeto) {
1994                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1995                 }
1996             }
1997             else
1998                 val = 0; /* bits == 1, then val should be ignored */
1999         }
2000
2001         if (nl)
2002             l = nl + 1;
2003         else
2004             l = lend;
2005
2006         if (max < start)
2007             continue;
2008
2009         if (octets) {
2010             UV key;
2011             if (min < start) {
2012                 if (!none || val < none) {
2013                     val += start - min;
2014                 }
2015                 min = start;
2016             }
2017             for (key = min; key <= max; key++) {
2018                 STRLEN offset;
2019                 if (key >= end)
2020                     goto go_out_list;
2021                 /* offset must be non-negative (start <= min <= key < end) */
2022                 offset = octets * (key - start);
2023                 if (bits == 8)
2024                     s[offset] = (U8)(val & 0xff);
2025                 else if (bits == 16) {
2026                     s[offset    ] = (U8)((val >>  8) & 0xff);
2027                     s[offset + 1] = (U8)( val        & 0xff);
2028                 }
2029                 else if (bits == 32) {
2030                     s[offset    ] = (U8)((val >> 24) & 0xff);
2031                     s[offset + 1] = (U8)((val >> 16) & 0xff);
2032                     s[offset + 2] = (U8)((val >>  8) & 0xff);
2033                     s[offset + 3] = (U8)( val        & 0xff);
2034                 }
2035
2036                 if (!none || val < none)
2037                     ++val;
2038             }
2039         }
2040         else { /* bits == 1, then val should be ignored */
2041             UV key;
2042             if (min < start)
2043                 min = start;
2044             for (key = min; key <= max; key++) {
2045                 const STRLEN offset = (STRLEN)(key - start);
2046                 if (key >= end)
2047                     goto go_out_list;
2048                 s[offset >> 3] |= 1 << (offset & 7);
2049             }
2050         }
2051     } /* while */
2052   go_out_list:
2053
2054     /* read $swash->{EXTRAS} */
2055     x = (U8*)SvPV(*extssvp, xcur);
2056     xend = x + xcur;
2057     while (x < xend) {
2058         STRLEN namelen;
2059         U8 *namestr;
2060         SV** othersvp;
2061         HV* otherhv;
2062         STRLEN otherbits;
2063         SV **otherbitssvp, *other;
2064         U8 *s, *o, *nl;
2065         STRLEN slen, olen;
2066
2067         const U8 opc = *x++;
2068         if (opc == '\n')
2069             continue;
2070
2071         nl = (U8*)memchr(x, '\n', xend - x);
2072
2073         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2074             if (nl) {
2075                 x = nl + 1; /* 1 is length of "\n" */
2076                 continue;
2077             }
2078             else {
2079                 x = xend; /* to EXTRAS' end at which \n is not found */
2080                 break;
2081             }
2082         }
2083
2084         namestr = x;
2085         if (nl) {
2086             namelen = nl - namestr;
2087             x = nl + 1;
2088         }
2089         else {
2090             namelen = xend - namestr;
2091             x = xend;
2092         }
2093
2094         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2095         otherhv = MUTABLE_HV(SvRV(*othersvp));
2096         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2097         otherbits = (STRLEN)SvUV(*otherbitssvp);
2098         if (bits < otherbits)
2099             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2100
2101         /* The "other" swatch must be destroyed after. */
2102         other = swash_get(*othersvp, start, span);
2103         o = (U8*)SvPV(other, olen);
2104
2105         if (!olen)
2106             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2107
2108         s = (U8*)SvPV(swatch, slen);
2109         if (bits == 1 && otherbits == 1) {
2110             if (slen != olen)
2111                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2112
2113             switch (opc) {
2114             case '+':
2115                 while (slen--)
2116                     *s++ |= *o++;
2117                 break;
2118             case '!':
2119                 while (slen--)
2120                     *s++ |= ~*o++;
2121                 break;
2122             case '-':
2123                 while (slen--)
2124                     *s++ &= ~*o++;
2125                 break;
2126             case '&':
2127                 while (slen--)
2128                     *s++ &= *o++;
2129                 break;
2130             default:
2131                 break;
2132             }
2133         }
2134         else {
2135             STRLEN otheroctets = otherbits >> 3;
2136             STRLEN offset = 0;
2137             U8* const send = s + slen;
2138
2139             while (s < send) {
2140                 UV otherval = 0;
2141
2142                 if (otherbits == 1) {
2143                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2144                     ++offset;
2145                 }
2146                 else {
2147                     STRLEN vlen = otheroctets;
2148                     otherval = *o++;
2149                     while (--vlen) {
2150                         otherval <<= 8;
2151                         otherval |= *o++;
2152                     }
2153                 }
2154
2155                 if (opc == '+' && otherval)
2156                     NOOP;   /* replace with otherval */
2157                 else if (opc == '!' && !otherval)
2158                     otherval = 1;
2159                 else if (opc == '-' && otherval)
2160                     otherval = 0;
2161                 else if (opc == '&' && !otherval)
2162                     otherval = 0;
2163                 else {
2164                     s += octets; /* no replacement */
2165                     continue;
2166                 }
2167
2168                 if (bits == 8)
2169                     *s++ = (U8)( otherval & 0xff);
2170                 else if (bits == 16) {
2171                     *s++ = (U8)((otherval >>  8) & 0xff);
2172                     *s++ = (U8)( otherval        & 0xff);
2173                 }
2174                 else if (bits == 32) {
2175                     *s++ = (U8)((otherval >> 24) & 0xff);
2176                     *s++ = (U8)((otherval >> 16) & 0xff);
2177                     *s++ = (U8)((otherval >>  8) & 0xff);
2178                     *s++ = (U8)( otherval        & 0xff);
2179                 }
2180             }
2181         }
2182         sv_free(other); /* through with it! */
2183     } /* while */
2184     return swatch;
2185 }
2186
2187 /*
2188 =for apidoc uvchr_to_utf8
2189
2190 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2191 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
2192 bytes available. The return value is the pointer to the byte after the
2193 end of the new character. In other words,
2194
2195     d = uvchr_to_utf8(d, uv);
2196
2197 is the recommended wide native character-aware way of saying
2198
2199     *(d++) = uv;
2200
2201 =cut
2202 */
2203
2204 /* ASCII builds normally reach this through a macro; the function body is
2205    kept so that XS code still has a real function to call.
2206 */
2207 U8 *
2208 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2209 {
2210     const UV uni = NATIVE_TO_UNI(uv);   /* native code point -> Unicode */
2211     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2212     return Perl_uvuni_to_utf8_flags(aTHX_ d, uni, 0);
2213 }
2214
2215 U8 *
2216 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2217 {   /* Like uvchr_to_utf8, but the caller's flags are passed through. */
2218     const UV uni = NATIVE_TO_UNI(uv);   /* native code point -> Unicode */
2219     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2220     return Perl_uvuni_to_utf8_flags(aTHX_ d, uni, flags);
2221 }
2222
2223 /*
2224 =for apidoc utf8n_to_uvchr
2225
2226
2227 Returns the native character value of the first character in the string
2228 C<s>
2229 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2230 length, in bytes, of that character.
2231
2232 Allows length and flags to be passed to low level routine.
2233
2234 =cut
2235 */
2236 /* Real-function form of utf8n_to_uvchr (normally a macro on ASCII builds),
2237    kept for XS code: forwards s, curlen, retlen and flags unchanged to
2238    utf8n_to_uvuni() and returns the result converted with UNI_TO_NATIVE. */
2239 UV
2240 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2241 U32 flags)
2242 {
2243     UV uv;      /* decoded value, before conversion to native */
2244     /* Run the argument assertions before s is handed to the decoder. */
2245     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2246     uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2247     return UNI_TO_NATIVE(uv);
2248 }
2249
2250 /*
2251 =for apidoc pv_uni_display
2252
2253 Build to the scalar dsv a displayable version of the string spv,
2254 length len, the displayable version being at most pvlim bytes long
2255 (if longer, the rest is truncated and "..." will be appended).
2256
2257 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2258 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2259 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2260 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2261 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2262 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2263
2264 The pointer to the PV of the dsv is returned.
2265
2266 =cut */
2267 char *
2268 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2269 {
2270     const char *cur;           /* start of the character being rendered */
2271     const char *stop;          /* one past the last input byte */
2272     int cut_short = 0;         /* set once the pvlim budget is used up */
2273
2274     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2275
2276     sv_setpvs(dsv, "");
2277     SvUTF8_off(dsv);
2278     cur = (const char *)spv;
2279     stop = cur + len;
2280     while (cur < stop) {
2281         UV cp;
2282         /* Stays 0 until something is emitted for this character; it then
2283            holds the letter written after the backslash, or 1 for a literal. */
2284         char done = 0;
2285
2286         if (pvlim && SvCUR(dsv) >= pvlim) {
2287             cut_short = 1;
2288             break;
2289         }
2290         cp = utf8_to_uvchr((U8*)cur, 0);
2291         if (cp < 256) {
2292             const unsigned char ch = (unsigned char)cp & 0xFF;
2293             if (flags & UNI_DISPLAY_BACKSLASH) {
2294                 if (ch == '\n')
2295                     done = 'n';
2296                 else if (ch == '\r')
2297                     done = 'r';
2298                 else if (ch == '\t')
2299                     done = 't';
2300                 else if (ch == '\f')
2301                     done = 'f';
2302                 else if (ch == '\a')
2303                     done = 'a';
2304                 else if (ch == '\\')
2305                     done = '\\';
2306                 if (done) {
2307                     const char letter = done;
2308                     sv_catpvs(dsv, "\\");
2309                     sv_catpvn(dsv, &letter, 1);
2310                 }
2311             }
2312             /* isPRINT() is the locale-blind version. */
2313             if (done == 0 && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(ch)) {
2314                 const char literal = ch;
2315                 sv_catpvn(dsv, &literal, 1);
2316                 done = 1;
2317             }
2318         }
2319         if (done == 0)
2320             Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", cp);
2321         cur += UTF8SKIP(cur);
2322     }
2323     if (cut_short)
2324          sv_catpvs(dsv, "...");
2325
2326     return SvPVX(dsv);
2327 }
2328
2329 /*
2330 =for apidoc sv_uni_display
2331
2332 Build to the scalar dsv a displayable version of the scalar sv,
2333 the displayable version being at most pvlim bytes long
2334 (if longer, the rest is truncated and "..." will be appended).
2335
2336 The flags argument is as in pv_uni_display().
2337
2338 The pointer to the PV of the dsv is returned.
2339
2340 =cut
2341 */
2342 char *
2343 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2344 {
2345     const U8 *bytes;    /* ssv's string buffer, as given by SvPVX_const */
2346     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2347     bytes = (const U8*)SvPVX_const(ssv);
2348     return Perl_pv_uni_display(aTHX_ dsv, bytes, SvCUR(ssv), pvlim, flags);
2349 }
2350
2351 /*
2352 =for apidoc ibcmp_utf8
2353
2354 Return true if the strings s1 and s2 differ case-insensitively, false
2355 if not (if they are equal case-insensitively).  If u1 is true, the
2356 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2357 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2358 are false, the respective string is assumed to be in native 8-bit
2359 encoding.
2360
2361 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2362 in there (they will point at the beginning of the I<next> character).
2363 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2364 pointers beyond which scanning will not continue under any
2365 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2366 s2+l2 will be used as goal end pointers that will also stop the scan,
2367 and which qualify towards defining a successful match: all the scans
2368 that define an explicit length must reach their goal pointers for
2369 a match to succeed.
2370
2371 For case-insensitiveness, the "casefolding" of Unicode is used
2372 instead of upper/lowercasing both the characters, see
2373 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2374
2375 =cut */
2376 I32
2377 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2378 {
2379      dVAR;
2380      register const U8 *p1  = (const U8*)s1; /* scan position in s1 */
2381      register const U8 *p2  = (const U8*)s2; /* scan position in s2 */
2382      register const U8 *f1 = NULL;   /* goal end of s1 (s1 + l1), if set */
2383      register const U8 *f2 = NULL;   /* goal end of s2 (s2 + l2), if set */
2384      register U8 *e1 = NULL;         /* hard end of s1, taken from *pe1 */
2385      register U8 *q1 = NULL;         /* read position within foldbuf1 */
2386      register U8 *e2 = NULL;         /* hard end of s2, taken from *pe2 */
2387      register U8 *q2 = NULL;         /* read position within foldbuf2 */
2388      STRLEN n1 = 0, n2 = 0;          /* fold bytes not yet compared */
2389      U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; /* case fold of the char at p1 */
2390      U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; /* case fold of the char at p2 */
2391      U8 natbuf[1+1];     /* UTF-8 form of one byte from a non-UTF-8 string */
2392      STRLEN foldlen1, foldlen2;      /* byte lengths of the two folds */
2393      bool match;
2394
2395      PERL_ARGS_ASSERT_IBCMP_UTF8;
2396      /* e1/e2: hard ends from the caller; f1/f2: goal ends from l1/l2. */
2397      if (pe1)
2398           e1 = *(U8**)pe1;
2399      /* assert(e1 || l1); */
2400      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2401           f1 = (const U8*)s1 + l1;
2402      if (pe2)
2403           e2 = *(U8**)pe2;
2404      /* assert(e2 || l2); */
2405      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2406           f2 = (const U8*)s2 + l2;
2407      /* Reject a string with no end at all, or no goal end on either side. */
2408      /* This shouldn't happen. However, putting an assert() there makes some
2409       * tests fail. */
2410      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2411      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2412           return 1; /* mismatch; possible infinite loop or false positive */
2413
2414      if (!u1 || !u2)
2415           natbuf[1] = 0; /* Need to terminate the buffer. */
2416      /* Keep going while both positions are short of every end that is set. */
2417      while ((e1 == 0 || p1 < e1) &&
2418             (f1 == 0 || p1 < f1) &&
2419             (e2 == 0 || p2 < e2) &&
2420             (f2 == 0 || p2 < f2)) {
2421           if (n1 == 0) { /* no pending fold for s1: fold the char at p1 */
2422                if (u1)
2423                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2424                else { /* native byte: convert it to UTF-8 before folding */
2425                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2426                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2427                }
2428                q1 = foldbuf1;
2429                n1 = foldlen1;
2430           }
2431           if (n2 == 0) { /* no pending fold for s2: fold the char at p2 */
2432                if (u2)
2433                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2434                else { /* native byte: convert it to UTF-8 before folding */
2435                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2436                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2437                }
2438                q2 = foldbuf2;
2439                n2 = foldlen2;
2440           }
2441           while (n1 && n2) { /* compare the folds one character at a time */
2442                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2443                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2444                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2445                    return 1; /* mismatch */
2446                n1 -= UTF8SKIP(q1);
2447                q1 += UTF8SKIP(q1);
2448                n2 -= UTF8SKIP(q2);
2449                q2 += UTF8SKIP(q2);
2450           }
2451           if (n1 == 0) /* fold of s1 fully compared: step past its source char */
2452                p1 += u1 ? UTF8SKIP(p1) : 1;
2453           if (n2 == 0) /* fold of s2 fully compared: step past its source char */
2454                p2 += u2 ? UTF8SKIP(p2) : 1;
2455
2456      }
2457
2458      /* A match is defined by all the scans that specified
2459       * an explicit length reaching their final goals. */
2460      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2461
2462      if (match) { /* scan positions are reported back only on a match */
2463           if (pe1)
2464                *pe1 = (char*)p1;
2465           if (pe2)
2466                *pe2 = (char*)p2;
2467      }
2468
2469      return match ? 0 : 1; /* 0 match, 1 mismatch */
2470 }
2471
2472 /*
2473  * Local variables:
2474  * c-indentation-style: bsd
2475  * c-basic-offset: 4
2476  * indent-tabs-mode: t
2477  * End:
2478  *
2479  * ex: set ts=8 sts=4 sw=4 noet:
2480  */