utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34
  35 #ifndef EBCDIC
  36 /* Separate prototypes needed because in ASCII systems these
  37  * usually macros but they still are compiled as code, too. */
  38 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  39 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  40 #endif
  41
  42 static const char unees[] =
  43     "Malformed UTF-8 character (unexpected end of string)";
  44
  45 /*
  46 =head1 Unicode Support
  47
  48 This file contains various utility functions for manipulating UTF8-encoded
  49 strings. For the uninitiated, this is a method of representing arbitrary
  50 Unicode characters as a variable number of bytes, in such a way that
  51 characters in the ASCII range are unmodified, and a zero byte never appears
  52 within non-zero characters.
  53
  54 =cut
  55 */
  56
  57 /*
  58 =for apidoc is_ascii_string
  59
  60 Returns true if first C<len> bytes of the given string are ASCII (i.e. none
  61 of them even raise the question of UTF-8-ness).
  62
  63 See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  64
  65 =cut
  66 */
  67
  68 bool
  69 Perl_is_ascii_string(const U8 *s, STRLEN len)
  70 {
  71     const U8* const send = s + (len ? len : strlen((const char *)s));
  72     const U8* x = s;
  73
  74     PERL_ARGS_ASSERT_IS_ASCII_STRING;
  75
  76     for (; x < send; ++x) {
  77         if (!UTF8_IS_INVARIANT(*x))
  78             break;
  79     }
  80
  81     return x == send;
  82 }
  83
  84 /*
  85 =for apidoc uvuni_to_utf8_flags
  86
  87 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  88 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  89 bytes available. The return value is the pointer to the byte after the
  90 end of the new character. In other words,
  91
  92     d = uvuni_to_utf8_flags(d, uv, flags);
  93
  94 or, in most cases,
  95
  96     d = uvuni_to_utf8(d, uv);
  97
  98 (which is equivalent to)
  99
 100     d = uvuni_to_utf8_flags(d, uv, 0);
 101
 102 is the recommended Unicode-aware way of saying
 103
 104     *(d++) = uv;
 105
 106 =cut
 107 */
 108
 109 U8 *
 110 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 111 {
 112     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
 113
 114     if (ckWARN(WARN_UTF8)) {
 115          if (UNICODE_IS_SURROGATE(uv) &&
 116              !(flags & UNICODE_ALLOW_SURROGATE))
 117               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
 118          else if (
 119                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
 120                     !(flags & UNICODE_ALLOW_FDD0))
 121                    ||
 122                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
 123                     !(flags & UNICODE_ALLOW_FFFF))) &&
 124                   /* UNICODE_ALLOW_SUPER includes
 125                    * FFFEs and FFFFs beyond 0x10FFFF. */
 126                   ((uv <= PERL_UNICODE_MAX) ||
 127                    !(flags & UNICODE_ALLOW_SUPER))
 128                   )
 129               Perl_warner(aTHX_ packWARN(WARN_UTF8),
 130                       "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
 131     }
 132     if (UNI_IS_INVARIANT(uv)) {
 133         *d++ = (U8)UTF_TO_NATIVE(uv);
 134         return d;
 135     }
 136 #if defined(EBCDIC)
 137     else {
 138         STRLEN len  = UNISKIP(uv);
 139         U8 *p = d+len-1;
 140         while (p > d) {
 141             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 142             uv >>= UTF_ACCUMULATION_SHIFT;
 143         }
 144         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 145         return d+len;
 146     }
 147 #else /* Non loop style */
 148     if (uv < 0x800) {
 149         *d++ = (U8)(( uv >>  6)         | 0xc0);
 150         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 151         return d;
 152     }
 153     if (uv < 0x10000) {
 154         *d++ = (U8)(( uv >> 12)         | 0xe0);
 155         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 156         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 157         return d;
 158     }
 159     if (uv < 0x200000) {
 160         *d++ = (U8)(( uv >> 18)         | 0xf0);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166     if (uv < 0x4000000) {
 167         *d++ = (U8)(( uv >> 24)         | 0xf8);
 168         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 169         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 170         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 171         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 172         return d;
 173     }
 174     if (uv < 0x80000000) {
 175         *d++ = (U8)(( uv >> 30)         | 0xfc);
 176         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 177         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 178         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 179         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 180         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 181         return d;
 182     }
 183 #ifdef HAS_QUAD
 184     if (uv < UTF8_QUAD_MAX)
 185 #endif
 186     {
 187         *d++ =                            0xfe; /* Can't match U+FEFF! */
 188         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 189         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 190         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 191         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 192         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 193         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 194         return d;
 195     }
 196 #ifdef HAS_QUAD
 197     {
 198         *d++ =                            0xff;         /* Can't match U+FFFE! */
 199         *d++ =                            0x80;         /* 6 Reserved bits */
 200         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 201         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 202         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 203         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 204         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 205         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 206         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 207         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 208         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 209         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 210         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 211         return d;
 212     }
 213 #endif
 214 #endif /* Loop style */
 215 }
 216
 217 /*
 218
 219 Tests if some arbitrary number of bytes begins in a valid UTF-8
 220 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 221 UTF-8 character.  The actual number of bytes in the UTF-8 character
 222 will be returned if it is valid, otherwise 0.
 223
 224 This is the "slow" version as opposed to the "fast" version which is
 225 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 226 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 227 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 228 you should use the _slow().  In practice this means that the _slow()
 229 will be used very rarely, since the maximum Unicode code point (as of
 230 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 231 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 232 five bytes or more.
 233
 234 =cut */
 235 STATIC STRLEN
 236 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 237 {
 238     U8 u = *s;
 239     STRLEN slen;
 240     UV uv, ouv;
 241
 242     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 243
 244     if (UTF8_IS_INVARIANT(u))
 245         return 1;
 246
 247     if (!UTF8_IS_START(u))
 248         return 0;
 249
 250     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 251         return 0;
 252
 253     slen = len - 1;
 254     s++;
 255 #ifdef EBCDIC
 256     u = NATIVE_TO_UTF(u);
 257 #endif
 258     u &= UTF_START_MASK(len);
 259     uv  = u;
 260     ouv = uv;
 261     while (slen--) {
 262         if (!UTF8_IS_CONTINUATION(*s))
 263             return 0;
 264         uv = UTF8_ACCUMULATE(uv, *s);
 265         if (uv < ouv)
 266             return 0;
 267         ouv = uv;
 268         s++;
 269     }
 270
 271     if ((STRLEN)UNISKIP(uv) < len)
 272         return 0;
 273
 274     return len;
 275 }
 276
 277 /*
 278 =for apidoc is_utf8_char
 279
 280 Tests if some arbitrary number of bytes begins in a valid UTF-8
 281 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
 282 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
 283 character will be returned if it is valid, otherwise 0.
 284
 285 =cut */
 286 STRLEN
 287 Perl_is_utf8_char(const U8 *s)
 288 {
 289     const STRLEN len = UTF8SKIP(s);
 290
 291     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 292 #ifdef IS_UTF8_CHAR
 293     if (IS_UTF8_CHAR_FAST(len))
 294         return IS_UTF8_CHAR(s, len) ? len : 0;
 295 #endif /* #ifdef IS_UTF8_CHAR */
 296     return is_utf8_char_slow(s, len);
 297 }
 298
 299
 300 /*
 301 =for apidoc is_utf8_string
 302
 303 Returns true if first C<len> bytes of the given string form a valid
 304 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 305 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 306 because a valid ASCII string is a valid UTF-8 string.
 307
 308 See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
 309
 310 =cut
 311 */
 312
 313 bool
 314 Perl_is_utf8_string(const U8 *s, STRLEN len)
 315 {
 316     const U8* const send = s + (len ? len : strlen((const char *)s));
 317     const U8* x = s;
 318
 319     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 320
 321     while (x < send) {
 322         STRLEN c;
 323          /* Inline the easy bits of is_utf8_char() here for speed... */
 324          if (UTF8_IS_INVARIANT(*x))
 325               c = 1;
 326          else if (!UTF8_IS_START(*x))
 327              goto out;
 328          else {
 329               /* ... and call is_utf8_char() only if really needed. */
 330 #ifdef IS_UTF8_CHAR
 331              c = UTF8SKIP(x);
 332              if (IS_UTF8_CHAR_FAST(c)) {
 333                  if (!IS_UTF8_CHAR(x, c))
 334                      c = 0;
 335              }
 336              else
 337                 c = is_utf8_char_slow(x, c);
 338 #else
 339              c = is_utf8_char(x);
 340 #endif /* #ifdef IS_UTF8_CHAR */
 341               if (!c)
 342                   goto out;
 343          }
 344         x += c;
 345     }
 346
 347  out:
 348     if (x != send)
 349         return FALSE;
 350
 351     return TRUE;
 352 }
 353
 354 /*
 355 Implemented as a macro in utf8.h
 356
 357 =for apidoc is_utf8_string_loc
 358
 359 Like is_utf8_string() but stores the location of the failure (in the
 360 case of "utf8ness failure") or the location s+len (in the case of
 361 "utf8ness success") in the C<ep>.
 362
 363 See also is_utf8_string_loclen() and is_utf8_string().
 364
 365 =for apidoc is_utf8_string_loclen
 366
 367 Like is_utf8_string() but stores the location of the failure (in the
 368 case of "utf8ness failure") or the location s+len (in the case of
 369 "utf8ness success") in the C<ep>, and the number of UTF-8
 370 encoded characters in the C<el>.
 371
 372 See also is_utf8_string_loc() and is_utf8_string().
 373
 374 =cut
 375 */
 376
 377 bool
 378 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 379 {
 380     const U8* const send = s + (len ? len : strlen((const char *)s));
 381     const U8* x = s;
 382     STRLEN c;
 383     STRLEN outlen = 0;
 384
 385     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 386
 387     while (x < send) {
 388          /* Inline the easy bits of is_utf8_char() here for speed... */
 389          if (UTF8_IS_INVARIANT(*x))
 390              c = 1;
 391          else if (!UTF8_IS_START(*x))
 392              goto out;
 393          else {
 394              /* ... and call is_utf8_char() only if really needed. */
 395 #ifdef IS_UTF8_CHAR
 396              c = UTF8SKIP(x);
 397              if (IS_UTF8_CHAR_FAST(c)) {
 398                  if (!IS_UTF8_CHAR(x, c))
 399                      c = 0;
 400              } else
 401                  c = is_utf8_char_slow(x, c);
 402 #else
 403              c = is_utf8_char(x);
 404 #endif /* #ifdef IS_UTF8_CHAR */
 405              if (!c)
 406                  goto out;
 407          }
 408          x += c;
 409          outlen++;
 410     }
 411
 412  out:
 413     if (el)
 414         *el = outlen;
 415
 416     if (ep)
 417         *ep = x;
 418     return (x == send);
 419 }
 420
 421 /*
 422
 423 =for apidoc utf8n_to_uvuni
 424
 425 Bottom level UTF-8 decode routine.
 426 Returns the Unicode code point value of the first character in the string C<s>
 427 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 428 C<retlen> will be set to the length, in bytes, of that character.
 429
 430 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 431 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 432 it is assumed that the caller will raise a warning, and this function
 433 will silently just set C<retlen> to C<-1> and return zero.  If the
 434 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 435 malformations will be given, C<retlen> will be set to the expected
 436 length of the UTF-8 character in bytes, and zero will be returned.
 437
 438 The C<flags> can also contain various flags to allow deviations from
 439 the strict UTF-8 encoding (see F<utf8.h>).
 440
 441 Most code should use utf8_to_uvchr() rather than call this directly.
 442
 443 =cut
 444 */
 445
 446 UV
 447 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 448 {
 449     dVAR;
 450     const U8 * const s0 = s;
 451     UV uv = *s, ouv = 0;
 452     STRLEN len = 1;
 453     const bool dowarn = ckWARN_d(WARN_UTF8);
 454     const UV startbyte = *s;
 455     STRLEN expectlen = 0;
 456     U32 warning = 0;
 457
 458     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 459
 460 /* This list is a superset of the UTF8_ALLOW_XXX. */
 461
 462 #define UTF8_WARN_EMPTY                          1
 463 #define UTF8_WARN_CONTINUATION                   2
 464 #define UTF8_WARN_NON_CONTINUATION               3
 465 #define UTF8_WARN_FE_FF                          4
 466 #define UTF8_WARN_SHORT                          5
 467 #define UTF8_WARN_OVERFLOW                       6
 468 #define UTF8_WARN_SURROGATE                      7
 469 #define UTF8_WARN_LONG                           8
 470 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 471
 472     if (curlen == 0 &&
 473         !(flags & UTF8_ALLOW_EMPTY)) {
 474         warning = UTF8_WARN_EMPTY;
 475         goto malformed;
 476     }
 477
 478     if (UTF8_IS_INVARIANT(uv)) {
 479         if (retlen)
 480             *retlen = 1;
 481         return (UV) (NATIVE_TO_UTF(*s));
 482     }
 483
 484     if (UTF8_IS_CONTINUATION(uv) &&
 485         !(flags & UTF8_ALLOW_CONTINUATION)) {
 486         warning = UTF8_WARN_CONTINUATION;
 487         goto malformed;
 488     }
 489
 490     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 491         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 492         warning = UTF8_WARN_NON_CONTINUATION;
 493         goto malformed;
 494     }
 495
 496 #ifdef EBCDIC
 497     uv = NATIVE_TO_UTF(uv);
 498 #else
 499     if ((uv == 0xfe || uv == 0xff) &&
 500         !(flags & UTF8_ALLOW_FE_FF)) {
 501         warning = UTF8_WARN_FE_FF;
 502         goto malformed;
 503     }
 504 #endif
 505
 506     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 507     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 508     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 509     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 510 #ifdef EBCDIC
 511     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 512     else                        { len =  7; uv &= 0x01; }
 513 #else
 514     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 515     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 516     else                        { len = 13; uv = 0; } /* whoa! */
 517 #endif
 518
 519     if (retlen)
 520         *retlen = len;
 521
 522     expectlen = len;
 523
 524     if ((curlen < expectlen) &&
 525         !(flags & UTF8_ALLOW_SHORT)) {
 526         warning = UTF8_WARN_SHORT;
 527         goto malformed;
 528     }
 529
 530     len--;
 531     s++;
 532     ouv = uv;
 533
 534     while (len--) {
 535         if (!UTF8_IS_CONTINUATION(*s) &&
 536             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 537             s--;
 538             warning = UTF8_WARN_NON_CONTINUATION;
 539             goto malformed;
 540         }
 541         else
 542             uv = UTF8_ACCUMULATE(uv, *s);
 543         if (!(uv > ouv)) {
 544             /* These cannot be allowed. */
 545             if (uv == ouv) {
 546                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 547                     warning = UTF8_WARN_LONG;
 548                     goto malformed;
 549                 }
 550             }
 551             else { /* uv < ouv */
 552                 /* This cannot be allowed. */
 553                 warning = UTF8_WARN_OVERFLOW;
 554                 goto malformed;
 555             }
 556         }
 557         s++;
 558         ouv = uv;
 559     }
 560
 561     if (UNICODE_IS_SURROGATE(uv) &&
 562         !(flags & UTF8_ALLOW_SURROGATE)) {
 563         warning = UTF8_WARN_SURROGATE;
 564         goto malformed;
 565     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 566                !(flags & UTF8_ALLOW_LONG)) {
 567         warning = UTF8_WARN_LONG;
 568         goto malformed;
 569     } else if (UNICODE_IS_ILLEGAL(uv) &&
 570                !(flags & UTF8_ALLOW_FFFF)) {
 571         warning = UTF8_WARN_FFFF;
 572         goto malformed;
 573     }
 574
 575     return uv;
 576
 577 malformed:
 578
 579     if (flags & UTF8_CHECK_ONLY) {
 580         if (retlen)
 581             *retlen = ((STRLEN) -1);
 582         return 0;
 583     }
 584
 585     if (dowarn) {
 586         SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
 587
 588         switch (warning) {
 589         case 0: /* Intentionally empty. */ break;
 590         case UTF8_WARN_EMPTY:
 591             sv_catpvs(sv, "(empty string)");
 592             break;
 593         case UTF8_WARN_CONTINUATION:
 594             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 595             break;
 596         case UTF8_WARN_NON_CONTINUATION:
 597             if (s == s0)
 598                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 599                            (UV)s[1], startbyte);
 600             else {
 601                 const int len = (int)(s-s0);
 602                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 603                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 604             }
 605
 606             break;
 607         case UTF8_WARN_FE_FF:
 608             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 609             break;
 610         case UTF8_WARN_SHORT:
 611             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 612                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 613             expectlen = curlen;         /* distance for caller to skip */
 614             break;
 615         case UTF8_WARN_OVERFLOW:
 616             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 617                            ouv, *s, startbyte);
 618             break;
 619         case UTF8_WARN_SURROGATE:
 620             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 621             break;
 622         case UTF8_WARN_LONG:
 623             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 624                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 625             break;
 626         case UTF8_WARN_FFFF:
 627             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 628             break;
 629         default:
 630             sv_catpvs(sv, "(unknown reason)");
 631             break;
 632         }
 633
 634         if (warning) {
 635             const char * const s = SvPVX_const(sv);
 636
 637             if (PL_op)
 638                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 639                             "%s in %s", s,  OP_DESC(PL_op));
 640             else
 641                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 642         }
 643     }
 644
 645     if (retlen)
 646         *retlen = expectlen ? expectlen : len;
 647
 648     return 0;
 649 }
 650
 651 /*
 652 =for apidoc utf8_to_uvchr
 653
 654 Returns the native character value of the first character in the string C<s>
 655 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 656 length, in bytes, of that character.
 657
 658 If C<s> does not point to a well-formed UTF-8 character, zero is
 659 returned and retlen is set, if possible, to -1.
 660
 661 =cut
 662 */
 663
 664 UV
 665 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 666 {
 667     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 668
 669     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 670                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 671 }
 672
 673 /*
 674 =for apidoc utf8_to_uvuni
 675
 676 Returns the Unicode code point of the first character in the string C<s>
 677 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 678 length, in bytes, of that character.
 679
 680 This function should only be used when the returned UV is considered
 681 an index into the Unicode semantic tables (e.g. swashes).
 682
 683 If C<s> does not point to a well-formed UTF-8 character, zero is
 684 returned and retlen is set, if possible, to -1.
 685
 686 =cut
 687 */
 688
 689 UV
 690 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 691 {
 692     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 693
 694     /* Call the low level routine asking for checks */
 695     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 696                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 697 }
 698
 699 /*
 700 =for apidoc utf8_length
 701
 702 Return the length of the UTF-8 char encoded string C<s> in characters.
 703 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 704 up past C<e>, croaks.
 705
 706 =cut
 707 */
 708
 709 STRLEN
 710 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 711 {
 712     dVAR;
 713     STRLEN len = 0;
 714
 715     PERL_ARGS_ASSERT_UTF8_LENGTH;
 716
 717     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 718      * the bitops (especially ~) can create illegal UTF-8.
 719      * In other words: in Perl UTF-8 is not just for Unicode. */
 720
 721     if (e < s)
 722         goto warn_and_return;
 723     while (s < e) {
 724         if (!UTF8_IS_INVARIANT(*s))
 725             s += UTF8SKIP(s);
 726         else
 727             s++;
 728         len++;
 729     }
 730
 731     if (e != s) {
 732         len--;
 733         warn_and_return:
 734         if (PL_op)
 735             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
 736                              "%s in %s", unees, OP_DESC(PL_op));
 737         else
 738             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
 739     }
 740
 741     return len;
 742 }
 743
 744 /*
 745 =for apidoc utf8_distance
 746
 747 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 748 and C<b>.
 749
 750 WARNING: use only if you *know* that the pointers point inside the
 751 same UTF-8 buffer.
 752
 753 =cut
 754 */
 755
 756 IV
 757 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 758 {
 759     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 760
 761     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 762 }
 763
 764 /*
 765 =for apidoc utf8_hop
 766
 767 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 768 forward or backward.
 769
 770 WARNING: do not use the following unless you *know* C<off> is within
 771 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 772 on the first byte of character or just after the last byte of a character.
 773
 774 =cut
 775 */
 776
 777 U8 *
 778 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 779 {
 780     PERL_ARGS_ASSERT_UTF8_HOP;
 781
 782     PERL_UNUSED_CONTEXT;
 783     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 784      * the bitops (especially ~) can create illegal UTF-8.
 785      * In other words: in Perl UTF-8 is not just for Unicode. */
 786
 787     if (off >= 0) {
 788         while (off--)
 789             s += UTF8SKIP(s);
 790     }
 791     else {
 792         while (off++) {
 793             s--;
 794             while (UTF8_IS_CONTINUATION(*s))
 795                 s--;
 796         }
 797     }
 798     return (U8 *)s;
 799 }
 800
 801 /*
 802 =for apidoc utf8_to_bytes
 803
 804 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 805 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 806 updates len to contain the new length.
 807 Returns zero on failure, setting C<len> to -1.
 808
 809 If you need a copy of the string, see C<bytes_from_utf8>.
 810
 811 =cut
 812 */
 813
 814 U8 *
 815 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 816 {
 817     U8 * const save = s;
 818     U8 * const send = s + *len;
 819     U8 *d;
 820
 821     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
 822
 823     /* ensure valid UTF-8 and chars < 256 before updating string */
 824     while (s < send) {
 825         U8 c = *s++;
 826
 827         if (!UTF8_IS_INVARIANT(c) &&
 828             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 829              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 830             *len = ((STRLEN) -1);
 831             return 0;
 832         }
 833     }
 834
 835     d = s = save;
 836     while (s < send) {
 837         STRLEN ulen;
 838         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 839         s += ulen;
 840     }
 841     *d = '\0';
 842     *len = d - save;
 843     return save;
 844 }
 845
 846 /*
 847 =for apidoc bytes_from_utf8
 848
 849 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 850 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 851 the newly-created string, and updates C<len> to contain the new
 852 length.  Returns the original string if no conversion occurs, C<len>
 853 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 854 0 if C<s> is converted or consisted entirely of characters that are invariant
 855 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
 856
 857 =cut
 858 */
 859
 860 U8 *
 861 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 862 {
 863     U8 *d;
 864     const U8 *start = s;
 865     const U8 *send;
 866     I32 count = 0;
 867
 868     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
 869
 870     PERL_UNUSED_CONTEXT;
 871     if (!*is_utf8)
 872         return (U8 *)start;
 873
 874     /* ensure valid UTF-8 and chars < 256 before converting string */
 875     for (send = s + *len; s < send;) {
 876         U8 c = *s++;
 877         if (!UTF8_IS_INVARIANT(c)) {
 878             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 879                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 880                 count++;
 881             else
 882                 return (U8 *)start;
 883         }
 884     }
 885
 886     *is_utf8 = FALSE;
 887
 888     Newx(d, (*len) - count + 1, U8);
 889     s = start; start = d;
 890     while (s < send) {
 891         U8 c = *s++;
 892         if (!UTF8_IS_INVARIANT(c)) {
 893             /* Then it is two-byte encoded */
 894             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 895             c = ASCII_TO_NATIVE(c);
 896         }
 897         *d++ = c;
 898     }
 899     *d = '\0';
 900     *len = d - start;
 901     return (U8 *)start;
 902 }
 903
 904 /*
 905 =for apidoc bytes_to_utf8
 906
 907 Converts a string C<s> of length C<len> from the native encoding into UTF-8.
 908 Returns a pointer to the newly-created string, and sets C<len> to
 909 reflect the new length.
 910
 911 A NUL character will be written after the end of the string.
 912
 913 If you want to convert to UTF-8 from encodings other than
 914 the native (Latin1 or EBCDIC),
 915 see sv_recode_to_utf8().
 916
 917 =cut
 918 */
 919
 920 U8*
 921 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 922 {
 923     const U8 * const send = s + (*len);
 924     U8 *d;
 925     U8 *dst;
 926
 927     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
 928     PERL_UNUSED_CONTEXT;
 929
 930     Newx(d, (*len) * 2 + 1, U8);
 931     dst = d;
 932
 933     while (s < send) {
 934         const UV uv = NATIVE_TO_ASCII(*s++);
 935         if (UNI_IS_INVARIANT(uv))
 936             *d++ = (U8)UTF_TO_NATIVE(uv);
 937         else {
 938             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 939             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 940         }
 941     }
 942     *d = '\0';
 943     *len = d-dst;
 944     return dst;
 945 }
 946
 947 /*
 948  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 949  *
 950  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 951  * We optimize for native, for obvious reasons. */
 952
 953 U8*
 954 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 955 {
 956     U8* pend;
 957     U8* dstart = d;
 958
 959     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
 960
 961     if (bytelen & 1)
 962         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
 963
 964     pend = p + bytelen;
 965
 966     while (p < pend) {
 967         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 968         p += 2;
 969         if (uv < 0x80) {
 970 #ifdef EBCDIC
 971             *d++ = UNI_TO_NATIVE(uv);
 972 #else
 973             *d++ = (U8)uv;
 974 #endif
 975             continue;
 976         }
 977         if (uv < 0x800) {
 978             *d++ = (U8)(( uv >>  6)         | 0xc0);
 979             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 980             continue;
 981         }
 982         if (uv >= 0xd800 && uv <= 0xdbff) {     /* surrogates */
 983             if (p >= pend) {
 984                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 985             } else {
 986                 UV low = (p[0] << 8) + p[1];
 987                 p += 2;
 988                 if (low < 0xdc00 || low > 0xdfff)
 989                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 990                 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 991             }
 992         } else if (uv >= 0xdc00 && uv <= 0xdfff) {
 993             Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 994         }
 995         if (uv < 0x10000) {
 996             *d++ = (U8)(( uv >> 12)         | 0xe0);
 997             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 998             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 999             continue;
1000         }
1001         else {
1002             *d++ = (U8)(( uv >> 18)         | 0xf0);
1003             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1004             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1005             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1006             continue;
1007         }
1008     }
1009     *newlen = d - dstart;
1010     return d;
1011 }
1012
1013 /* Note: this one is slightly destructive of the source. */
1014
1015 U8*
1016 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1017 {
1018     U8* s = (U8*)p;
1019     U8* const send = s + bytelen;
1020
1021     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1022
1023     if (bytelen & 1)
1024         Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1025                    (UV)bytelen);
1026
1027     while (s < send) {
1028         const U8 tmp = s[0];
1029         s[0] = s[1];
1030         s[1] = tmp;
1031         s += 2;
1032     }
1033     return utf16_to_utf8(p, d, bytelen, newlen);
1034 }
1035
1036 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1037
1038 bool
1039 Perl_is_uni_alnum(pTHX_ UV c)
1040 {
1041     U8 tmpbuf[UTF8_MAXBYTES+1];
1042     uvchr_to_utf8(tmpbuf, c);
1043     return is_utf8_alnum(tmpbuf);
1044 }
1045
1046 bool
1047 Perl_is_uni_idfirst(pTHX_ UV c)
1048 {
1049     U8 tmpbuf[UTF8_MAXBYTES+1];
1050     uvchr_to_utf8(tmpbuf, c);
1051     return is_utf8_idfirst(tmpbuf);
1052 }
1053
1054 bool
1055 Perl_is_uni_alpha(pTHX_ UV c)
1056 {
1057     U8 tmpbuf[UTF8_MAXBYTES+1];
1058     uvchr_to_utf8(tmpbuf, c);
1059     return is_utf8_alpha(tmpbuf);
1060 }
1061
1062 bool
1063 Perl_is_uni_ascii(pTHX_ UV c)
1064 {
1065     U8 tmpbuf[UTF8_MAXBYTES+1];
1066     uvchr_to_utf8(tmpbuf, c);
1067     return is_utf8_ascii(tmpbuf);
1068 }
1069
1070 bool
1071 Perl_is_uni_space(pTHX_ UV c)
1072 {
1073     U8 tmpbuf[UTF8_MAXBYTES+1];
1074     uvchr_to_utf8(tmpbuf, c);
1075     return is_utf8_space(tmpbuf);
1076 }
1077
1078 bool
1079 Perl_is_uni_digit(pTHX_ UV c)
1080 {
1081     U8 tmpbuf[UTF8_MAXBYTES+1];
1082     uvchr_to_utf8(tmpbuf, c);
1083     return is_utf8_digit(tmpbuf);
1084 }
1085
1086 bool
1087 Perl_is_uni_upper(pTHX_ UV c)
1088 {
1089     U8 tmpbuf[UTF8_MAXBYTES+1];
1090     uvchr_to_utf8(tmpbuf, c);
1091     return is_utf8_upper(tmpbuf);
1092 }
1093
1094 bool
1095 Perl_is_uni_lower(pTHX_ UV c)
1096 {
1097     U8 tmpbuf[UTF8_MAXBYTES+1];
1098     uvchr_to_utf8(tmpbuf, c);
1099     return is_utf8_lower(tmpbuf);
1100 }
1101
1102 bool
1103 Perl_is_uni_cntrl(pTHX_ UV c)
1104 {
1105     U8 tmpbuf[UTF8_MAXBYTES+1];
1106     uvchr_to_utf8(tmpbuf, c);
1107     return is_utf8_cntrl(tmpbuf);
1108 }
1109
1110 bool
1111 Perl_is_uni_graph(pTHX_ UV c)
1112 {
1113     U8 tmpbuf[UTF8_MAXBYTES+1];
1114     uvchr_to_utf8(tmpbuf, c);
1115     return is_utf8_graph(tmpbuf);
1116 }
1117
1118 bool
1119 Perl_is_uni_print(pTHX_ UV c)
1120 {
1121     U8 tmpbuf[UTF8_MAXBYTES+1];
1122     uvchr_to_utf8(tmpbuf, c);
1123     return is_utf8_print(tmpbuf);
1124 }
1125
1126 bool
1127 Perl_is_uni_punct(pTHX_ UV c)
1128 {
1129     U8 tmpbuf[UTF8_MAXBYTES+1];
1130     uvchr_to_utf8(tmpbuf, c);
1131     return is_utf8_punct(tmpbuf);
1132 }
1133
1134 bool
1135 Perl_is_uni_xdigit(pTHX_ UV c)
1136 {
1137     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1138     uvchr_to_utf8(tmpbuf, c);
1139     return is_utf8_xdigit(tmpbuf);
1140 }
1141
1142 UV
1143 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1144 {
1145     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1146
1147     uvchr_to_utf8(p, c);
1148     return to_utf8_upper(p, p, lenp);
1149 }
1150
1151 UV
1152 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1153 {
1154     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1155
1156     uvchr_to_utf8(p, c);
1157     return to_utf8_title(p, p, lenp);
1158 }
1159
1160 UV
1161 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1162 {
1163     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1164
1165     uvchr_to_utf8(p, c);
1166     return to_utf8_lower(p, p, lenp);
1167 }
1168
1169 UV
1170 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1171 {
1172     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1173
1174     uvchr_to_utf8(p, c);
1175     return to_utf8_fold(p, p, lenp);
1176 }
1177
1178 /* for now these all assume no locale info available for Unicode > 255 */
1179
1180 bool
1181 Perl_is_uni_alnum_lc(pTHX_ UV c)
1182 {
1183     return is_uni_alnum(c);     /* XXX no locale support yet */
1184 }
1185
1186 bool
1187 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1188 {
1189     return is_uni_idfirst(c);   /* XXX no locale support yet */
1190 }
1191
1192 bool
1193 Perl_is_uni_alpha_lc(pTHX_ UV c)
1194 {
1195     return is_uni_alpha(c);     /* XXX no locale support yet */
1196 }
1197
1198 bool
1199 Perl_is_uni_ascii_lc(pTHX_ UV c)
1200 {
1201     return is_uni_ascii(c);     /* XXX no locale support yet */
1202 }
1203
1204 bool
1205 Perl_is_uni_space_lc(pTHX_ UV c)
1206 {
1207     return is_uni_space(c);     /* XXX no locale support yet */
1208 }
1209
1210 bool
1211 Perl_is_uni_digit_lc(pTHX_ UV c)
1212 {
1213     return is_uni_digit(c);     /* XXX no locale support yet */
1214 }
1215
1216 bool
1217 Perl_is_uni_upper_lc(pTHX_ UV c)
1218 {
1219     return is_uni_upper(c);     /* XXX no locale support yet */
1220 }
1221
1222 bool
1223 Perl_is_uni_lower_lc(pTHX_ UV c)
1224 {
1225     return is_uni_lower(c);     /* XXX no locale support yet */
1226 }
1227
1228 bool
1229 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1230 {
1231     return is_uni_cntrl(c);     /* XXX no locale support yet */
1232 }
1233
1234 bool
1235 Perl_is_uni_graph_lc(pTHX_ UV c)
1236 {
1237     return is_uni_graph(c);     /* XXX no locale support yet */
1238 }
1239
1240 bool
1241 Perl_is_uni_print_lc(pTHX_ UV c)
1242 {
1243     return is_uni_print(c);     /* XXX no locale support yet */
1244 }
1245
1246 bool
1247 Perl_is_uni_punct_lc(pTHX_ UV c)
1248 {
1249     return is_uni_punct(c);     /* XXX no locale support yet */
1250 }
1251
1252 bool
1253 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1254 {
1255     return is_uni_xdigit(c);    /* XXX no locale support yet */
1256 }
1257
1258 U32
1259 Perl_to_uni_upper_lc(pTHX_ U32 c)
1260 {
1261     /* XXX returns only the first character -- do not use XXX */
1262     /* XXX no locale support yet */
1263     STRLEN len;
1264     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1265     return (U32)to_uni_upper(c, tmpbuf, &len);
1266 }
1267
1268 U32
1269 Perl_to_uni_title_lc(pTHX_ U32 c)
1270 {
1271     /* XXX returns only the first character XXX -- do not use XXX */
1272     /* XXX no locale support yet */
1273     STRLEN len;
1274     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1275     return (U32)to_uni_title(c, tmpbuf, &len);
1276 }
1277
1278 U32
1279 Perl_to_uni_lower_lc(pTHX_ U32 c)
1280 {
1281     /* XXX returns only the first character -- do not use XXX */
1282     /* XXX no locale support yet */
1283     STRLEN len;
1284     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1285     return (U32)to_uni_lower(c, tmpbuf, &len);
1286 }
1287
1288 static bool
1289 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1290                  const char *const swashname)
1291 {
1292     dVAR;
1293
1294     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1295
1296     if (!is_utf8_char(p))
1297         return FALSE;
1298     if (!*swash)
1299         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1300     return swash_fetch(*swash, p, TRUE) != 0;
1301 }
1302
1303 bool
1304 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1305 {
1306     dVAR;
1307
1308     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1309
1310     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1311      * descendant of isalnum(3), in other words, it doesn't
1312      * contain the '_'. --jhi */
1313     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1314 }
1315
1316 bool
1317 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1318 {
1319     dVAR;
1320
1321     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1322
1323     if (*p == '_')
1324         return TRUE;
1325     /* is_utf8_idstart would be more logical. */
1326     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1327 }
1328
1329 bool
1330 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1331 {
1332     dVAR;
1333
1334     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1335
1336     if (*p == '_')
1337         return TRUE;
1338     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1339 }
1340
1341 bool
1342 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1343 {
1344     dVAR;
1345
1346     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1347
1348     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1349 }
1350
1351 bool
1352 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1353 {
1354     dVAR;
1355
1356     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1357
1358     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1359 }
1360
1361 bool
1362 Perl_is_utf8_space(pTHX_ const U8 *p)
1363 {
1364     dVAR;
1365
1366     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1367
1368     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1369 }
1370
1371 bool
1372 Perl_is_utf8_perl_space(pTHX_ const U8 *p)
1373 {
1374     dVAR;
1375
1376     PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
1377
1378     return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
1379 }
1380
1381 bool
1382 Perl_is_utf8_perl_word(pTHX_ const U8 *p)
1383 {
1384     dVAR;
1385
1386     PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
1387
1388     return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
1389 }
1390
1391 bool
1392 Perl_is_utf8_digit(pTHX_ const U8 *p)
1393 {
1394     dVAR;
1395
1396     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1397
1398     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1399 }
1400
1401 bool
1402 Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
1403 {
1404     dVAR;
1405
1406     PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
1407
1408     return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
1409 }
1410
1411 bool
1412 Perl_is_utf8_upper(pTHX_ const U8 *p)
1413 {
1414     dVAR;
1415
1416     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1417
1418     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1419 }
1420
1421 bool
1422 Perl_is_utf8_lower(pTHX_ const U8 *p)
1423 {
1424     dVAR;
1425
1426     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1427
1428     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1429 }
1430
1431 bool
1432 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1433 {
1434     dVAR;
1435
1436     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1437
1438     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1439 }
1440
1441 bool
1442 Perl_is_utf8_graph(pTHX_ const U8 *p)
1443 {
1444     dVAR;
1445
1446     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1447
1448     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1449 }
1450
1451 bool
1452 Perl_is_utf8_print(pTHX_ const U8 *p)
1453 {
1454     dVAR;
1455
1456     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1457
1458     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1459 }
1460
1461 bool
1462 Perl_is_utf8_punct(pTHX_ const U8 *p)
1463 {
1464     dVAR;
1465
1466     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1467
1468     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1469 }
1470
1471 bool
1472 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1473 {
1474     dVAR;
1475
1476     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1477
1478     return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
1479 }
1480
1481 bool
1482 Perl_is_utf8_mark(pTHX_ const U8 *p)
1483 {
1484     dVAR;
1485
1486     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1487
1488     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1489 }
1490
1491 /*
1492 =for apidoc to_utf8_case
1493
1494 The "p" contains the pointer to the UTF-8 string encoding
1495 the character that is being converted.
1496
1497 The "ustrp" is a pointer to the character buffer to put the
1498 conversion result to.  The "lenp" is a pointer to the length
1499 of the result.
1500
1501 The "swashp" is a pointer to the swash to use.
1502
1503 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1504 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1505 but not always, a multicharacter mapping), is tried first.
1506
1507 The "special" is a string like "utf8::ToSpecLower", which means the
1508 hash %utf8::ToSpecLower.  The access to the hash is through
1509 Perl_to_utf8_case().
1510
1511 The "normal" is a string like "ToLower" which means the swash
1512 %utf8::ToLower.
1513
1514 =cut */
1515
1516 UV
1517 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1518                         SV **swashp, const char *normal, const char *special)
1519 {
1520     dVAR;
1521     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1522     STRLEN len = 0;
1523     const UV uv0 = utf8_to_uvchr(p, NULL);
1524     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1525      * are necessary in EBCDIC, they are redundant no-ops
1526      * in ASCII-ish platforms, and hopefully optimized away. */
1527     const UV uv1 = NATIVE_TO_UNI(uv0);
1528
1529     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1530
1531     uvuni_to_utf8(tmpbuf, uv1);
1532
1533     if (!*swashp) /* load on-demand */
1534          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1535
1536     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1537     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1538          /* It might be "special" (sometimes, but not always,
1539           * a multicharacter mapping) */
1540          HV * const hv = get_hv(special, 0);
1541          SV **svp;
1542
1543          if (hv &&
1544              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1545              (*svp)) {
1546              const char *s;
1547
1548               s = SvPV_const(*svp, len);
1549               if (len == 1)
1550                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1551               else {
1552 #ifdef EBCDIC
1553                    /* If we have EBCDIC we need to remap the characters
1554                     * since any characters in the low 256 are Unicode
1555                     * code points, not EBCDIC. */
1556                    U8 *t = (U8*)s, *tend = t + len, *d;
1557
1558                    d = tmpbuf;
1559                    if (SvUTF8(*svp)) {
1560                         STRLEN tlen = 0;
1561
1562                         while (t < tend) {
1563                              const UV c = utf8_to_uvchr(t, &tlen);
1564                              if (tlen > 0) {
1565                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1566                                   t += tlen;
1567                              }
1568                              else
1569                                   break;
1570                         }
1571                    }
1572                    else {
1573                         while (t < tend) {
1574                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1575                              t++;
1576                         }
1577                    }
1578                    len = d - tmpbuf;
1579                    Copy(tmpbuf, ustrp, len, U8);
1580 #else
1581                    Copy(s, ustrp, len, U8);
1582 #endif
1583               }
1584          }
1585     }
1586
1587     if (!len && *swashp) {
1588         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1589
1590          if (uv2) {
1591               /* It was "normal" (a single character mapping). */
1592               const UV uv3 = UNI_TO_NATIVE(uv2);
1593               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1594          }
1595     }
1596
1597     if (!len) /* Neither: just copy. */
1598          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1599
1600     if (lenp)
1601          *lenp = len;
1602
1603     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1604 }
1605
1606 /*
1607 =for apidoc to_utf8_upper
1608
1609 Convert the UTF-8 encoded character at p to its uppercase version and
1610 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1611 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1612 the uppercase version may be longer than the original character.
1613
1614 The first character of the uppercased version is returned
1615 (but note, as explained above, that there may be more.)
1616
1617 =cut */
1618
1619 UV
1620 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1621 {
1622     dVAR;
1623
1624     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1625
1626     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1627                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1628 }
1629
1630 /*
1631 =for apidoc to_utf8_title
1632
1633 Convert the UTF-8 encoded character at p to its titlecase version and
1634 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1635 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1636 titlecase version may be longer than the original character.
1637
1638 The first character of the titlecased version is returned
1639 (but note, as explained above, that there may be more.)
1640
1641 =cut */
1642
1643 UV
1644 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1645 {
1646     dVAR;
1647
1648     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1649
1650     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1651                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1652 }
1653
1654 /*
1655 =for apidoc to_utf8_lower
1656
1657 Convert the UTF-8 encoded character at p to its lowercase version and
1658 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1659 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1660 lowercase version may be longer than the original character.
1661
1662 The first character of the lowercased version is returned
1663 (but note, as explained above, that there may be more.)
1664
1665 =cut */
1666
1667 UV
1668 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1669 {
1670     dVAR;
1671
1672     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1673
1674     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1675                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1676 }
1677
1678 /*
1679 =for apidoc to_utf8_fold
1680
1681 Convert the UTF-8 encoded character at p to its foldcase version and
1682 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1683 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1684 foldcase version may be longer than the original character (up to
1685 three characters).
1686
1687 The first character of the foldcased version is returned
1688 (but note, as explained above, that there may be more.)
1689
1690 =cut */
1691
1692 UV
1693 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1694 {
1695     dVAR;
1696
1697     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1698
1699     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1700                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1701 }
1702
1703 /* Note:
1704  * A "swash" is a swatch hash.
1705  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1706  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1707  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1708  */
1709 SV*
1710 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1711 {
1712     dVAR;
1713     SV* retval;
1714     dSP;
1715     const size_t pkg_len = strlen(pkg);
1716     const size_t name_len = strlen(name);
1717     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1718     SV* errsv_save;
1719
1720     PERL_ARGS_ASSERT_SWASH_INIT;
1721
1722     PUSHSTACKi(PERLSI_MAGIC);
1723     ENTER;
1724     SAVEI32(PL_hints);
1725     PL_hints = 0;
1726     save_re_context();
1727     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1728         ENTER;
1729         errsv_save = newSVsv(ERRSV);
1730         /* It is assumed that callers of this routine are not passing in any
1731            user derived data.  */
1732         /* Need to do this after save_re_context() as it will set PL_tainted to
1733            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1734            Even line to create errsv_save can turn on PL_tainted.  */
1735         SAVEBOOL(PL_tainted);
1736         PL_tainted = 0;
1737         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1738                          NULL);
1739         if (!SvTRUE(ERRSV))
1740             sv_setsv(ERRSV, errsv_save);
1741         SvREFCNT_dec(errsv_save);
1742         LEAVE;
1743     }
1744     SPAGAIN;
1745     PUSHMARK(SP);
1746     EXTEND(SP,5);
1747     mPUSHp(pkg, pkg_len);
1748     mPUSHp(name, name_len);
1749     PUSHs(listsv);
1750     mPUSHi(minbits);
1751     mPUSHi(none);
1752     PUTBACK;
1753     errsv_save = newSVsv(ERRSV);
1754     if (call_method("SWASHNEW", G_SCALAR))
1755         retval = newSVsv(*PL_stack_sp--);
1756     else
1757         retval = &PL_sv_undef;
1758     if (!SvTRUE(ERRSV))
1759         sv_setsv(ERRSV, errsv_save);
1760     SvREFCNT_dec(errsv_save);
1761     LEAVE;
1762     POPSTACK;
1763     if (IN_PERL_COMPILETIME) {
1764         CopHINTS_set(PL_curcop, PL_hints);
1765     }
1766     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1767         if (SvPOK(retval))
1768             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1769                        SVfARG(retval));
1770         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1771     }
1772     return retval;
1773 }
1774
1775
1776 /* This API is wrong for special case conversions since we may need to
1777  * return several Unicode characters for a single Unicode character
1778  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1779  * the lower-level routine, and it is similarly broken for returning
1780  * multiple values.  --jhi */
1781 /* Now SWASHGET is recasted into S_swash_get in this file. */
1782
1783 /* Note:
1784  * Returns the value of property/mapping C<swash> for the first character
1785  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1786  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1787  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1788  */
1789 UV
1790 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1791 {
1792     dVAR;
1793     HV *const hv = MUTABLE_HV(SvRV(swash));
1794     U32 klen;
1795     U32 off;
1796     STRLEN slen;
1797     STRLEN needents;
1798     const U8 *tmps = NULL;
1799     U32 bit;
1800     SV *swatch;
1801     U8 tmputf8[2];
1802     const UV c = NATIVE_TO_ASCII(*ptr);
1803
1804     PERL_ARGS_ASSERT_SWASH_FETCH;
1805
1806     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1807         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1808         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1809         ptr = tmputf8;
1810     }
1811     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1812      * then the "swatch" is a vec() for al the chars which start
1813      * with 0xAA..0xYY
1814      * So the key in the hash (klen) is length of encoded char -1
1815      */
1816     klen = UTF8SKIP(ptr) - 1;
1817     off  = ptr[klen];
1818
1819     if (klen == 0) {
1820       /* If char in invariant then swatch is for all the invariant chars
1821        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1822        */
1823         needents = UTF_CONTINUATION_MARK;
1824         off      = NATIVE_TO_UTF(ptr[klen]);
1825     }
1826     else {
1827       /* If char is encoded then swatch is for the prefix */
1828         needents = (1 << UTF_ACCUMULATION_SHIFT);
1829         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1830     }
1831
1832     /*
1833      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1834      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1835      * it's nothing to sniff at.)  Pity we usually come through at least
1836      * two function calls to get here...
1837      *
1838      * NB: this code assumes that swatches are never modified, once generated!
1839      */
1840
1841     if (hv   == PL_last_swash_hv &&
1842         klen == PL_last_swash_klen &&
1843         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1844     {
1845         tmps = PL_last_swash_tmps;
1846         slen = PL_last_swash_slen;
1847     }
1848     else {
1849         /* Try our second-level swatch cache, kept in a hash. */
1850         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1851
1852         /* If not cached, generate it via swash_get */
1853         if (!svp || !SvPOK(*svp)
1854                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1855             /* We use utf8n_to_uvuni() as we want an index into
1856                Unicode tables, not a native character number.
1857              */
1858             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1859                                            ckWARN(WARN_UTF8) ?
1860                                            0 : UTF8_ALLOW_ANY);
1861             swatch = swash_get(swash,
1862                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1863                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1864                                 needents);
1865
1866             if (IN_PERL_COMPILETIME)
1867                 CopHINTS_set(PL_curcop, PL_hints);
1868
1869             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1870
1871             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1872                      || (slen << 3) < needents)
1873                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1874         }
1875
1876         PL_last_swash_hv = hv;
1877         assert(klen <= sizeof(PL_last_swash_key));
1878         PL_last_swash_klen = (U8)klen;
1879         /* FIXME change interpvar.h?  */
1880         PL_last_swash_tmps = (U8 *) tmps;
1881         PL_last_swash_slen = slen;
1882         if (klen)
1883             Copy(ptr, PL_last_swash_key, klen, U8);
1884     }
1885
1886     switch ((int)((slen << 3) / needents)) {
1887     case 1:
1888         bit = 1 << (off & 7);
1889         off >>= 3;
1890         return (tmps[off] & bit) != 0;
1891     case 8:
1892         return tmps[off];
1893     case 16:
1894         off <<= 1;
1895         return (tmps[off] << 8) + tmps[off + 1] ;
1896     case 32:
1897         off <<= 2;
1898         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1899     }
1900     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1901     NORETURN_FUNCTION_END;
1902 }
1903
1904 /* Note:
1905  * Returns a swatch (a bit vector string) for a code point sequence
1906  * that starts from the value C<start> and comprises the number C<span>.
1907  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1908  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1909  */
1910 STATIC SV*
1911 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1912 {
1913     SV *swatch;
1914     U8 *l, *lend, *x, *xend, *s;
1915     STRLEN lcur, xcur, scur;
1916     HV *const hv = MUTABLE_HV(SvRV(swash));
1917     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1918     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1919     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1920     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1921     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1922     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1923     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1924     const STRLEN bits  = SvUV(*bitssvp);
1925     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1926     const UV     none  = SvUV(*nonesvp);
1927     const UV     end   = start + span;
1928
1929     PERL_ARGS_ASSERT_SWASH_GET;
1930
1931     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1932         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1933                                                  (UV)bits);
1934     }
1935
1936     /* create and initialize $swatch */
1937     scur   = octets ? (span * octets) : (span + 7) / 8;
1938     swatch = newSV(scur);
1939     SvPOK_on(swatch);
1940     s = (U8*)SvPVX(swatch);
1941     if (octets && none) {
1942         const U8* const e = s + scur;
1943         while (s < e) {
1944             if (bits == 8)
1945                 *s++ = (U8)(none & 0xff);
1946             else if (bits == 16) {
1947                 *s++ = (U8)((none >>  8) & 0xff);
1948                 *s++ = (U8)( none        & 0xff);
1949             }
1950             else if (bits == 32) {
1951                 *s++ = (U8)((none >> 24) & 0xff);
1952                 *s++ = (U8)((none >> 16) & 0xff);
1953                 *s++ = (U8)((none >>  8) & 0xff);
1954                 *s++ = (U8)( none        & 0xff);
1955             }
1956         }
1957         *s = '\0';
1958     }
1959     else {
1960         (void)memzero((U8*)s, scur + 1);
1961     }
1962     SvCUR_set(swatch, scur);
1963     s = (U8*)SvPVX(swatch);
1964
1965     /* read $swash->{LIST} */
1966     l = (U8*)SvPV(*listsvp, lcur);
1967     lend = l + lcur;
1968     while (l < lend) {
1969         UV min, max, val;
1970         STRLEN numlen;
1971         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1972
1973         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1974
1975         numlen = lend - l;
1976         min = grok_hex((char *)l, &numlen, &flags, NULL);
1977         if (numlen)
1978             l += numlen;
1979         else if (nl) {
1980             l = nl + 1; /* 1 is length of "\n" */
1981             continue;
1982         }
1983         else {
1984             l = lend; /* to LIST's end at which \n is not found */
1985             break;
1986         }
1987
1988         if (isBLANK(*l)) {
1989             ++l;
1990             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1991             numlen = lend - l;
1992             max = grok_hex((char *)l, &numlen, &flags, NULL);
1993             if (numlen)
1994                 l += numlen;
1995             else
1996                 max = min;
1997
1998             if (octets) {
1999                 if (isBLANK(*l)) {
2000                     ++l;
2001                     flags = PERL_SCAN_SILENT_ILLDIGIT |
2002                             PERL_SCAN_DISALLOW_PREFIX;
2003                     numlen = lend - l;
2004                     val = grok_hex((char *)l, &numlen, &flags, NULL);
2005                     if (numlen)
2006                         l += numlen;
2007                     else
2008                         val = 0;
2009                 }
2010                 else {
2011                     val = 0;
2012                     if (typeto) {
2013                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2014                                          typestr, l);
2015                     }
2016                 }
2017             }
2018             else
2019                 val = 0; /* bits == 1, then val should be ignored */
2020         }
2021         else {
2022             max = min;
2023             if (octets) {
2024                 val = 0;
2025                 if (typeto) {
2026                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2027                 }
2028             }
2029             else
2030                 val = 0; /* bits == 1, then val should be ignored */
2031         }
2032
2033         if (nl)
2034             l = nl + 1;
2035         else
2036             l = lend;
2037
2038         if (max < start)
2039             continue;
2040
2041         if (octets) {
2042             UV key;
2043             if (min < start) {
2044                 if (!none || val < none) {
2045                     val += start - min;
2046                 }
2047                 min = start;
2048             }
2049             for (key = min; key <= max; key++) {
2050                 STRLEN offset;
2051                 if (key >= end)
2052                     goto go_out_list;
2053                 /* offset must be non-negative (start <= min <= key < end) */
2054                 offset = octets * (key - start);
2055                 if (bits == 8)
2056                     s[offset] = (U8)(val & 0xff);
2057                 else if (bits == 16) {
2058                     s[offset    ] = (U8)((val >>  8) & 0xff);
2059                     s[offset + 1] = (U8)( val        & 0xff);
2060                 }
2061                 else if (bits == 32) {
2062                     s[offset    ] = (U8)((val >> 24) & 0xff);
2063                     s[offset + 1] = (U8)((val >> 16) & 0xff);
2064                     s[offset + 2] = (U8)((val >>  8) & 0xff);
2065                     s[offset + 3] = (U8)( val        & 0xff);
2066                 }
2067
2068                 if (!none || val < none)
2069                     ++val;
2070             }
2071         }
2072         else { /* bits == 1, then val should be ignored */
2073             UV key;
2074             if (min < start)
2075                 min = start;
2076             for (key = min; key <= max; key++) {
2077                 const STRLEN offset = (STRLEN)(key - start);
2078                 if (key >= end)
2079                     goto go_out_list;
2080                 s[offset >> 3] |= 1 << (offset & 7);
2081             }
2082         }
2083     } /* while */
2084   go_out_list:
2085
2086     /* read $swash->{EXTRAS} */
2087     x = (U8*)SvPV(*extssvp, xcur);
2088     xend = x + xcur;
2089     while (x < xend) {
2090         STRLEN namelen;
2091         U8 *namestr;
2092         SV** othersvp;
2093         HV* otherhv;
2094         STRLEN otherbits;
2095         SV **otherbitssvp, *other;
2096         U8 *s, *o, *nl;
2097         STRLEN slen, olen;
2098
2099         const U8 opc = *x++;
2100         if (opc == '\n')
2101             continue;
2102
2103         nl = (U8*)memchr(x, '\n', xend - x);
2104
2105         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2106             if (nl) {
2107                 x = nl + 1; /* 1 is length of "\n" */
2108                 continue;
2109             }
2110             else {
2111                 x = xend; /* to EXTRAS' end at which \n is not found */
2112                 break;
2113             }
2114         }
2115
2116         namestr = x;
2117         if (nl) {
2118             namelen = nl - namestr;
2119             x = nl + 1;
2120         }
2121         else {
2122             namelen = xend - namestr;
2123             x = xend;
2124         }
2125
2126         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2127         otherhv = MUTABLE_HV(SvRV(*othersvp));
2128         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2129         otherbits = (STRLEN)SvUV(*otherbitssvp);
2130         if (bits < otherbits)
2131             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2132
2133         /* The "other" swatch must be destroyed after. */
2134         other = swash_get(*othersvp, start, span);
2135         o = (U8*)SvPV(other, olen);
2136
2137         if (!olen)
2138             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2139
2140         s = (U8*)SvPV(swatch, slen);
2141         if (bits == 1 && otherbits == 1) {
2142             if (slen != olen)
2143                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2144
2145             switch (opc) {
2146             case '+':
2147                 while (slen--)
2148                     *s++ |= *o++;
2149                 break;
2150             case '!':
2151                 while (slen--)
2152                     *s++ |= ~*o++;
2153                 break;
2154             case '-':
2155                 while (slen--)
2156                     *s++ &= ~*o++;
2157                 break;
2158             case '&':
2159                 while (slen--)
2160                     *s++ &= *o++;
2161                 break;
2162             default:
2163                 break;
2164             }
2165         }
2166         else {
2167             STRLEN otheroctets = otherbits >> 3;
2168             STRLEN offset = 0;
2169             U8* const send = s + slen;
2170
2171             while (s < send) {
2172                 UV otherval = 0;
2173
2174                 if (otherbits == 1) {
2175                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2176                     ++offset;
2177                 }
2178                 else {
2179                     STRLEN vlen = otheroctets;
2180                     otherval = *o++;
2181                     while (--vlen) {
2182                         otherval <<= 8;
2183                         otherval |= *o++;
2184                     }
2185                 }
2186
2187                 if (opc == '+' && otherval)
2188                     NOOP;   /* replace with otherval */
2189                 else if (opc == '!' && !otherval)
2190                     otherval = 1;
2191                 else if (opc == '-' && otherval)
2192                     otherval = 0;
2193                 else if (opc == '&' && !otherval)
2194                     otherval = 0;
2195                 else {
2196                     s += octets; /* no replacement */
2197                     continue;
2198                 }
2199
2200                 if (bits == 8)
2201                     *s++ = (U8)( otherval & 0xff);
2202                 else if (bits == 16) {
2203                     *s++ = (U8)((otherval >>  8) & 0xff);
2204                     *s++ = (U8)( otherval        & 0xff);
2205                 }
2206                 else if (bits == 32) {
2207                     *s++ = (U8)((otherval >> 24) & 0xff);
2208                     *s++ = (U8)((otherval >> 16) & 0xff);
2209                     *s++ = (U8)((otherval >>  8) & 0xff);
2210                     *s++ = (U8)( otherval        & 0xff);
2211                 }
2212             }
2213         }
2214         sv_free(other); /* through with it! */
2215     } /* while */
2216     return swatch;
2217 }
2218
2219 /*
2220 =for apidoc uvchr_to_utf8
2221
2222 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2223 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2224 bytes available. The return value is the pointer to the byte after the
2225 end of the new character. In other words,
2226
2227     d = uvchr_to_utf8(d, uv);
2228
2229 is the recommended wide native character-aware way of saying
2230
2231     *(d++) = uv;
2232
2233 =cut
2234 */
2235
2236 /* On ASCII machines this is normally a macro but we want a
2237    real function in case XS code wants it
2238 */
2239 U8 *
2240 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2241 {
2242     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2243
2244     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2245 }
2246
2247 U8 *
2248 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2249 {
2250     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2251
2252     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2253 }
2254
2255 /*
2256 =for apidoc utf8n_to_uvchr
2257 flags
2258
2259 Returns the native character value of the first character in the string
2260 C<s>
2261 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2262 length, in bytes, of that character.
2263
2264 Allows length and flags to be passed to low level routine.
2265
2266 =cut
2267 */
2268 /* On ASCII machines this is normally a macro but we want
2269    a real function in case XS code wants it
2270 */
2271 UV
2272 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2273 U32 flags)
2274 {
2275     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2276
2277     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2278
2279     return UNI_TO_NATIVE(uv);
2280 }
2281
2282 /*
2283 =for apidoc pv_uni_display
2284
2285 Build to the scalar dsv a displayable version of the string spv,
2286 length len, the displayable version being at most pvlim bytes long
2287 (if longer, the rest is truncated and "..." will be appended).
2288
2289 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2290 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2291 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2292 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2293 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2294 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2295
2296 The pointer to the PV of the dsv is returned.
2297
2298 =cut */
2299 char *
2300 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2301 {
2302     int truncated = 0;
2303     const char *s, *e;
2304
2305     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2306
2307     sv_setpvs(dsv, "");
2308     SvUTF8_off(dsv);
2309     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2310          UV u;
2311           /* This serves double duty as a flag and a character to print after
2312              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2313           */
2314          char ok = 0;
2315
2316          if (pvlim && SvCUR(dsv) >= pvlim) {
2317               truncated++;
2318               break;
2319          }
2320          u = utf8_to_uvchr((U8*)s, 0);
2321          if (u < 256) {
2322              const unsigned char c = (unsigned char)u & 0xFF;
2323              if (flags & UNI_DISPLAY_BACKSLASH) {
2324                  switch (c) {
2325                  case '\n':
2326                      ok = 'n'; break;
2327                  case '\r':
2328                      ok = 'r'; break;
2329                  case '\t':
2330                      ok = 't'; break;
2331                  case '\f':
2332                      ok = 'f'; break;
2333                  case '\a':
2334                      ok = 'a'; break;
2335                  case '\\':
2336                      ok = '\\'; break;
2337                  default: break;
2338                  }
2339                  if (ok) {
2340                      const char string = ok;
2341                      sv_catpvs(dsv, "\\");
2342                      sv_catpvn(dsv, &string, 1);
2343                  }
2344              }
2345              /* isPRINT() is the locale-blind version. */
2346              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2347                  const char string = c;
2348                  sv_catpvn(dsv, &string, 1);
2349                  ok = 1;
2350              }
2351          }
2352          if (!ok)
2353              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2354     }
2355     if (truncated)
2356          sv_catpvs(dsv, "...");
2357
2358     return SvPVX(dsv);
2359 }
2360
2361 /*
2362 =for apidoc sv_uni_display
2363
2364 Build to the scalar dsv a displayable version of the scalar sv,
2365 the displayable version being at most pvlim bytes long
2366 (if longer, the rest is truncated and "..." will be appended).
2367
2368 The flags argument is as in pv_uni_display().
2369
2370 The pointer to the PV of the dsv is returned.
2371
2372 =cut
2373 */
2374 char *
2375 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2376 {
2377     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2378
2379      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2380                                 SvCUR(ssv), pvlim, flags);
2381 }
2382
2383 /*
2384 =for apidoc ibcmp_utf8
2385
2386 Return true if the strings s1 and s2 differ case-insensitively, false
2387 if not (if they are equal case-insensitively).  If u1 is true, the
2388 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2389 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2390 are false, the respective string is assumed to be in native 8-bit
2391 encoding.
2392
2393 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2394 in there (they will point at the beginning of the I<next> character).
2395 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2396 pointers beyond which scanning will not continue under any
2397 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2398 s2+l2 will be used as goal end pointers that will also stop the scan,
2399 and which qualify towards defining a successful match: all the scans
2400 that define an explicit length must reach their goal pointers for
2401 a match to succeed).
2402
2403 For case-insensitiveness, the "casefolding" of Unicode is used
2404 instead of upper/lowercasing both the characters, see
2405 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2406
2407 =cut */
2408 I32
2409 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2410 {
2411      dVAR;
2412      register const U8 *p1  = (const U8*)s1;
2413      register const U8 *p2  = (const U8*)s2;
2414      register const U8 *f1 = NULL;
2415      register const U8 *f2 = NULL;
2416      register U8 *e1 = NULL;
2417      register U8 *q1 = NULL;
2418      register U8 *e2 = NULL;
2419      register U8 *q2 = NULL;
2420      STRLEN n1 = 0, n2 = 0;
2421      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2422      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2423      U8 natbuf[1+1];
2424      STRLEN foldlen1, foldlen2;
2425      bool match;
2426
2427      PERL_ARGS_ASSERT_IBCMP_UTF8;
2428
2429      if (pe1)
2430           e1 = *(U8**)pe1;
2431      /* assert(e1 || l1); */
2432      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2433           f1 = (const U8*)s1 + l1;
2434      if (pe2)
2435           e2 = *(U8**)pe2;
2436      /* assert(e2 || l2); */
2437      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2438           f2 = (const U8*)s2 + l2;
2439
2440      /* This shouldn't happen. However, putting an assert() there makes some
2441       * tests fail. */
2442      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2443      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2444           return 1; /* mismatch; possible infinite loop or false positive */
2445
2446      if (!u1 || !u2)
2447           natbuf[1] = 0; /* Need to terminate the buffer. */
2448
2449      while ((e1 == 0 || p1 < e1) &&
2450             (f1 == 0 || p1 < f1) &&
2451             (e2 == 0 || p2 < e2) &&
2452             (f2 == 0 || p2 < f2)) {
2453           if (n1 == 0) {
2454                if (u1)
2455                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2456                else {
2457                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2458                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2459                }
2460                q1 = foldbuf1;
2461                n1 = foldlen1;
2462           }
2463           if (n2 == 0) {
2464                if (u2)
2465                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2466                else {
2467                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2468                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2469                }
2470                q2 = foldbuf2;
2471                n2 = foldlen2;
2472           }
2473           while (n1 && n2) {
2474                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2475                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2476                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2477                    return 1; /* mismatch */
2478                n1 -= UTF8SKIP(q1);
2479                q1 += UTF8SKIP(q1);
2480                n2 -= UTF8SKIP(q2);
2481                q2 += UTF8SKIP(q2);
2482           }
2483           if (n1 == 0)
2484                p1 += u1 ? UTF8SKIP(p1) : 1;
2485           if (n2 == 0)
2486                p2 += u2 ? UTF8SKIP(p2) : 1;
2487
2488      }
2489
2490      /* A match is defined by all the scans that specified
2491       * an explicit length reaching their final goals. */
2492      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2493
2494      if (match) {
2495           if (pe1)
2496                *pe1 = (char*)p1;
2497           if (pe2)
2498                *pe2 = (char*)p2;
2499      }
2500
2501      return match ? 0 : 1; /* 0 match, 1 mismatch */
2502 }
2503
2504 /*
2505  * Local variables:
2506  * c-indentation-style: bsd
2507  * c-basic-offset: 4
2508  * indent-tabs-mode: t
2509  * End:
2510  *
2511  * ex: set ts=8 sts=4 sw=4 noet:
2512  */