utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 /*
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 This is the "slow" version as opposed to the "fast" version which is
 178 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 179 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 181 you should use the _slow().  In practice this means that the _slow()
 182 will be used very rarely, since the maximum Unicode code point (as of
 183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 185 five bytes or more.
 186
 187 =cut */
 188 STATIC STRLEN
 189 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 190 {
 191     U8 u = *s;
 192     STRLEN slen;
 193     UV uv, ouv;
 194
 195     if (UTF8_IS_INVARIANT(u))
 196         return 1;
 197
 198     if (!UTF8_IS_START(u))
 199         return 0;
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206 #ifdef EBCDIC
 207     u = NATIVE_TO_UTF(u);
 208 #endif
 209     u &= UTF_START_MASK(len);
 210     uv  = u;
 211     ouv = uv;
 212     while (slen--) {
 213         if (!UTF8_IS_CONTINUATION(*s))
 214             return 0;
 215         uv = UTF8_ACCUMULATE(uv, *s);
 216         if (uv < ouv)
 217             return 0;
 218         ouv = uv;
 219         s++;
 220     }
 221
 222     if ((STRLEN)UNISKIP(uv) < len)
 223         return 0;
 224
 225     return len;
 226 }
 227
 228 /*
 229 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 230
 231 Tests if some arbitrary number of bytes begins in a valid UTF-8
 232 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 233 UTF-8 character.  The actual number of bytes in the UTF-8 character
 234 will be returned if it is valid, otherwise 0.
 235
 236 =cut */
 237 STRLEN
 238 Perl_is_utf8_char(pTHX_ const U8 *s)
 239 {
 240     const STRLEN len = UTF8SKIP(s);
 241     PERL_UNUSED_CONTEXT;
 242 #ifdef IS_UTF8_CHAR
 243     if (IS_UTF8_CHAR_FAST(len))
 244         return IS_UTF8_CHAR(s, len) ? len : 0;
 245 #endif /* #ifdef IS_UTF8_CHAR */
 246     return is_utf8_char_slow(s, len);
 247 }
 248
 249 /*
 250 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 251
 252 Returns true if first C<len> bytes of the given string form a valid
 253 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 254 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 255 because a valid ASCII string is a valid UTF-8 string.
 256
 257 See also is_utf8_string_loclen() and is_utf8_string_loc().
 258
 259 =cut
 260 */
 261
 262 bool
 263 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 264 {
 265     const U8* const send = s + (len ? len : strlen((const char *)s));
 266     const U8* x = s;
 267
 268     PERL_UNUSED_CONTEXT;
 269
 270     while (x < send) {
 271         STRLEN c;
 272          /* Inline the easy bits of is_utf8_char() here for speed... */
 273          if (UTF8_IS_INVARIANT(*x))
 274               c = 1;
 275          else if (!UTF8_IS_START(*x))
 276              goto out;
 277          else {
 278               /* ... and call is_utf8_char() only if really needed. */
 279 #ifdef IS_UTF8_CHAR
 280              c = UTF8SKIP(x);
 281              if (IS_UTF8_CHAR_FAST(c)) {
 282                  if (!IS_UTF8_CHAR(x, c))
 283                      c = 0;
 284              }
 285              else
 286                 c = is_utf8_char_slow(x, c);
 287 #else
 288              c = is_utf8_char(x);
 289 #endif /* #ifdef IS_UTF8_CHAR */
 290               if (!c)
 291                   goto out;
 292          }
 293         x += c;
 294     }
 295
 296  out:
 297     if (x != send)
 298         return FALSE;
 299
 300     return TRUE;
 301 }
 302
 303 /*
 304 Implemented as a macro in utf8.h
 305
 306 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 307
 308 Like is_utf8_string() but stores the location of the failure (in the
 309 case of "utf8ness failure") or the location s+len (in the case of
 310 "utf8ness success") in the C<ep>.
 311
 312 See also is_utf8_string_loclen() and is_utf8_string().
 313
 314 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 315
 316 Like is_utf8_string() but stores the location of the failure (in the
 317 case of "utf8ness failure") or the location s+len (in the case of
 318 "utf8ness success") in the C<ep>, and the number of UTF-8
 319 encoded characters in the C<el>.
 320
 321 See also is_utf8_string_loc() and is_utf8_string().
 322
 323 =cut
 324 */
 325
 326 bool
 327 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 328 {
 329     const U8* const send = s + (len ? len : strlen((const char *)s));
 330     const U8* x = s;
 331     STRLEN c;
 332     STRLEN outlen = 0;
 333     PERL_UNUSED_CONTEXT;
 334
 335     while (x < send) {
 336          /* Inline the easy bits of is_utf8_char() here for speed... */
 337          if (UTF8_IS_INVARIANT(*x))
 338              c = 1;
 339          else if (!UTF8_IS_START(*x))
 340              goto out;
 341          else {
 342              /* ... and call is_utf8_char() only if really needed. */
 343 #ifdef IS_UTF8_CHAR
 344              c = UTF8SKIP(x);
 345              if (IS_UTF8_CHAR_FAST(c)) {
 346                  if (!IS_UTF8_CHAR(x, c))
 347                      c = 0;
 348              } else
 349                  c = is_utf8_char_slow(x, c);
 350 #else
 351              c = is_utf8_char(x);
 352 #endif /* #ifdef IS_UTF8_CHAR */
 353              if (!c)
 354                  goto out;
 355          }
 356          x += c;
 357          outlen++;
 358     }
 359
 360  out:
 361     if (el)
 362         *el = outlen;
 363
 364     if (ep)
 365         *ep = x;
 366     return (x == send);
 367 }
 368
 369 /*
 370
 371 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 372
 373 Bottom level UTF-8 decode routine.
 374 Returns the unicode code point value of the first character in the string C<s>
 375 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 376 C<retlen> will be set to the length, in bytes, of that character.
 377
 378 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 379 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 380 it is assumed that the caller will raise a warning, and this function
 381 will silently just set C<retlen> to C<-1> and return zero.  If the
 382 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 383 malformations will be given, C<retlen> will be set to the expected
 384 length of the UTF-8 character in bytes, and zero will be returned.
 385
 386 The C<flags> can also contain various flags to allow deviations from
 387 the strict UTF-8 encoding (see F<utf8.h>).
 388
 389 Most code should use utf8_to_uvchr() rather than call this directly.
 390
 391 =cut
 392 */
 393
 394 UV
 395 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 396 {
 397     dVAR;
 398     const U8 * const s0 = s;
 399     UV uv = *s, ouv = 0;
 400     STRLEN len = 1;
 401     const bool dowarn = ckWARN_d(WARN_UTF8);
 402     const UV startbyte = *s;
 403     STRLEN expectlen = 0;
 404     U32 warning = 0;
 405
 406 /* This list is a superset of the UTF8_ALLOW_XXX. */
 407
 408 #define UTF8_WARN_EMPTY                          1
 409 #define UTF8_WARN_CONTINUATION                   2
 410 #define UTF8_WARN_NON_CONTINUATION               3
 411 #define UTF8_WARN_FE_FF                          4
 412 #define UTF8_WARN_SHORT                          5
 413 #define UTF8_WARN_OVERFLOW                       6
 414 #define UTF8_WARN_SURROGATE                      7
 415 #define UTF8_WARN_LONG                           8
 416 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 417
 418     if (curlen == 0 &&
 419         !(flags & UTF8_ALLOW_EMPTY)) {
 420         warning = UTF8_WARN_EMPTY;
 421         goto malformed;
 422     }
 423
 424     if (UTF8_IS_INVARIANT(uv)) {
 425         if (retlen)
 426             *retlen = 1;
 427         return (UV) (NATIVE_TO_UTF(*s));
 428     }
 429
 430     if (UTF8_IS_CONTINUATION(uv) &&
 431         !(flags & UTF8_ALLOW_CONTINUATION)) {
 432         warning = UTF8_WARN_CONTINUATION;
 433         goto malformed;
 434     }
 435
 436     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 437         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 438         warning = UTF8_WARN_NON_CONTINUATION;
 439         goto malformed;
 440     }
 441
 442 #ifdef EBCDIC
 443     uv = NATIVE_TO_UTF(uv);
 444 #else
 445     if ((uv == 0xfe || uv == 0xff) &&
 446         !(flags & UTF8_ALLOW_FE_FF)) {
 447         warning = UTF8_WARN_FE_FF;
 448         goto malformed;
 449     }
 450 #endif
 451
 452     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 453     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 454     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 455     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 456 #ifdef EBCDIC
 457     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 458     else                        { len =  7; uv &= 0x01; }
 459 #else
 460     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 461     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 462     else                        { len = 13; uv = 0; } /* whoa! */
 463 #endif
 464
 465     if (retlen)
 466         *retlen = len;
 467
 468     expectlen = len;
 469
 470     if ((curlen < expectlen) &&
 471         !(flags & UTF8_ALLOW_SHORT)) {
 472         warning = UTF8_WARN_SHORT;
 473         goto malformed;
 474     }
 475
 476     len--;
 477     s++;
 478     ouv = uv;
 479
 480     while (len--) {
 481         if (!UTF8_IS_CONTINUATION(*s) &&
 482             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 483             s--;
 484             warning = UTF8_WARN_NON_CONTINUATION;
 485             goto malformed;
 486         }
 487         else
 488             uv = UTF8_ACCUMULATE(uv, *s);
 489         if (!(uv > ouv)) {
 490             /* These cannot be allowed. */
 491             if (uv == ouv) {
 492                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 493                     warning = UTF8_WARN_LONG;
 494                     goto malformed;
 495                 }
 496             }
 497             else { /* uv < ouv */
 498                 /* This cannot be allowed. */
 499                 warning = UTF8_WARN_OVERFLOW;
 500                 goto malformed;
 501             }
 502         }
 503         s++;
 504         ouv = uv;
 505     }
 506
 507     if (UNICODE_IS_SURROGATE(uv) &&
 508         !(flags & UTF8_ALLOW_SURROGATE)) {
 509         warning = UTF8_WARN_SURROGATE;
 510         goto malformed;
 511     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 512                !(flags & UTF8_ALLOW_LONG)) {
 513         warning = UTF8_WARN_LONG;
 514         goto malformed;
 515     } else if (UNICODE_IS_ILLEGAL(uv) &&
 516                !(flags & UTF8_ALLOW_FFFF)) {
 517         warning = UTF8_WARN_FFFF;
 518         goto malformed;
 519     }
 520
 521     return uv;
 522
 523 malformed:
 524
 525     if (flags & UTF8_CHECK_ONLY) {
 526         if (retlen)
 527             *retlen = -1;
 528         return 0;
 529     }
 530
 531     if (dowarn) {
 532         SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
 533
 534         switch (warning) {
 535         case 0: /* Intentionally empty. */ break;
 536         case UTF8_WARN_EMPTY:
 537             sv_catpvs(sv, "(empty string)");
 538             break;
 539         case UTF8_WARN_CONTINUATION:
 540             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 541             break;
 542         case UTF8_WARN_NON_CONTINUATION:
 543             if (s == s0)
 544                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 545                            (UV)s[1], startbyte);
 546             else {
 547                 const int len = (int)(s-s0);
 548                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 549                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 550             }
 551
 552             break;
 553         case UTF8_WARN_FE_FF:
 554             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 555             break;
 556         case UTF8_WARN_SHORT:
 557             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 558                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 559             expectlen = curlen;         /* distance for caller to skip */
 560             break;
 561         case UTF8_WARN_OVERFLOW:
 562             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 563                            ouv, *s, startbyte);
 564             break;
 565         case UTF8_WARN_SURROGATE:
 566             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 567             break;
 568         case UTF8_WARN_LONG:
 569             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 570                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 571             break;
 572         case UTF8_WARN_FFFF:
 573             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 574             break;
 575         default:
 576             sv_catpvs(sv, "(unknown reason)");
 577             break;
 578         }
 579
 580         if (warning) {
 581             const char * const s = SvPVX_const(sv);
 582
 583             if (PL_op)
 584                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 585                             "%s in %s", s,  OP_DESC(PL_op));
 586             else
 587                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 588         }
 589     }
 590
 591     if (retlen)
 592         *retlen = expectlen ? expectlen : len;
 593
 594     return 0;
 595 }
 596
 597 /*
 598 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 599
 600 Returns the native character value of the first character in the string C<s>
 601 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 602 length, in bytes, of that character.
 603
 604 If C<s> does not point to a well-formed UTF-8 character, zero is
 605 returned and retlen is set, if possible, to -1.
 606
 607 =cut
 608 */
 609
 610 UV
 611 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 612 {
 613     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 614                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 615 }
 616
 617 /*
 618 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 619
 620 Returns the Unicode code point of the first character in the string C<s>
 621 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 622 length, in bytes, of that character.
 623
 624 This function should only be used when returned UV is considered
 625 an index into the Unicode semantic tables (e.g. swashes).
 626
 627 If C<s> does not point to a well-formed UTF-8 character, zero is
 628 returned and retlen is set, if possible, to -1.
 629
 630 =cut
 631 */
 632
 633 UV
 634 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 635 {
 636     /* Call the low level routine asking for checks */
 637     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 638                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 639 }
 640
 641 /*
 642 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 643
 644 Return the length of the UTF-8 char encoded string C<s> in characters.
 645 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 646 up past C<e>, croaks.
 647
 648 =cut
 649 */
 650
 651 STRLEN
 652 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 653 {
 654     dVAR;
 655     STRLEN len = 0;
 656
 657     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 658      * the bitops (especially ~) can create illegal UTF-8.
 659      * In other words: in Perl UTF-8 is not just for Unicode. */
 660
 661     if (e < s)
 662         goto warn_and_return;
 663     while (s < e) {
 664         const U8 t = UTF8SKIP(s);
 665         if (e - s < t) {
 666             warn_and_return:
 667             if (ckWARN_d(WARN_UTF8)) {
 668                 if (PL_op)
 669                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 670                             "%s in %s", unees, OP_DESC(PL_op));
 671                 else
 672                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 673             }
 674             return len;
 675         }
 676         s += t;
 677         len++;
 678     }
 679
 680     return len;
 681 }
 682
 683 /*
 684 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 685
 686 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 687 and C<b>.
 688
 689 WARNING: use only if you *know* that the pointers point inside the
 690 same UTF-8 buffer.
 691
 692 =cut
 693 */
 694
 695 IV
 696 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 697 {
 698     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 699 }
 700
 701 /*
 702 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 703
 704 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 705 forward or backward.
 706
 707 WARNING: do not use the following unless you *know* C<off> is within
 708 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 709 on the first byte of character or just after the last byte of a character.
 710
 711 =cut
 712 */
 713
 714 U8 *
 715 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 716 {
 717     PERL_UNUSED_CONTEXT;
 718     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 719      * the bitops (especially ~) can create illegal UTF-8.
 720      * In other words: in Perl UTF-8 is not just for Unicode. */
 721
 722     if (off >= 0) {
 723         while (off--)
 724             s += UTF8SKIP(s);
 725     }
 726     else {
 727         while (off++) {
 728             s--;
 729             while (UTF8_IS_CONTINUATION(*s))
 730                 s--;
 731         }
 732     }
 733     return (U8 *)s;
 734 }
 735
 736 /*
 737 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 738
 739 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 740 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 741 updates len to contain the new length.
 742 Returns zero on failure, setting C<len> to -1.
 743
 744 If you need a copy of the string, see C<bytes_from_utf8>.
 745
 746 =cut
 747 */
 748
 749 U8 *
 750 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 751 {
 752     U8 * const save = s;
 753     U8 * const send = s + *len;
 754     U8 *d;
 755
 756     /* ensure valid UTF-8 and chars < 256 before updating string */
 757     while (s < send) {
 758         U8 c = *s++;
 759
 760         if (!UTF8_IS_INVARIANT(c) &&
 761             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 762              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 763             *len = -1;
 764             return 0;
 765         }
 766     }
 767
 768     d = s = save;
 769     while (s < send) {
 770         STRLEN ulen;
 771         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 772         s += ulen;
 773     }
 774     *d = '\0';
 775     *len = d - save;
 776     return save;
 777 }
 778
 779 /*
 780 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 781
 782 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 783 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 784 the newly-created string, and updates C<len> to contain the new
 785 length.  Returns the original string if no conversion occurs, C<len>
 786 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 787 0 if C<s> is converted or contains all 7bit characters.
 788
 789 =cut
 790 */
 791
 792 U8 *
 793 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 794 {
 795     U8 *d;
 796     const U8 *start = s;
 797     const U8 *send;
 798     I32 count = 0;
 799
 800     PERL_UNUSED_CONTEXT;
 801     if (!*is_utf8)
 802         return (U8 *)start;
 803
 804     /* ensure valid UTF-8 and chars < 256 before converting string */
 805     for (send = s + *len; s < send;) {
 806         U8 c = *s++;
 807         if (!UTF8_IS_INVARIANT(c)) {
 808             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 809                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 810                 count++;
 811             else
 812                 return (U8 *)start;
 813         }
 814     }
 815
 816     *is_utf8 = FALSE;
 817
 818     Newx(d, (*len) - count + 1, U8);
 819     s = start; start = d;
 820     while (s < send) {
 821         U8 c = *s++;
 822         if (!UTF8_IS_INVARIANT(c)) {
 823             /* Then it is two-byte encoded */
 824             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 825             c = ASCII_TO_NATIVE(c);
 826         }
 827         *d++ = c;
 828     }
 829     *d = '\0';
 830     *len = d - start;
 831     return (U8 *)start;
 832 }
 833
 834 /*
 835 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 836
 837 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 838 Returns a pointer to the newly-created string, and sets C<len> to
 839 reflect the new length.
 840
 841 If you want to convert to UTF-8 from other encodings than ASCII,
 842 see sv_recode_to_utf8().
 843
 844 =cut
 845 */
 846
 847 U8*
 848 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 849 {
 850     const U8 * const send = s + (*len);
 851     U8 *d;
 852     U8 *dst;
 853     PERL_UNUSED_CONTEXT;
 854
 855     Newx(d, (*len) * 2 + 1, U8);
 856     dst = d;
 857
 858     while (s < send) {
 859         const UV uv = NATIVE_TO_ASCII(*s++);
 860         if (UNI_IS_INVARIANT(uv))
 861             *d++ = (U8)UTF_TO_NATIVE(uv);
 862         else {
 863             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 864             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 865         }
 866     }
 867     *d = '\0';
 868     *len = d-dst;
 869     return dst;
 870 }
 871
 872 /*
 873  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 874  *
 875  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 876  * We optimize for native, for obvious reasons. */
 877
 878 U8*
 879 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 880 {
 881     U8* pend;
 882     U8* dstart = d;
 883
 884     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 885          d[0] = 0;
 886          *newlen = 1;
 887          return d;
 888     }
 889
 890     if (bytelen & 1)
 891         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 892
 893     pend = p + bytelen;
 894
 895     while (p < pend) {
 896         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 897         p += 2;
 898         if (uv < 0x80) {
 899             *d++ = (U8)uv;
 900             continue;
 901         }
 902         if (uv < 0x800) {
 903             *d++ = (U8)(( uv >>  6)         | 0xc0);
 904             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 905             continue;
 906         }
 907         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 908             UV low = (p[0] << 8) + p[1];
 909             p += 2;
 910             if (low < 0xdc00 || low >= 0xdfff)
 911                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 912             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 913         }
 914         if (uv < 0x10000) {
 915             *d++ = (U8)(( uv >> 12)         | 0xe0);
 916             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 917             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 918             continue;
 919         }
 920         else {
 921             *d++ = (U8)(( uv >> 18)         | 0xf0);
 922             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 923             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 924             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 925             continue;
 926         }
 927     }
 928     *newlen = d - dstart;
 929     return d;
 930 }
 931
 932 /* Note: this one is slightly destructive of the source. */
 933
 934 U8*
 935 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 936 {
 937     U8* s = (U8*)p;
 938     U8* const send = s + bytelen;
 939     while (s < send) {
 940         const U8 tmp = s[0];
 941         s[0] = s[1];
 942         s[1] = tmp;
 943         s += 2;
 944     }
 945     return utf16_to_utf8(p, d, bytelen, newlen);
 946 }
 947
 948 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 949
 950 bool
 951 Perl_is_uni_alnum(pTHX_ UV c)
 952 {
 953     U8 tmpbuf[UTF8_MAXBYTES+1];
 954     uvchr_to_utf8(tmpbuf, c);
 955     return is_utf8_alnum(tmpbuf);
 956 }
 957
 958 bool
 959 Perl_is_uni_alnumc(pTHX_ UV c)
 960 {
 961     U8 tmpbuf[UTF8_MAXBYTES+1];
 962     uvchr_to_utf8(tmpbuf, c);
 963     return is_utf8_alnumc(tmpbuf);
 964 }
 965
 966 bool
 967 Perl_is_uni_idfirst(pTHX_ UV c)
 968 {
 969     U8 tmpbuf[UTF8_MAXBYTES+1];
 970     uvchr_to_utf8(tmpbuf, c);
 971     return is_utf8_idfirst(tmpbuf);
 972 }
 973
 974 bool
 975 Perl_is_uni_alpha(pTHX_ UV c)
 976 {
 977     U8 tmpbuf[UTF8_MAXBYTES+1];
 978     uvchr_to_utf8(tmpbuf, c);
 979     return is_utf8_alpha(tmpbuf);
 980 }
 981
 982 bool
 983 Perl_is_uni_ascii(pTHX_ UV c)
 984 {
 985     U8 tmpbuf[UTF8_MAXBYTES+1];
 986     uvchr_to_utf8(tmpbuf, c);
 987     return is_utf8_ascii(tmpbuf);
 988 }
 989
 990 bool
 991 Perl_is_uni_space(pTHX_ UV c)
 992 {
 993     U8 tmpbuf[UTF8_MAXBYTES+1];
 994     uvchr_to_utf8(tmpbuf, c);
 995     return is_utf8_space(tmpbuf);
 996 }
 997
 998 bool
 999 Perl_is_uni_digit(pTHX_ UV c)
1000 {
1001     U8 tmpbuf[UTF8_MAXBYTES+1];
1002     uvchr_to_utf8(tmpbuf, c);
1003     return is_utf8_digit(tmpbuf);
1004 }
1005
1006 bool
1007 Perl_is_uni_upper(pTHX_ UV c)
1008 {
1009     U8 tmpbuf[UTF8_MAXBYTES+1];
1010     uvchr_to_utf8(tmpbuf, c);
1011     return is_utf8_upper(tmpbuf);
1012 }
1013
1014 bool
1015 Perl_is_uni_lower(pTHX_ UV c)
1016 {
1017     U8 tmpbuf[UTF8_MAXBYTES+1];
1018     uvchr_to_utf8(tmpbuf, c);
1019     return is_utf8_lower(tmpbuf);
1020 }
1021
1022 bool
1023 Perl_is_uni_cntrl(pTHX_ UV c)
1024 {
1025     U8 tmpbuf[UTF8_MAXBYTES+1];
1026     uvchr_to_utf8(tmpbuf, c);
1027     return is_utf8_cntrl(tmpbuf);
1028 }
1029
1030 bool
1031 Perl_is_uni_graph(pTHX_ UV c)
1032 {
1033     U8 tmpbuf[UTF8_MAXBYTES+1];
1034     uvchr_to_utf8(tmpbuf, c);
1035     return is_utf8_graph(tmpbuf);
1036 }
1037
1038 bool
1039 Perl_is_uni_print(pTHX_ UV c)
1040 {
1041     U8 tmpbuf[UTF8_MAXBYTES+1];
1042     uvchr_to_utf8(tmpbuf, c);
1043     return is_utf8_print(tmpbuf);
1044 }
1045
1046 bool
1047 Perl_is_uni_punct(pTHX_ UV c)
1048 {
1049     U8 tmpbuf[UTF8_MAXBYTES+1];
1050     uvchr_to_utf8(tmpbuf, c);
1051     return is_utf8_punct(tmpbuf);
1052 }
1053
1054 bool
1055 Perl_is_uni_xdigit(pTHX_ UV c)
1056 {
1057     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1058     uvchr_to_utf8(tmpbuf, c);
1059     return is_utf8_xdigit(tmpbuf);
1060 }
1061
1062 UV
1063 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1064 {
1065     uvchr_to_utf8(p, c);
1066     return to_utf8_upper(p, p, lenp);
1067 }
1068
1069 UV
1070 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1071 {
1072     uvchr_to_utf8(p, c);
1073     return to_utf8_title(p, p, lenp);
1074 }
1075
1076 UV
1077 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1078 {
1079     uvchr_to_utf8(p, c);
1080     return to_utf8_lower(p, p, lenp);
1081 }
1082
1083 UV
1084 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1085 {
1086     uvchr_to_utf8(p, c);
1087     return to_utf8_fold(p, p, lenp);
1088 }
1089
1090 /* for now these all assume no locale info available for Unicode > 255 */
1091
1092 bool
1093 Perl_is_uni_alnum_lc(pTHX_ UV c)
1094 {
1095     return is_uni_alnum(c);     /* XXX no locale support yet */
1096 }
1097
1098 bool
1099 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1100 {
1101     return is_uni_alnumc(c);    /* XXX no locale support yet */
1102 }
1103
1104 bool
1105 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1106 {
1107     return is_uni_idfirst(c);   /* XXX no locale support yet */
1108 }
1109
1110 bool
1111 Perl_is_uni_alpha_lc(pTHX_ UV c)
1112 {
1113     return is_uni_alpha(c);     /* XXX no locale support yet */
1114 }
1115
1116 bool
1117 Perl_is_uni_ascii_lc(pTHX_ UV c)
1118 {
1119     return is_uni_ascii(c);     /* XXX no locale support yet */
1120 }
1121
1122 bool
1123 Perl_is_uni_space_lc(pTHX_ UV c)
1124 {
1125     return is_uni_space(c);     /* XXX no locale support yet */
1126 }
1127
1128 bool
1129 Perl_is_uni_digit_lc(pTHX_ UV c)
1130 {
1131     return is_uni_digit(c);     /* XXX no locale support yet */
1132 }
1133
1134 bool
1135 Perl_is_uni_upper_lc(pTHX_ UV c)
1136 {
1137     return is_uni_upper(c);     /* XXX no locale support yet */
1138 }
1139
1140 bool
1141 Perl_is_uni_lower_lc(pTHX_ UV c)
1142 {
1143     return is_uni_lower(c);     /* XXX no locale support yet */
1144 }
1145
1146 bool
1147 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1148 {
1149     return is_uni_cntrl(c);     /* XXX no locale support yet */
1150 }
1151
1152 bool
1153 Perl_is_uni_graph_lc(pTHX_ UV c)
1154 {
1155     return is_uni_graph(c);     /* XXX no locale support yet */
1156 }
1157
1158 bool
1159 Perl_is_uni_print_lc(pTHX_ UV c)
1160 {
1161     return is_uni_print(c);     /* XXX no locale support yet */
1162 }
1163
1164 bool
1165 Perl_is_uni_punct_lc(pTHX_ UV c)
1166 {
1167     return is_uni_punct(c);     /* XXX no locale support yet */
1168 }
1169
1170 bool
1171 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1172 {
1173     return is_uni_xdigit(c);    /* XXX no locale support yet */
1174 }
1175
1176 U32
1177 Perl_to_uni_upper_lc(pTHX_ U32 c)
1178 {
1179     /* XXX returns only the first character -- do not use XXX */
1180     /* XXX no locale support yet */
1181     STRLEN len;
1182     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1183     return (U32)to_uni_upper(c, tmpbuf, &len);
1184 }
1185
1186 U32
1187 Perl_to_uni_title_lc(pTHX_ U32 c)
1188 {
1189     /* XXX returns only the first character XXX -- do not use XXX */
1190     /* XXX no locale support yet */
1191     STRLEN len;
1192     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1193     return (U32)to_uni_title(c, tmpbuf, &len);
1194 }
1195
1196 U32
1197 Perl_to_uni_lower_lc(pTHX_ U32 c)
1198 {
1199     /* XXX returns only the first character -- do not use XXX */
1200     /* XXX no locale support yet */
1201     STRLEN len;
1202     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1203     return (U32)to_uni_lower(c, tmpbuf, &len);
1204 }
1205
1206 static bool
1207 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1208                  const char *const swashname)
1209 {
1210     dVAR;
1211     if (!is_utf8_char(p))
1212         return FALSE;
1213     if (!*swash)
1214         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1215     return swash_fetch(*swash, p, TRUE) != 0;
1216 }
1217
1218 bool
1219 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1220 {
1221     dVAR;
1222     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1223      * descendant of isalnum(3), in other words, it doesn't
1224      * contain the '_'. --jhi */
1225     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1226 }
1227
1228 bool
1229 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1230 {
1231     dVAR;
1232     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1233 }
1234
1235 bool
1236 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1237 {
1238     dVAR;
1239     if (*p == '_')
1240         return TRUE;
1241     /* is_utf8_idstart would be more logical. */
1242     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1243 }
1244
1245 bool
1246 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1247 {
1248     dVAR;
1249     if (*p == '_')
1250         return TRUE;
1251     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1252 }
1253
1254 bool
1255 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1256 {
1257     dVAR;
1258     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1259 }
1260
1261 bool
1262 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1263 {
1264     dVAR;
1265     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1266 }
1267
1268 bool
1269 Perl_is_utf8_space(pTHX_ const U8 *p)
1270 {
1271     dVAR;
1272     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1273 }
1274
1275 bool
1276 Perl_is_utf8_digit(pTHX_ const U8 *p)
1277 {
1278     dVAR;
1279     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1280 }
1281
1282 bool
1283 Perl_is_utf8_upper(pTHX_ const U8 *p)
1284 {
1285     dVAR;
1286     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1287 }
1288
1289 bool
1290 Perl_is_utf8_lower(pTHX_ const U8 *p)
1291 {
1292     dVAR;
1293     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1294 }
1295
1296 bool
1297 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1298 {
1299     dVAR;
1300     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1301 }
1302
1303 bool
1304 Perl_is_utf8_graph(pTHX_ const U8 *p)
1305 {
1306     dVAR;
1307     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1308 }
1309
1310 bool
1311 Perl_is_utf8_print(pTHX_ const U8 *p)
1312 {
1313     dVAR;
1314     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1315 }
1316
1317 bool
1318 Perl_is_utf8_punct(pTHX_ const U8 *p)
1319 {
1320     dVAR;
1321     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1322 }
1323
1324 bool
1325 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1326 {
1327     dVAR;
1328     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1329 }
1330
1331 bool
1332 Perl_is_utf8_mark(pTHX_ const U8 *p)
1333 {
1334     dVAR;
1335     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1336 }
1337
1338 /*
1339 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1340
1341 The "p" contains the pointer to the UTF-8 string encoding
1342 the character that is being converted.
1343
1344 The "ustrp" is a pointer to the character buffer to put the
1345 conversion result to.  The "lenp" is a pointer to the length
1346 of the result.
1347
1348 The "swashp" is a pointer to the swash to use.
1349
1350 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1351 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1352 but not always, a multicharacter mapping), is tried first.
1353
1354 The "special" is a string like "utf8::ToSpecLower", which means the
1355 hash %utf8::ToSpecLower.  The access to the hash is through
1356 Perl_to_utf8_case().
1357
1358 The "normal" is a string like "ToLower" which means the swash
1359 %utf8::ToLower.
1360
1361 =cut */
1362
1363 UV
1364 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1365                         SV **swashp, const char *normal, const char *special)
1366 {
1367     dVAR;
1368     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1369     STRLEN len = 0;
1370
1371     const UV uv0 = utf8_to_uvchr(p, NULL);
1372     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1373      * are necessary in EBCDIC, they are redundant no-ops
1374      * in ASCII-ish platforms, and hopefully optimized away. */
1375     const UV uv1 = NATIVE_TO_UNI(uv0);
1376     uvuni_to_utf8(tmpbuf, uv1);
1377
1378     if (!*swashp) /* load on-demand */
1379          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1380
1381     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1382     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1383          /* It might be "special" (sometimes, but not always,
1384           * a multicharacter mapping) */
1385          HV * const hv = get_hv(special, FALSE);
1386          SV **svp;
1387
1388          if (hv &&
1389              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1390              (*svp)) {
1391              const char *s;
1392
1393               s = SvPV_const(*svp, len);
1394               if (len == 1)
1395                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1396               else {
1397 #ifdef EBCDIC
1398                    /* If we have EBCDIC we need to remap the characters
1399                     * since any characters in the low 256 are Unicode
1400                     * code points, not EBCDIC. */
1401                    U8 *t = (U8*)s, *tend = t + len, *d;
1402
1403                    d = tmpbuf;
1404                    if (SvUTF8(*svp)) {
1405                         STRLEN tlen = 0;
1406
1407                         while (t < tend) {
1408                              const UV c = utf8_to_uvchr(t, &tlen);
1409                              if (tlen > 0) {
1410                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1411                                   t += tlen;
1412                              }
1413                              else
1414                                   break;
1415                         }
1416                    }
1417                    else {
1418                         while (t < tend) {
1419                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1420                              t++;
1421                         }
1422                    }
1423                    len = d - tmpbuf;
1424                    Copy(tmpbuf, ustrp, len, U8);
1425 #else
1426                    Copy(s, ustrp, len, U8);
1427 #endif
1428               }
1429          }
1430     }
1431
1432     if (!len && *swashp) {
1433         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1434
1435          if (uv2) {
1436               /* It was "normal" (a single character mapping). */
1437               const UV uv3 = UNI_TO_NATIVE(uv2);
1438               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1439          }
1440     }
1441
1442     if (!len) /* Neither: just copy. */
1443          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1444
1445     if (lenp)
1446          *lenp = len;
1447
1448     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1449 }
1450
1451 /*
1452 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1453
1454 Convert the UTF-8 encoded character at p to its uppercase version and
1455 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1456 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1457 the uppercase version may be longer than the original character.
1458
1459 The first character of the uppercased version is returned
1460 (but note, as explained above, that there may be more.)
1461
1462 =cut */
1463
1464 UV
1465 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1466 {
1467     dVAR;
1468     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1469                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1470 }
1471
1472 /*
1473 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1474
1475 Convert the UTF-8 encoded character at p to its titlecase version and
1476 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1477 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1478 titlecase version may be longer than the original character.
1479
1480 The first character of the titlecased version is returned
1481 (but note, as explained above, that there may be more.)
1482
1483 =cut */
1484
1485 UV
1486 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1487 {
1488     dVAR;
1489     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1490                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1491 }
1492
1493 /*
1494 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1495
1496 Convert the UTF-8 encoded character at p to its lowercase version and
1497 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1498 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1499 lowercase version may be longer than the original character.
1500
1501 The first character of the lowercased version is returned
1502 (but note, as explained above, that there may be more.)
1503
1504 =cut */
1505
1506 UV
1507 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1508 {
1509     dVAR;
1510     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1511                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1512 }
1513
1514 /*
1515 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1516
1517 Convert the UTF-8 encoded character at p to its foldcase version and
1518 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1519 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1520 foldcase version may be longer than the original character (up to
1521 three characters).
1522
1523 The first character of the foldcased version is returned
1524 (but note, as explained above, that there may be more.)
1525
1526 =cut */
1527
1528 UV
1529 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1530 {
1531     dVAR;
1532     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1533                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1534 }
1535
1536 /* Note:
1537  * A "swash" is a swatch hash.
1538  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1539  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1540  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1541  */
1542 SV*
1543 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1544 {
1545     dVAR;
1546     SV* retval;
1547     SV* const tokenbufsv = sv_newmortal();
1548     dSP;
1549     const size_t pkg_len = strlen(pkg);
1550     const size_t name_len = strlen(name);
1551     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1552     SV* errsv_save;
1553
1554     PUSHSTACKi(PERLSI_MAGIC);
1555     ENTER;
1556     SAVEI32(PL_hints);
1557     PL_hints = 0;
1558     save_re_context();
1559     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1560         ENTER;
1561         errsv_save = newSVsv(ERRSV);
1562         /* It is assumed that callers of this routine are not passing in any
1563            user derived data.  */
1564         /* Need to do this after save_re_context() as it will set PL_tainted to
1565            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1566            Even line to create errsv_save can turn on PL_tainted.  */
1567         SAVEBOOL(PL_tainted);
1568         PL_tainted = 0;
1569         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1570                          NULL);
1571         if (!SvTRUE(ERRSV))
1572             sv_setsv(ERRSV, errsv_save);
1573         SvREFCNT_dec(errsv_save);
1574         LEAVE;
1575     }
1576     SPAGAIN;
1577     PUSHMARK(SP);
1578     EXTEND(SP,5);
1579     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1580     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1581     PUSHs(listsv);
1582     PUSHs(sv_2mortal(newSViv(minbits)));
1583     PUSHs(sv_2mortal(newSViv(none)));
1584     PUTBACK;
1585     if (IN_PERL_COMPILETIME) {
1586         /* XXX ought to be handled by lex_start */
1587         SAVEI32(PL_in_my);
1588         PL_in_my = 0;
1589         sv_setpv(tokenbufsv, PL_tokenbuf);
1590     }
1591     errsv_save = newSVsv(ERRSV);
1592     if (call_method("SWASHNEW", G_SCALAR))
1593         retval = newSVsv(*PL_stack_sp--);
1594     else
1595         retval = &PL_sv_undef;
1596     if (!SvTRUE(ERRSV))
1597         sv_setsv(ERRSV, errsv_save);
1598     SvREFCNT_dec(errsv_save);
1599     LEAVE;
1600     POPSTACK;
1601     if (IN_PERL_COMPILETIME) {
1602         STRLEN len;
1603         const char* const pv = SvPV_const(tokenbufsv, len);
1604
1605         Copy(pv, PL_tokenbuf, len+1, char);
1606         CopHINTS_set(PL_curcop, PL_hints);
1607     }
1608     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1609         if (SvPOK(retval))
1610             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1611                        (void*)retval);
1612         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1613     }
1614     return retval;
1615 }
1616
1617
1618 /* This API is wrong for special case conversions since we may need to
1619  * return several Unicode characters for a single Unicode character
1620  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1621  * the lower-level routine, and it is similarly broken for returning
1622  * multiple values.  --jhi */
1623 /* Now SWASHGET is recasted into S_swash_get in this file. */
1624
1625 /* Note:
1626  * Returns the value of property/mapping C<swash> for the first character
1627  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1628  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1629  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1630  */
1631 UV
1632 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1633 {
1634     dVAR;
1635     HV* const hv = (HV*)SvRV(swash);
1636     U32 klen;
1637     U32 off;
1638     STRLEN slen;
1639     STRLEN needents;
1640     const U8 *tmps = NULL;
1641     U32 bit;
1642     SV *swatch;
1643     U8 tmputf8[2];
1644     const UV c = NATIVE_TO_ASCII(*ptr);
1645
1646     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1647         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1648         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1649         ptr = tmputf8;
1650     }
1651     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1652      * then the "swatch" is a vec() for al the chars which start
1653      * with 0xAA..0xYY
1654      * So the key in the hash (klen) is length of encoded char -1
1655      */
1656     klen = UTF8SKIP(ptr) - 1;
1657     off  = ptr[klen];
1658
1659     if (klen == 0) {
1660       /* If char in invariant then swatch is for all the invariant chars
1661        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1662        */
1663         needents = UTF_CONTINUATION_MARK;
1664         off      = NATIVE_TO_UTF(ptr[klen]);
1665     }
1666     else {
1667       /* If char is encoded then swatch is for the prefix */
1668         needents = (1 << UTF_ACCUMULATION_SHIFT);
1669         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1670     }
1671
1672     /*
1673      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1674      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1675      * it's nothing to sniff at.)  Pity we usually come through at least
1676      * two function calls to get here...
1677      *
1678      * NB: this code assumes that swatches are never modified, once generated!
1679      */
1680
1681     if (hv   == PL_last_swash_hv &&
1682         klen == PL_last_swash_klen &&
1683         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1684     {
1685         tmps = PL_last_swash_tmps;
1686         slen = PL_last_swash_slen;
1687     }
1688     else {
1689         /* Try our second-level swatch cache, kept in a hash. */
1690         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1691
1692         /* If not cached, generate it via swash_get */
1693         if (!svp || !SvPOK(*svp)
1694                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1695             /* We use utf8n_to_uvuni() as we want an index into
1696                Unicode tables, not a native character number.
1697              */
1698             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1699                                            ckWARN(WARN_UTF8) ?
1700                                            0 : UTF8_ALLOW_ANY);
1701             swatch = swash_get(swash,
1702                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1703                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1704                                 needents);
1705
1706             if (IN_PERL_COMPILETIME)
1707                 CopHINTS_set(PL_curcop, PL_hints);
1708
1709             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1710
1711             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1712                      || (slen << 3) < needents)
1713                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1714         }
1715
1716         PL_last_swash_hv = hv;
1717         PL_last_swash_klen = klen;
1718         /* FIXME change interpvar.h?  */
1719         PL_last_swash_tmps = (U8 *) tmps;
1720         PL_last_swash_slen = slen;
1721         if (klen)
1722             Copy(ptr, PL_last_swash_key, klen, U8);
1723     }
1724
1725     switch ((int)((slen << 3) / needents)) {
1726     case 1:
1727         bit = 1 << (off & 7);
1728         off >>= 3;
1729         return (tmps[off] & bit) != 0;
1730     case 8:
1731         return tmps[off];
1732     case 16:
1733         off <<= 1;
1734         return (tmps[off] << 8) + tmps[off + 1] ;
1735     case 32:
1736         off <<= 2;
1737         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1738     }
1739     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1740 }
1741
1742 /* Note:
1743  * Returns a swatch (a bit vector string) for a code point sequence
1744  * that starts from the value C<start> and comprises the number C<span>.
1745  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1746  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1747  */
1748 STATIC SV*
1749 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1750 {
1751     SV *swatch;
1752     U8 *l, *lend, *x, *xend, *s;
1753     STRLEN lcur, xcur, scur;
1754
1755     HV* const hv = (HV*)SvRV(swash);
1756     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1757     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1758     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1759     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1760     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1761     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1762     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1763     const STRLEN bits  = SvUV(*bitssvp);
1764     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1765     const UV     none  = SvUV(*nonesvp);
1766     const UV     end   = start + span;
1767
1768     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1769         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1770                                                  (UV)bits);
1771     }
1772
1773     /* create and initialize $swatch */
1774     swatch = newSVpvs("");
1775     scur   = octets ? (span * octets) : (span + 7) / 8;
1776     SvGROW(swatch, scur + 1);
1777     s = (U8*)SvPVX(swatch);
1778     if (octets && none) {
1779         const U8* const e = s + scur;
1780         while (s < e) {
1781             if (bits == 8)
1782                 *s++ = (U8)(none & 0xff);
1783             else if (bits == 16) {
1784                 *s++ = (U8)((none >>  8) & 0xff);
1785                 *s++ = (U8)( none        & 0xff);
1786             }
1787             else if (bits == 32) {
1788                 *s++ = (U8)((none >> 24) & 0xff);
1789                 *s++ = (U8)((none >> 16) & 0xff);
1790                 *s++ = (U8)((none >>  8) & 0xff);
1791                 *s++ = (U8)( none        & 0xff);
1792             }
1793         }
1794         *s = '\0';
1795     }
1796     else {
1797         (void)memzero((U8*)s, scur + 1);
1798     }
1799     SvCUR_set(swatch, scur);
1800     s = (U8*)SvPVX(swatch);
1801
1802     /* read $swash->{LIST} */
1803     l = (U8*)SvPV(*listsvp, lcur);
1804     lend = l + lcur;
1805     while (l < lend) {
1806         UV min, max, val;
1807         STRLEN numlen;
1808         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1809
1810         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1811
1812         numlen = lend - l;
1813         min = grok_hex((char *)l, &numlen, &flags, NULL);
1814         if (numlen)
1815             l += numlen;
1816         else if (nl) {
1817             l = nl + 1; /* 1 is length of "\n" */
1818             continue;
1819         }
1820         else {
1821             l = lend; /* to LIST's end at which \n is not found */
1822             break;
1823         }
1824
1825         if (isBLANK(*l)) {
1826             ++l;
1827             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1828             numlen = lend - l;
1829             max = grok_hex((char *)l, &numlen, &flags, NULL);
1830             if (numlen)
1831                 l += numlen;
1832             else
1833                 max = min;
1834
1835             if (octets) {
1836                 if (isBLANK(*l)) {
1837                     ++l;
1838                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1839                             PERL_SCAN_DISALLOW_PREFIX;
1840                     numlen = lend - l;
1841                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1842                     if (numlen)
1843                         l += numlen;
1844                     else
1845                         val = 0;
1846                 }
1847                 else {
1848                     val = 0;
1849                     if (typeto) {
1850                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1851                                          typestr, l);
1852                     }
1853                 }
1854             }
1855             else
1856                 val = 0; /* bits == 1, then val should be ignored */
1857         }
1858         else {
1859             max = min;
1860             if (octets) {
1861                 val = 0;
1862                 if (typeto) {
1863                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1864                 }
1865             }
1866             else
1867                 val = 0; /* bits == 1, then val should be ignored */
1868         }
1869
1870         if (nl)
1871             l = nl + 1;
1872         else
1873             l = lend;
1874
1875         if (max < start)
1876             continue;
1877
1878         if (octets) {
1879             UV key;
1880             if (min < start) {
1881                 if (!none || val < none) {
1882                     val += start - min;
1883                 }
1884                 min = start;
1885             }
1886             for (key = min; key <= max; key++) {
1887                 STRLEN offset;
1888                 if (key >= end)
1889                     goto go_out_list;
1890                 /* offset must be non-negative (start <= min <= key < end) */
1891                 offset = octets * (key - start);
1892                 if (bits == 8)
1893                     s[offset] = (U8)(val & 0xff);
1894                 else if (bits == 16) {
1895                     s[offset    ] = (U8)((val >>  8) & 0xff);
1896                     s[offset + 1] = (U8)( val        & 0xff);
1897                 }
1898                 else if (bits == 32) {
1899                     s[offset    ] = (U8)((val >> 24) & 0xff);
1900                     s[offset + 1] = (U8)((val >> 16) & 0xff);
1901                     s[offset + 2] = (U8)((val >>  8) & 0xff);
1902                     s[offset + 3] = (U8)( val        & 0xff);
1903                 }
1904
1905                 if (!none || val < none)
1906                     ++val;
1907             }
1908         }
1909         else { /* bits == 1, then val should be ignored */
1910             UV key;
1911             if (min < start)
1912                 min = start;
1913             for (key = min; key <= max; key++) {
1914                 const STRLEN offset = (STRLEN)(key - start);
1915                 if (key >= end)
1916                     goto go_out_list;
1917                 s[offset >> 3] |= 1 << (offset & 7);
1918             }
1919         }
1920     } /* while */
1921   go_out_list:
1922
1923     /* read $swash->{EXTRAS} */
1924     x = (U8*)SvPV(*extssvp, xcur);
1925     xend = x + xcur;
1926     while (x < xend) {
1927         STRLEN namelen;
1928         U8 *namestr;
1929         SV** othersvp;
1930         HV* otherhv;
1931         STRLEN otherbits;
1932         SV **otherbitssvp, *other;
1933         U8 *s, *o, *nl;
1934         STRLEN slen, olen;
1935
1936         const U8 opc = *x++;
1937         if (opc == '\n')
1938             continue;
1939
1940         nl = (U8*)memchr(x, '\n', xend - x);
1941
1942         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1943             if (nl) {
1944                 x = nl + 1; /* 1 is length of "\n" */
1945                 continue;
1946             }
1947             else {
1948                 x = xend; /* to EXTRAS' end at which \n is not found */
1949                 break;
1950             }
1951         }
1952
1953         namestr = x;
1954         if (nl) {
1955             namelen = nl - namestr;
1956             x = nl + 1;
1957         }
1958         else {
1959             namelen = xend - namestr;
1960             x = xend;
1961         }
1962
1963         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1964         otherhv = (HV*)SvRV(*othersvp);
1965         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1966         otherbits = (STRLEN)SvUV(*otherbitssvp);
1967         if (bits < otherbits)
1968             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1969
1970         /* The "other" swatch must be destroyed after. */
1971         other = swash_get(*othersvp, start, span);
1972         o = (U8*)SvPV(other, olen);
1973
1974         if (!olen)
1975             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1976
1977         s = (U8*)SvPV(swatch, slen);
1978         if (bits == 1 && otherbits == 1) {
1979             if (slen != olen)
1980                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
1981
1982             switch (opc) {
1983             case '+':
1984                 while (slen--)
1985                     *s++ |= *o++;
1986                 break;
1987             case '!':
1988                 while (slen--)
1989                     *s++ |= ~*o++;
1990                 break;
1991             case '-':
1992                 while (slen--)
1993                     *s++ &= ~*o++;
1994                 break;
1995             case '&':
1996                 while (slen--)
1997                     *s++ &= *o++;
1998                 break;
1999             default:
2000                 break;
2001             }
2002         }
2003         else {
2004             STRLEN otheroctets = otherbits >> 3;
2005             STRLEN offset = 0;
2006             U8* const send = s + slen;
2007
2008             while (s < send) {
2009                 UV otherval = 0;
2010
2011                 if (otherbits == 1) {
2012                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2013                     ++offset;
2014                 }
2015                 else {
2016                     STRLEN vlen = otheroctets;
2017                     otherval = *o++;
2018                     while (--vlen) {
2019                         otherval <<= 8;
2020                         otherval |= *o++;
2021                     }
2022                 }
2023
2024                 if (opc == '+' && otherval)
2025                     NOOP;   /* replace with otherval */
2026                 else if (opc == '!' && !otherval)
2027                     otherval = 1;
2028                 else if (opc == '-' && otherval)
2029                     otherval = 0;
2030                 else if (opc == '&' && !otherval)
2031                     otherval = 0;
2032                 else {
2033                     s += octets; /* no replacement */
2034                     continue;
2035                 }
2036
2037                 if (bits == 8)
2038                     *s++ = (U8)( otherval & 0xff);
2039                 else if (bits == 16) {
2040                     *s++ = (U8)((otherval >>  8) & 0xff);
2041                     *s++ = (U8)( otherval        & 0xff);
2042                 }
2043                 else if (bits == 32) {
2044                     *s++ = (U8)((otherval >> 24) & 0xff);
2045                     *s++ = (U8)((otherval >> 16) & 0xff);
2046                     *s++ = (U8)((otherval >>  8) & 0xff);
2047                     *s++ = (U8)( otherval        & 0xff);
2048                 }
2049             }
2050         }
2051         sv_free(other); /* through with it! */
2052     } /* while */
2053     return swatch;
2054 }
2055
2056 /*
2057 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2058
2059 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2060 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2061 bytes available. The return value is the pointer to the byte after the
2062 end of the new character. In other words,
2063
2064     d = uvchr_to_utf8(d, uv);
2065
2066 is the recommended wide native character-aware way of saying
2067
2068     *(d++) = uv;
2069
2070 =cut
2071 */
2072
2073 /* On ASCII machines this is normally a macro but we want a
2074    real function in case XS code wants it
2075 */
2076 U8 *
2077 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2078 {
2079     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2080 }
2081
2082 U8 *
2083 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2084 {
2085     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2086 }
2087
2088 /*
2089 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2090 flags
2091
2092 Returns the native character value of the first character in the string
2093 C<s>
2094 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2095 length, in bytes, of that character.
2096
2097 Allows length and flags to be passed to low level routine.
2098
2099 =cut
2100 */
2101 /* On ASCII machines this is normally a macro but we want
2102    a real function in case XS code wants it
2103 */
2104 UV
2105 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2106 U32 flags)
2107 {
2108     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2109     return UNI_TO_NATIVE(uv);
2110 }
2111
2112 /*
2113 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2114
2115 Build to the scalar dsv a displayable version of the string spv,
2116 length len, the displayable version being at most pvlim bytes long
2117 (if longer, the rest is truncated and "..." will be appended).
2118
2119 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2120 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2121 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2122 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2123 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2124 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2125
2126 The pointer to the PV of the dsv is returned.
2127
2128 =cut */
2129 char *
2130 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2131 {
2132     int truncated = 0;
2133     const char *s, *e;
2134
2135     sv_setpvn(dsv, "", 0);
2136     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2137          UV u;
2138           /* This serves double duty as a flag and a character to print after
2139              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2140           */
2141          char ok = 0;
2142
2143          if (pvlim && SvCUR(dsv) >= pvlim) {
2144               truncated++;
2145               break;
2146          }
2147          u = utf8_to_uvchr((U8*)s, 0);
2148          if (u < 256) {
2149              const unsigned char c = (unsigned char)u & 0xFF;
2150              if (flags & UNI_DISPLAY_BACKSLASH) {
2151                  switch (c) {
2152                  case '\n':
2153                      ok = 'n'; break;
2154                  case '\r':
2155                      ok = 'r'; break;
2156                  case '\t':
2157                      ok = 't'; break;
2158                  case '\f':
2159                      ok = 'f'; break;
2160                  case '\a':
2161                      ok = 'a'; break;
2162                  case '\\':
2163                      ok = '\\'; break;
2164                  default: break;
2165                  }
2166                  if (ok) {
2167                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2168                  }
2169              }
2170              /* isPRINT() is the locale-blind version. */
2171              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2172                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2173                  ok = 1;
2174              }
2175          }
2176          if (!ok)
2177              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2178     }
2179     if (truncated)
2180          sv_catpvs(dsv, "...");
2181
2182     return SvPVX(dsv);
2183 }
2184
2185 /*
2186 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2187
2188 Build to the scalar dsv a displayable version of the scalar sv,
2189 the displayable version being at most pvlim bytes long
2190 (if longer, the rest is truncated and "..." will be appended).
2191
2192 The flags argument is as in pv_uni_display().
2193
2194 The pointer to the PV of the dsv is returned.
2195
2196 =cut
2197 */
2198 char *
2199 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2200 {
2201      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2202                                 SvCUR(ssv), pvlim, flags);
2203 }
2204
2205 /*
2206 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2207
2208 Return true if the strings s1 and s2 differ case-insensitively, false
2209 if not (if they are equal case-insensitively).  If u1 is true, the
2210 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2211 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2212 are false, the respective string is assumed to be in native 8-bit
2213 encoding.
2214
2215 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2216 in there (they will point at the beginning of the I<next> character).
2217 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2218 pointers beyond which scanning will not continue under any
2219 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2220 s2+l2 will be used as goal end pointers that will also stop the scan,
2221 and which qualify towards defining a successful match: all the scans
2222 that define an explicit length must reach their goal pointers for
2223 a match to succeed).
2224
2225 For case-insensitiveness, the "casefolding" of Unicode is used
2226 instead of upper/lowercasing both the characters, see
2227 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2228
2229 =cut */
2230 I32
2231 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2232 {
2233      dVAR;
2234      register const U8 *p1  = (const U8*)s1;
2235      register const U8 *p2  = (const U8*)s2;
2236      register const U8 *f1 = NULL;
2237      register const U8 *f2 = NULL;
2238      register U8 *e1 = NULL;
2239      register U8 *q1 = NULL;
2240      register U8 *e2 = NULL;
2241      register U8 *q2 = NULL;
2242      STRLEN n1 = 0, n2 = 0;
2243      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2244      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2245      U8 natbuf[1+1];
2246      STRLEN foldlen1, foldlen2;
2247      bool match;
2248
2249      if (pe1)
2250           e1 = *(U8**)pe1;
2251      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2252           f1 = (const U8*)s1 + l1;
2253      if (pe2)
2254           e2 = *(U8**)pe2;
2255      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2256           f2 = (const U8*)s2 + l2;
2257
2258      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2259           return 1; /* mismatch; possible infinite loop or false positive */
2260
2261      if (!u1 || !u2)
2262           natbuf[1] = 0; /* Need to terminate the buffer. */
2263
2264      while ((e1 == 0 || p1 < e1) &&
2265             (f1 == 0 || p1 < f1) &&
2266             (e2 == 0 || p2 < e2) &&
2267             (f2 == 0 || p2 < f2)) {
2268           if (n1 == 0) {
2269                if (u1)
2270                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2271                else {
2272                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2273                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2274                }
2275                q1 = foldbuf1;
2276                n1 = foldlen1;
2277           }
2278           if (n2 == 0) {
2279                if (u2)
2280                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2281                else {
2282                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2283                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2284                }
2285                q2 = foldbuf2;
2286                n2 = foldlen2;
2287           }
2288           while (n1 && n2) {
2289                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2290                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2291                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2292                    return 1; /* mismatch */
2293                n1 -= UTF8SKIP(q1);
2294                q1 += UTF8SKIP(q1);
2295                n2 -= UTF8SKIP(q2);
2296                q2 += UTF8SKIP(q2);
2297           }
2298           if (n1 == 0)
2299                p1 += u1 ? UTF8SKIP(p1) : 1;
2300           if (n2 == 0)
2301                p2 += u2 ? UTF8SKIP(p2) : 1;
2302
2303      }
2304
2305      /* A match is defined by all the scans that specified
2306       * an explicit length reaching their final goals. */
2307      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2308
2309      if (match) {
2310           if (pe1)
2311                *pe1 = (char*)p1;
2312           if (pe2)
2313                *pe2 = (char*)p2;
2314      }
2315
2316      return match ? 0 : 1; /* 0 match, 1 mismatch */
2317 }
2318
2319 /*
2320  * Local variables:
2321  * c-indentation-style: bsd
2322  * c-basic-offset: 4
2323  * indent-tabs-mode: t
2324  * End:
2325  *
2326  * ex: set ts=8 sts=4 sw=4 noet:
2327  */