utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 /*
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 This is the "slow" version as opposed to the "fast" version which is
 178 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 179 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 181 you should use the _slow().  In practice this means that the _slow()
 182 will be used very rarely, since the maximum Unicode code point (as of
 183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 185 five bytes or more.
 186
 187 =cut */
 188 STATIC STRLEN
 189 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 190 {
 191     U8 u = *s;
 192     STRLEN slen;
 193     UV uv, ouv;
 194
 195     if (UTF8_IS_INVARIANT(u))
 196         return 1;
 197
 198     if (!UTF8_IS_START(u))
 199         return 0;
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206 #ifdef EBCDIC
 207     u = NATIVE_TO_UTF(u);
 208 #endif
 209     u &= UTF_START_MASK(len);
 210     uv  = u;
 211     ouv = uv;
 212     while (slen--) {
 213         if (!UTF8_IS_CONTINUATION(*s))
 214             return 0;
 215         uv = UTF8_ACCUMULATE(uv, *s);
 216         if (uv < ouv)
 217             return 0;
 218         ouv = uv;
 219         s++;
 220     }
 221
 222     if ((STRLEN)UNISKIP(uv) < len)
 223         return 0;
 224
 225     return len;
 226 }
 227
 228 /*
 229 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 230
 231 Tests if some arbitrary number of bytes begins in a valid UTF-8
 232 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 233 UTF-8 character.  The actual number of bytes in the UTF-8 character
 234 will be returned if it is valid, otherwise 0.
 235
 236 =cut */
 237 STRLEN
 238 Perl_is_utf8_char(pTHX_ const U8 *s)
 239 {
 240     const STRLEN len = UTF8SKIP(s);
 241     PERL_UNUSED_CONTEXT;
 242 #ifdef IS_UTF8_CHAR
 243     if (IS_UTF8_CHAR_FAST(len))
 244         return IS_UTF8_CHAR(s, len) ? len : 0;
 245 #endif /* #ifdef IS_UTF8_CHAR */
 246     return is_utf8_char_slow(s, len);
 247 }
 248
 249 /*
 250 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 251
 252 Returns true if first C<len> bytes of the given string form a valid
 253 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 254 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 255 because a valid ASCII string is a valid UTF-8 string.
 256
 257 See also is_utf8_string_loclen() and is_utf8_string_loc().
 258
 259 =cut
 260 */
 261
 262 bool
 263 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 264 {
 265     const U8* const send = s + (len ? len : strlen((const char *)s));
 266     const U8* x = s;
 267
 268     PERL_UNUSED_CONTEXT;
 269
 270     while (x < send) {
 271         STRLEN c;
 272          /* Inline the easy bits of is_utf8_char() here for speed... */
 273          if (UTF8_IS_INVARIANT(*x))
 274               c = 1;
 275          else if (!UTF8_IS_START(*x))
 276              goto out;
 277          else {
 278               /* ... and call is_utf8_char() only if really needed. */
 279 #ifdef IS_UTF8_CHAR
 280              c = UTF8SKIP(x);
 281              if (IS_UTF8_CHAR_FAST(c)) {
 282                  if (!IS_UTF8_CHAR(x, c))
 283                      c = 0;
 284              }
 285              else
 286                 c = is_utf8_char_slow(x, c);
 287 #else
 288              c = is_utf8_char(x);
 289 #endif /* #ifdef IS_UTF8_CHAR */
 290               if (!c)
 291                   goto out;
 292          }
 293         x += c;
 294     }
 295
 296  out:
 297     if (x != send)
 298         return FALSE;
 299
 300     return TRUE;
 301 }
 302
 303 /*
 304 Implemented as a macro in utf8.h
 305
 306 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 307
 308 Like is_utf8_string() but stores the location of the failure (in the
 309 case of "utf8ness failure") or the location s+len (in the case of
 310 "utf8ness success") in the C<ep>.
 311
 312 See also is_utf8_string_loclen() and is_utf8_string().
 313
 314 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 315
 316 Like is_utf8_string() but stores the location of the failure (in the
 317 case of "utf8ness failure") or the location s+len (in the case of
 318 "utf8ness success") in the C<ep>, and the number of UTF-8
 319 encoded characters in the C<el>.
 320
 321 See also is_utf8_string_loc() and is_utf8_string().
 322
 323 =cut
 324 */
 325
 326 bool
 327 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 328 {
 329     const U8* const send = s + (len ? len : strlen((const char *)s));
 330     const U8* x = s;
 331     STRLEN c;
 332     STRLEN outlen = 0;
 333     PERL_UNUSED_CONTEXT;
 334
 335     while (x < send) {
 336          /* Inline the easy bits of is_utf8_char() here for speed... */
 337          if (UTF8_IS_INVARIANT(*x))
 338              c = 1;
 339          else if (!UTF8_IS_START(*x))
 340              goto out;
 341          else {
 342              /* ... and call is_utf8_char() only if really needed. */
 343 #ifdef IS_UTF8_CHAR
 344              c = UTF8SKIP(x);
 345              if (IS_UTF8_CHAR_FAST(c)) {
 346                  if (!IS_UTF8_CHAR(x, c))
 347                      c = 0;
 348              } else
 349                  c = is_utf8_char_slow(x, c);
 350 #else
 351              c = is_utf8_char(x);
 352 #endif /* #ifdef IS_UTF8_CHAR */
 353              if (!c)
 354                  goto out;
 355          }
 356          x += c;
 357          outlen++;
 358     }
 359
 360  out:
 361     if (el)
 362         *el = outlen;
 363
 364     if (ep)
 365         *ep = x;
 366     return (x == send);
 367 }
 368
 369 /*
 370
 371 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 372
 373 Bottom level UTF-8 decode routine.
 374 Returns the unicode code point value of the first character in the string C<s>
 375 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 376 C<retlen> will be set to the length, in bytes, of that character.
 377
 378 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 379 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 380 it is assumed that the caller will raise a warning, and this function
 381 will silently just set C<retlen> to C<-1> and return zero.  If the
 382 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 383 malformations will be given, C<retlen> will be set to the expected
 384 length of the UTF-8 character in bytes, and zero will be returned.
 385
 386 The C<flags> can also contain various flags to allow deviations from
 387 the strict UTF-8 encoding (see F<utf8.h>).
 388
 389 Most code should use utf8_to_uvchr() rather than call this directly.
 390
 391 =cut
 392 */
 393
 394 UV
 395 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 396 {
 397     dVAR;
 398     const U8 * const s0 = s;
 399     UV uv = *s, ouv = 0;
 400     STRLEN len = 1;
 401     const bool dowarn = ckWARN_d(WARN_UTF8);
 402     const UV startbyte = *s;
 403     STRLEN expectlen = 0;
 404     U32 warning = 0;
 405
 406 /* This list is a superset of the UTF8_ALLOW_XXX. */
 407
 408 #define UTF8_WARN_EMPTY                          1
 409 #define UTF8_WARN_CONTINUATION                   2
 410 #define UTF8_WARN_NON_CONTINUATION               3
 411 #define UTF8_WARN_FE_FF                          4
 412 #define UTF8_WARN_SHORT                          5
 413 #define UTF8_WARN_OVERFLOW                       6
 414 #define UTF8_WARN_SURROGATE                      7
 415 #define UTF8_WARN_LONG                           8
 416 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 417
 418     if (curlen == 0 &&
 419         !(flags & UTF8_ALLOW_EMPTY)) {
 420         warning = UTF8_WARN_EMPTY;
 421         goto malformed;
 422     }
 423
 424     if (UTF8_IS_INVARIANT(uv)) {
 425         if (retlen)
 426             *retlen = 1;
 427         return (UV) (NATIVE_TO_UTF(*s));
 428     }
 429
 430     if (UTF8_IS_CONTINUATION(uv) &&
 431         !(flags & UTF8_ALLOW_CONTINUATION)) {
 432         warning = UTF8_WARN_CONTINUATION;
 433         goto malformed;
 434     }
 435
 436     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 437         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 438         warning = UTF8_WARN_NON_CONTINUATION;
 439         goto malformed;
 440     }
 441
 442 #ifdef EBCDIC
 443     uv = NATIVE_TO_UTF(uv);
 444 #else
 445     if ((uv == 0xfe || uv == 0xff) &&
 446         !(flags & UTF8_ALLOW_FE_FF)) {
 447         warning = UTF8_WARN_FE_FF;
 448         goto malformed;
 449     }
 450 #endif
 451
 452     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 453     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 454     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 455     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 456 #ifdef EBCDIC
 457     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 458     else                        { len =  7; uv &= 0x01; }
 459 #else
 460     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 461     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 462     else                        { len = 13; uv = 0; } /* whoa! */
 463 #endif
 464
 465     if (retlen)
 466         *retlen = len;
 467
 468     expectlen = len;
 469
 470     if ((curlen < expectlen) &&
 471         !(flags & UTF8_ALLOW_SHORT)) {
 472         warning = UTF8_WARN_SHORT;
 473         goto malformed;
 474     }
 475
 476     len--;
 477     s++;
 478     ouv = uv;
 479
 480     while (len--) {
 481         if (!UTF8_IS_CONTINUATION(*s) &&
 482             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 483             s--;
 484             warning = UTF8_WARN_NON_CONTINUATION;
 485             goto malformed;
 486         }
 487         else
 488             uv = UTF8_ACCUMULATE(uv, *s);
 489         if (!(uv > ouv)) {
 490             /* These cannot be allowed. */
 491             if (uv == ouv) {
 492                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 493                     warning = UTF8_WARN_LONG;
 494                     goto malformed;
 495                 }
 496             }
 497             else { /* uv < ouv */
 498                 /* This cannot be allowed. */
 499                 warning = UTF8_WARN_OVERFLOW;
 500                 goto malformed;
 501             }
 502         }
 503         s++;
 504         ouv = uv;
 505     }
 506
 507     if (UNICODE_IS_SURROGATE(uv) &&
 508         !(flags & UTF8_ALLOW_SURROGATE)) {
 509         warning = UTF8_WARN_SURROGATE;
 510         goto malformed;
 511     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 512                !(flags & UTF8_ALLOW_LONG)) {
 513         warning = UTF8_WARN_LONG;
 514         goto malformed;
 515     } else if (UNICODE_IS_ILLEGAL(uv) &&
 516                !(flags & UTF8_ALLOW_FFFF)) {
 517         warning = UTF8_WARN_FFFF;
 518         goto malformed;
 519     }
 520
 521     return uv;
 522
 523 malformed:
 524
 525     if (flags & UTF8_CHECK_ONLY) {
 526         if (retlen)
 527             *retlen = ((STRLEN) -1);
 528         return 0;
 529     }
 530
 531     if (dowarn) {
 532         SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
 533
 534         switch (warning) {
 535         case 0: /* Intentionally empty. */ break;
 536         case UTF8_WARN_EMPTY:
 537             sv_catpvs(sv, "(empty string)");
 538             break;
 539         case UTF8_WARN_CONTINUATION:
 540             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 541             break;
 542         case UTF8_WARN_NON_CONTINUATION:
 543             if (s == s0)
 544                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 545                            (UV)s[1], startbyte);
 546             else {
 547                 const int len = (int)(s-s0);
 548                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 549                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 550             }
 551
 552             break;
 553         case UTF8_WARN_FE_FF:
 554             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 555             break;
 556         case UTF8_WARN_SHORT:
 557             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 558                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 559             expectlen = curlen;         /* distance for caller to skip */
 560             break;
 561         case UTF8_WARN_OVERFLOW:
 562             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 563                            ouv, *s, startbyte);
 564             break;
 565         case UTF8_WARN_SURROGATE:
 566             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 567             break;
 568         case UTF8_WARN_LONG:
 569             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 570                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 571             break;
 572         case UTF8_WARN_FFFF:
 573             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 574             break;
 575         default:
 576             sv_catpvs(sv, "(unknown reason)");
 577             break;
 578         }
 579
 580         if (warning) {
 581             const char * const s = SvPVX_const(sv);
 582
 583             if (PL_op)
 584                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 585                             "%s in %s", s,  OP_DESC(PL_op));
 586             else
 587                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 588         }
 589     }
 590
 591     if (retlen)
 592         *retlen = expectlen ? expectlen : len;
 593
 594     return 0;
 595 }
 596
 597 /*
 598 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 599
 600 Returns the native character value of the first character in the string C<s>
 601 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 602 length, in bytes, of that character.
 603
 604 If C<s> does not point to a well-formed UTF-8 character, zero is
 605 returned and retlen is set, if possible, to -1.
 606
 607 =cut
 608 */
 609
 610 UV
 611 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 612 {
 613     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 614                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 615 }
 616
 617 /*
 618 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 619
 620 Returns the Unicode code point of the first character in the string C<s>
 621 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 622 length, in bytes, of that character.
 623
 624 This function should only be used when returned UV is considered
 625 an index into the Unicode semantic tables (e.g. swashes).
 626
 627 If C<s> does not point to a well-formed UTF-8 character, zero is
 628 returned and retlen is set, if possible, to -1.
 629
 630 =cut
 631 */
 632
 633 UV
 634 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 635 {
 636     /* Call the low level routine asking for checks */
 637     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 638                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 639 }
 640
 641 /*
 642 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 643
 644 Return the length of the UTF-8 char encoded string C<s> in characters.
 645 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 646 up past C<e>, croaks.
 647
 648 =cut
 649 */
 650
 651 STRLEN
 652 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 653 {
 654     dVAR;
 655     STRLEN len = 0;
 656     U8 t = 0;
 657
 658     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 659      * the bitops (especially ~) can create illegal UTF-8.
 660      * In other words: in Perl UTF-8 is not just for Unicode. */
 661
 662     if (e < s)
 663         goto warn_and_return;
 664     while (s < e) {
 665         t = UTF8SKIP(s);
 666         if (e - s < t) {
 667             warn_and_return:
 668             if (ckWARN_d(WARN_UTF8)) {
 669                 if (PL_op)
 670                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 671                             "%s in %s", unees, OP_DESC(PL_op));
 672                 else
 673                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 674             }
 675             return len;
 676         }
 677         s += t;
 678         len++;
 679     }
 680
 681     return len;
 682 }
 683
 684 /*
 685 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 686
 687 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 688 and C<b>.
 689
 690 WARNING: use only if you *know* that the pointers point inside the
 691 same UTF-8 buffer.
 692
 693 =cut
 694 */
 695
 696 IV
 697 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 698 {
 699     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 700 }
 701
 702 /*
 703 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 704
 705 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 706 forward or backward.
 707
 708 WARNING: do not use the following unless you *know* C<off> is within
 709 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 710 on the first byte of character or just after the last byte of a character.
 711
 712 =cut
 713 */
 714
 715 U8 *
 716 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 717 {
 718     PERL_UNUSED_CONTEXT;
 719     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 720      * the bitops (especially ~) can create illegal UTF-8.
 721      * In other words: in Perl UTF-8 is not just for Unicode. */
 722
 723     if (off >= 0) {
 724         while (off--)
 725             s += UTF8SKIP(s);
 726     }
 727     else {
 728         while (off++) {
 729             s--;
 730             while (UTF8_IS_CONTINUATION(*s))
 731                 s--;
 732         }
 733     }
 734     return (U8 *)s;
 735 }
 736
 737 /*
 738 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 739
 740 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 741 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 742 updates len to contain the new length.
 743 Returns zero on failure, setting C<len> to -1.
 744
 745 If you need a copy of the string, see C<bytes_from_utf8>.
 746
 747 =cut
 748 */
 749
 750 U8 *
 751 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 752 {
 753     U8 * const save = s;
 754     U8 * const send = s + *len;
 755     U8 *d;
 756
 757     /* ensure valid UTF-8 and chars < 256 before updating string */
 758     while (s < send) {
 759         U8 c = *s++;
 760
 761         if (!UTF8_IS_INVARIANT(c) &&
 762             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 763              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 764             *len = ((STRLEN) -1);
 765             return 0;
 766         }
 767     }
 768
 769     d = s = save;
 770     while (s < send) {
 771         STRLEN ulen;
 772         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 773         s += ulen;
 774     }
 775     *d = '\0';
 776     *len = d - save;
 777     return save;
 778 }
 779
 780 /*
 781 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 782
 783 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 784 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 785 the newly-created string, and updates C<len> to contain the new
 786 length.  Returns the original string if no conversion occurs, C<len>
 787 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 788 0 if C<s> is converted or contains all 7bit characters.
 789
 790 =cut
 791 */
 792
 793 U8 *
 794 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 795 {
 796     U8 *d;
 797     const U8 *start = s;
 798     const U8 *send;
 799     I32 count = 0;
 800
 801     PERL_UNUSED_CONTEXT;
 802     if (!*is_utf8)
 803         return (U8 *)start;
 804
 805     /* ensure valid UTF-8 and chars < 256 before converting string */
 806     for (send = s + *len; s < send;) {
 807         U8 c = *s++;
 808         if (!UTF8_IS_INVARIANT(c)) {
 809             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 810                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 811                 count++;
 812             else
 813                 return (U8 *)start;
 814         }
 815     }
 816
 817     *is_utf8 = FALSE;
 818
 819     Newx(d, (*len) - count + 1, U8);
 820     s = start; start = d;
 821     while (s < send) {
 822         U8 c = *s++;
 823         if (!UTF8_IS_INVARIANT(c)) {
 824             /* Then it is two-byte encoded */
 825             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 826             c = ASCII_TO_NATIVE(c);
 827         }
 828         *d++ = c;
 829     }
 830     *d = '\0';
 831     *len = d - start;
 832     return (U8 *)start;
 833 }
 834
 835 /*
 836 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 837
 838 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 839 Returns a pointer to the newly-created string, and sets C<len> to
 840 reflect the new length.
 841
 842 If you want to convert to UTF-8 from other encodings than ASCII,
 843 see sv_recode_to_utf8().
 844
 845 =cut
 846 */
 847
 848 U8*
 849 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 850 {
 851     const U8 * const send = s + (*len);
 852     U8 *d;
 853     U8 *dst;
 854     PERL_UNUSED_CONTEXT;
 855
 856     Newx(d, (*len) * 2 + 1, U8);
 857     dst = d;
 858
 859     while (s < send) {
 860         const UV uv = NATIVE_TO_ASCII(*s++);
 861         if (UNI_IS_INVARIANT(uv))
 862             *d++ = (U8)UTF_TO_NATIVE(uv);
 863         else {
 864             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 865             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 866         }
 867     }
 868     *d = '\0';
 869     *len = d-dst;
 870     return dst;
 871 }
 872
 873 /*
 874  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 875  *
 876  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 877  * We optimize for native, for obvious reasons. */
 878
 879 U8*
 880 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 881 {
 882     U8* pend;
 883     U8* dstart = d;
 884
 885     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 886          d[0] = 0;
 887          *newlen = 1;
 888          return d;
 889     }
 890
 891     if (bytelen & 1)
 892         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 893
 894     pend = p + bytelen;
 895
 896     while (p < pend) {
 897         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 898         p += 2;
 899         if (uv < 0x80) {
 900 #ifdef EBCDIC
 901             *d++ = UNI_TO_NATIVE(uv);
 902 #else
 903             *d++ = (U8)uv;
 904 #endif
 905             continue;
 906         }
 907         if (uv < 0x800) {
 908             *d++ = (U8)(( uv >>  6)         | 0xc0);
 909             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 910             continue;
 911         }
 912         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 913             UV low = (p[0] << 8) + p[1];
 914             p += 2;
 915             if (low < 0xdc00 || low >= 0xdfff)
 916                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 917             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 918         }
 919         if (uv < 0x10000) {
 920             *d++ = (U8)(( uv >> 12)         | 0xe0);
 921             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 922             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 923             continue;
 924         }
 925         else {
 926             *d++ = (U8)(( uv >> 18)         | 0xf0);
 927             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 928             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 929             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 930             continue;
 931         }
 932     }
 933     *newlen = d - dstart;
 934     return d;
 935 }
 936
 937 /* Note: this one is slightly destructive of the source. */
 938
 939 U8*
 940 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 941 {
 942     U8* s = (U8*)p;
 943     U8* const send = s + bytelen;
 944     while (s < send) {
 945         const U8 tmp = s[0];
 946         s[0] = s[1];
 947         s[1] = tmp;
 948         s += 2;
 949     }
 950     return utf16_to_utf8(p, d, bytelen, newlen);
 951 }
 952
 953 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 954
 955 bool
 956 Perl_is_uni_alnum(pTHX_ UV c)
 957 {
 958     U8 tmpbuf[UTF8_MAXBYTES+1];
 959     uvchr_to_utf8(tmpbuf, c);
 960     return is_utf8_alnum(tmpbuf);
 961 }
 962
 963 bool
 964 Perl_is_uni_alnumc(pTHX_ UV c)
 965 {
 966     U8 tmpbuf[UTF8_MAXBYTES+1];
 967     uvchr_to_utf8(tmpbuf, c);
 968     return is_utf8_alnumc(tmpbuf);
 969 }
 970
 971 bool
 972 Perl_is_uni_idfirst(pTHX_ UV c)
 973 {
 974     U8 tmpbuf[UTF8_MAXBYTES+1];
 975     uvchr_to_utf8(tmpbuf, c);
 976     return is_utf8_idfirst(tmpbuf);
 977 }
 978
 979 bool
 980 Perl_is_uni_alpha(pTHX_ UV c)
 981 {
 982     U8 tmpbuf[UTF8_MAXBYTES+1];
 983     uvchr_to_utf8(tmpbuf, c);
 984     return is_utf8_alpha(tmpbuf);
 985 }
 986
 987 bool
 988 Perl_is_uni_ascii(pTHX_ UV c)
 989 {
 990     U8 tmpbuf[UTF8_MAXBYTES+1];
 991     uvchr_to_utf8(tmpbuf, c);
 992     return is_utf8_ascii(tmpbuf);
 993 }
 994
 995 bool
 996 Perl_is_uni_space(pTHX_ UV c)
 997 {
 998     U8 tmpbuf[UTF8_MAXBYTES+1];
 999     uvchr_to_utf8(tmpbuf, c);
1000     return is_utf8_space(tmpbuf);
1001 }
1002
1003 bool
1004 Perl_is_uni_digit(pTHX_ UV c)
1005 {
1006     U8 tmpbuf[UTF8_MAXBYTES+1];
1007     uvchr_to_utf8(tmpbuf, c);
1008     return is_utf8_digit(tmpbuf);
1009 }
1010
1011 bool
1012 Perl_is_uni_upper(pTHX_ UV c)
1013 {
1014     U8 tmpbuf[UTF8_MAXBYTES+1];
1015     uvchr_to_utf8(tmpbuf, c);
1016     return is_utf8_upper(tmpbuf);
1017 }
1018
1019 bool
1020 Perl_is_uni_lower(pTHX_ UV c)
1021 {
1022     U8 tmpbuf[UTF8_MAXBYTES+1];
1023     uvchr_to_utf8(tmpbuf, c);
1024     return is_utf8_lower(tmpbuf);
1025 }
1026
1027 bool
1028 Perl_is_uni_cntrl(pTHX_ UV c)
1029 {
1030     U8 tmpbuf[UTF8_MAXBYTES+1];
1031     uvchr_to_utf8(tmpbuf, c);
1032     return is_utf8_cntrl(tmpbuf);
1033 }
1034
1035 bool
1036 Perl_is_uni_graph(pTHX_ UV c)
1037 {
1038     U8 tmpbuf[UTF8_MAXBYTES+1];
1039     uvchr_to_utf8(tmpbuf, c);
1040     return is_utf8_graph(tmpbuf);
1041 }
1042
1043 bool
1044 Perl_is_uni_print(pTHX_ UV c)
1045 {
1046     U8 tmpbuf[UTF8_MAXBYTES+1];
1047     uvchr_to_utf8(tmpbuf, c);
1048     return is_utf8_print(tmpbuf);
1049 }
1050
1051 bool
1052 Perl_is_uni_punct(pTHX_ UV c)
1053 {
1054     U8 tmpbuf[UTF8_MAXBYTES+1];
1055     uvchr_to_utf8(tmpbuf, c);
1056     return is_utf8_punct(tmpbuf);
1057 }
1058
1059 bool
1060 Perl_is_uni_xdigit(pTHX_ UV c)
1061 {
1062     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1063     uvchr_to_utf8(tmpbuf, c);
1064     return is_utf8_xdigit(tmpbuf);
1065 }
1066
1067 UV
1068 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1069 {
1070     uvchr_to_utf8(p, c);
1071     return to_utf8_upper(p, p, lenp);
1072 }
1073
1074 UV
1075 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1076 {
1077     uvchr_to_utf8(p, c);
1078     return to_utf8_title(p, p, lenp);
1079 }
1080
1081 UV
1082 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1083 {
1084     uvchr_to_utf8(p, c);
1085     return to_utf8_lower(p, p, lenp);
1086 }
1087
1088 UV
1089 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1090 {
1091     uvchr_to_utf8(p, c);
1092     return to_utf8_fold(p, p, lenp);
1093 }
1094
1095 /* for now these all assume no locale info available for Unicode > 255 */
1096
1097 bool
1098 Perl_is_uni_alnum_lc(pTHX_ UV c)
1099 {
1100     return is_uni_alnum(c);     /* XXX no locale support yet */
1101 }
1102
1103 bool
1104 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1105 {
1106     return is_uni_alnumc(c);    /* XXX no locale support yet */
1107 }
1108
1109 bool
1110 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1111 {
1112     return is_uni_idfirst(c);   /* XXX no locale support yet */
1113 }
1114
1115 bool
1116 Perl_is_uni_alpha_lc(pTHX_ UV c)
1117 {
1118     return is_uni_alpha(c);     /* XXX no locale support yet */
1119 }
1120
1121 bool
1122 Perl_is_uni_ascii_lc(pTHX_ UV c)
1123 {
1124     return is_uni_ascii(c);     /* XXX no locale support yet */
1125 }
1126
1127 bool
1128 Perl_is_uni_space_lc(pTHX_ UV c)
1129 {
1130     return is_uni_space(c);     /* XXX no locale support yet */
1131 }
1132
1133 bool
1134 Perl_is_uni_digit_lc(pTHX_ UV c)
1135 {
1136     return is_uni_digit(c);     /* XXX no locale support yet */
1137 }
1138
1139 bool
1140 Perl_is_uni_upper_lc(pTHX_ UV c)
1141 {
1142     return is_uni_upper(c);     /* XXX no locale support yet */
1143 }
1144
1145 bool
1146 Perl_is_uni_lower_lc(pTHX_ UV c)
1147 {
1148     return is_uni_lower(c);     /* XXX no locale support yet */
1149 }
1150
1151 bool
1152 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1153 {
1154     return is_uni_cntrl(c);     /* XXX no locale support yet */
1155 }
1156
1157 bool
1158 Perl_is_uni_graph_lc(pTHX_ UV c)
1159 {
1160     return is_uni_graph(c);     /* XXX no locale support yet */
1161 }
1162
1163 bool
1164 Perl_is_uni_print_lc(pTHX_ UV c)
1165 {
1166     return is_uni_print(c);     /* XXX no locale support yet */
1167 }
1168
1169 bool
1170 Perl_is_uni_punct_lc(pTHX_ UV c)
1171 {
1172     return is_uni_punct(c);     /* XXX no locale support yet */
1173 }
1174
1175 bool
1176 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1177 {
1178     return is_uni_xdigit(c);    /* XXX no locale support yet */
1179 }
1180
1181 U32
1182 Perl_to_uni_upper_lc(pTHX_ U32 c)
1183 {
1184     /* XXX returns only the first character -- do not use XXX */
1185     /* XXX no locale support yet */
1186     STRLEN len;
1187     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1188     return (U32)to_uni_upper(c, tmpbuf, &len);
1189 }
1190
1191 U32
1192 Perl_to_uni_title_lc(pTHX_ U32 c)
1193 {
1194     /* XXX returns only the first character XXX -- do not use XXX */
1195     /* XXX no locale support yet */
1196     STRLEN len;
1197     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1198     return (U32)to_uni_title(c, tmpbuf, &len);
1199 }
1200
1201 U32
1202 Perl_to_uni_lower_lc(pTHX_ U32 c)
1203 {
1204     /* XXX returns only the first character -- do not use XXX */
1205     /* XXX no locale support yet */
1206     STRLEN len;
1207     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1208     return (U32)to_uni_lower(c, tmpbuf, &len);
1209 }
1210
1211 static bool
1212 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1213                  const char *const swashname)
1214 {
1215     dVAR;
1216     if (!is_utf8_char(p))
1217         return FALSE;
1218     if (!*swash)
1219         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1220     return swash_fetch(*swash, p, TRUE) != 0;
1221 }
1222
1223 bool
1224 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1225 {
1226     dVAR;
1227     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1228      * descendant of isalnum(3), in other words, it doesn't
1229      * contain the '_'. --jhi */
1230     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1231 }
1232
1233 bool
1234 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1235 {
1236     dVAR;
1237     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1238 }
1239
1240 bool
1241 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1242 {
1243     dVAR;
1244     if (*p == '_')
1245         return TRUE;
1246     /* is_utf8_idstart would be more logical. */
1247     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1248 }
1249
1250 bool
1251 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1252 {
1253     dVAR;
1254     if (*p == '_')
1255         return TRUE;
1256     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1257 }
1258
1259 bool
1260 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1261 {
1262     dVAR;
1263     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1264 }
1265
1266 bool
1267 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1268 {
1269     dVAR;
1270     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1271 }
1272
1273 bool
1274 Perl_is_utf8_space(pTHX_ const U8 *p)
1275 {
1276     dVAR;
1277     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1278 }
1279
1280 bool
1281 Perl_is_utf8_digit(pTHX_ const U8 *p)
1282 {
1283     dVAR;
1284     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1285 }
1286
1287 bool
1288 Perl_is_utf8_upper(pTHX_ const U8 *p)
1289 {
1290     dVAR;
1291     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1292 }
1293
1294 bool
1295 Perl_is_utf8_lower(pTHX_ const U8 *p)
1296 {
1297     dVAR;
1298     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1299 }
1300
1301 bool
1302 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1303 {
1304     dVAR;
1305     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1306 }
1307
1308 bool
1309 Perl_is_utf8_graph(pTHX_ const U8 *p)
1310 {
1311     dVAR;
1312     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1313 }
1314
1315 bool
1316 Perl_is_utf8_print(pTHX_ const U8 *p)
1317 {
1318     dVAR;
1319     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1320 }
1321
1322 bool
1323 Perl_is_utf8_punct(pTHX_ const U8 *p)
1324 {
1325     dVAR;
1326     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1327 }
1328
1329 bool
1330 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1331 {
1332     dVAR;
1333     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1334 }
1335
1336 bool
1337 Perl_is_utf8_mark(pTHX_ const U8 *p)
1338 {
1339     dVAR;
1340     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1341 }
1342
1343 /*
1344 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1345
1346 The "p" contains the pointer to the UTF-8 string encoding
1347 the character that is being converted.
1348
1349 The "ustrp" is a pointer to the character buffer to put the
1350 conversion result to.  The "lenp" is a pointer to the length
1351 of the result.
1352
1353 The "swashp" is a pointer to the swash to use.
1354
1355 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1356 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1357 but not always, a multicharacter mapping), is tried first.
1358
1359 The "special" is a string like "utf8::ToSpecLower", which means the
1360 hash %utf8::ToSpecLower.  The access to the hash is through
1361 Perl_to_utf8_case().
1362
1363 The "normal" is a string like "ToLower" which means the swash
1364 %utf8::ToLower.
1365
1366 =cut */
1367
1368 UV
1369 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1370                         SV **swashp, const char *normal, const char *special)
1371 {
1372     dVAR;
1373     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1374     STRLEN len = 0;
1375
1376     const UV uv0 = utf8_to_uvchr(p, NULL);
1377     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1378      * are necessary in EBCDIC, they are redundant no-ops
1379      * in ASCII-ish platforms, and hopefully optimized away. */
1380     const UV uv1 = NATIVE_TO_UNI(uv0);
1381     uvuni_to_utf8(tmpbuf, uv1);
1382
1383     if (!*swashp) /* load on-demand */
1384          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1385
1386     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1387     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1388          /* It might be "special" (sometimes, but not always,
1389           * a multicharacter mapping) */
1390          HV * const hv = get_hv(special, FALSE);
1391          SV **svp;
1392
1393          if (hv &&
1394              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1395              (*svp)) {
1396              const char *s;
1397
1398               s = SvPV_const(*svp, len);
1399               if (len == 1)
1400                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1401               else {
1402 #ifdef EBCDIC
1403                    /* If we have EBCDIC we need to remap the characters
1404                     * since any characters in the low 256 are Unicode
1405                     * code points, not EBCDIC. */
1406                    U8 *t = (U8*)s, *tend = t + len, *d;
1407
1408                    d = tmpbuf;
1409                    if (SvUTF8(*svp)) {
1410                         STRLEN tlen = 0;
1411
1412                         while (t < tend) {
1413                              const UV c = utf8_to_uvchr(t, &tlen);
1414                              if (tlen > 0) {
1415                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1416                                   t += tlen;
1417                              }
1418                              else
1419                                   break;
1420                         }
1421                    }
1422                    else {
1423                         while (t < tend) {
1424                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1425                              t++;
1426                         }
1427                    }
1428                    len = d - tmpbuf;
1429                    Copy(tmpbuf, ustrp, len, U8);
1430 #else
1431                    Copy(s, ustrp, len, U8);
1432 #endif
1433               }
1434          }
1435     }
1436
1437     if (!len && *swashp) {
1438         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1439
1440          if (uv2) {
1441               /* It was "normal" (a single character mapping). */
1442               const UV uv3 = UNI_TO_NATIVE(uv2);
1443               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1444          }
1445     }
1446
1447     if (!len) /* Neither: just copy. */
1448          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1449
1450     if (lenp)
1451          *lenp = len;
1452
1453     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1454 }
1455
1456 /*
1457 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1458
1459 Convert the UTF-8 encoded character at p to its uppercase version and
1460 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1461 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1462 the uppercase version may be longer than the original character.
1463
1464 The first character of the uppercased version is returned
1465 (but note, as explained above, that there may be more.)
1466
1467 =cut */
1468
1469 UV
1470 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1471 {
1472     dVAR;
1473     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1474                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1475 }
1476
1477 /*
1478 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1479
1480 Convert the UTF-8 encoded character at p to its titlecase version and
1481 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1482 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1483 titlecase version may be longer than the original character.
1484
1485 The first character of the titlecased version is returned
1486 (but note, as explained above, that there may be more.)
1487
1488 =cut */
1489
1490 UV
1491 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1492 {
1493     dVAR;
1494     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1495                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1496 }
1497
1498 /*
1499 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1500
1501 Convert the UTF-8 encoded character at p to its lowercase version and
1502 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1503 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1504 lowercase version may be longer than the original character.
1505
1506 The first character of the lowercased version is returned
1507 (but note, as explained above, that there may be more.)
1508
1509 =cut */
1510
1511 UV
1512 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1513 {
1514     dVAR;
1515     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1516                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1517 }
1518
1519 /*
1520 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1521
1522 Convert the UTF-8 encoded character at p to its foldcase version and
1523 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1524 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1525 foldcase version may be longer than the original character (up to
1526 three characters).
1527
1528 The first character of the foldcased version is returned
1529 (but note, as explained above, that there may be more.)
1530
1531 =cut */
1532
1533 UV
1534 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1535 {
1536     dVAR;
1537     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1538                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1539 }
1540
1541 /* Note:
1542  * A "swash" is a swatch hash.
1543  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1544  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1545  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1546  */
1547 SV*
1548 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1549 {
1550     dVAR;
1551     SV* retval;
1552     SV* const tokenbufsv = sv_newmortal();
1553     dSP;
1554     const size_t pkg_len = strlen(pkg);
1555     const size_t name_len = strlen(name);
1556     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1557     SV* errsv_save;
1558
1559     PUSHSTACKi(PERLSI_MAGIC);
1560     ENTER;
1561     SAVEI32(PL_hints);
1562     PL_hints = 0;
1563     save_re_context();
1564     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1565         ENTER;
1566         errsv_save = newSVsv(ERRSV);
1567         /* It is assumed that callers of this routine are not passing in any
1568            user derived data.  */
1569         /* Need to do this after save_re_context() as it will set PL_tainted to
1570            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1571            Even line to create errsv_save can turn on PL_tainted.  */
1572         SAVEBOOL(PL_tainted);
1573         PL_tainted = 0;
1574         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1575                          NULL);
1576         if (!SvTRUE(ERRSV))
1577             sv_setsv(ERRSV, errsv_save);
1578         SvREFCNT_dec(errsv_save);
1579         LEAVE;
1580     }
1581     SPAGAIN;
1582     PUSHMARK(SP);
1583     EXTEND(SP,5);
1584     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1585     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1586     PUSHs(listsv);
1587     PUSHs(sv_2mortal(newSViv(minbits)));
1588     PUSHs(sv_2mortal(newSViv(none)));
1589     PUTBACK;
1590     if (IN_PERL_COMPILETIME) {
1591         /* XXX ought to be handled by lex_start */
1592         SAVEI32(PL_in_my);
1593         PL_in_my = 0;
1594         sv_setpv(tokenbufsv, PL_tokenbuf);
1595     }
1596     errsv_save = newSVsv(ERRSV);
1597     if (call_method("SWASHNEW", G_SCALAR))
1598         retval = newSVsv(*PL_stack_sp--);
1599     else
1600         retval = &PL_sv_undef;
1601     if (!SvTRUE(ERRSV))
1602         sv_setsv(ERRSV, errsv_save);
1603     SvREFCNT_dec(errsv_save);
1604     LEAVE;
1605     POPSTACK;
1606     if (IN_PERL_COMPILETIME) {
1607         STRLEN len;
1608         const char* const pv = SvPV_const(tokenbufsv, len);
1609
1610         Copy(pv, PL_tokenbuf, len+1, char);
1611         CopHINTS_set(PL_curcop, PL_hints);
1612     }
1613     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1614         if (SvPOK(retval))
1615             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1616                        (void*)retval);
1617         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1618     }
1619     return retval;
1620 }
1621
1622
1623 /* This API is wrong for special case conversions since we may need to
1624  * return several Unicode characters for a single Unicode character
1625  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1626  * the lower-level routine, and it is similarly broken for returning
1627  * multiple values.  --jhi */
1628 /* Now SWASHGET is recasted into S_swash_get in this file. */
1629
1630 /* Note:
1631  * Returns the value of property/mapping C<swash> for the first character
1632  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1633  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1634  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1635  */
1636 UV
1637 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1638 {
1639     dVAR;
1640     HV* const hv = (HV*)SvRV(swash);
1641     U32 klen;
1642     U32 off;
1643     STRLEN slen;
1644     STRLEN needents;
1645     const U8 *tmps = NULL;
1646     U32 bit;
1647     SV *swatch;
1648     U8 tmputf8[2];
1649     const UV c = NATIVE_TO_ASCII(*ptr);
1650
1651     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1652         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1653         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1654         ptr = tmputf8;
1655     }
1656     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1657      * then the "swatch" is a vec() for al the chars which start
1658      * with 0xAA..0xYY
1659      * So the key in the hash (klen) is length of encoded char -1
1660      */
1661     klen = UTF8SKIP(ptr) - 1;
1662     off  = ptr[klen];
1663
1664     if (klen == 0) {
1665       /* If char in invariant then swatch is for all the invariant chars
1666        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1667        */
1668         needents = UTF_CONTINUATION_MARK;
1669         off      = NATIVE_TO_UTF(ptr[klen]);
1670     }
1671     else {
1672       /* If char is encoded then swatch is for the prefix */
1673         needents = (1 << UTF_ACCUMULATION_SHIFT);
1674         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1675     }
1676
1677     /*
1678      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1679      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1680      * it's nothing to sniff at.)  Pity we usually come through at least
1681      * two function calls to get here...
1682      *
1683      * NB: this code assumes that swatches are never modified, once generated!
1684      */
1685
1686     if (hv   == PL_last_swash_hv &&
1687         klen == PL_last_swash_klen &&
1688         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1689     {
1690         tmps = PL_last_swash_tmps;
1691         slen = PL_last_swash_slen;
1692     }
1693     else {
1694         /* Try our second-level swatch cache, kept in a hash. */
1695         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1696
1697         /* If not cached, generate it via swash_get */
1698         if (!svp || !SvPOK(*svp)
1699                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1700             /* We use utf8n_to_uvuni() as we want an index into
1701                Unicode tables, not a native character number.
1702              */
1703             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1704                                            ckWARN(WARN_UTF8) ?
1705                                            0 : UTF8_ALLOW_ANY);
1706             swatch = swash_get(swash,
1707                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1708                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1709                                 needents);
1710
1711             if (IN_PERL_COMPILETIME)
1712                 CopHINTS_set(PL_curcop, PL_hints);
1713
1714             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1715
1716             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1717                      || (slen << 3) < needents)
1718                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1719         }
1720
1721         PL_last_swash_hv = hv;
1722         PL_last_swash_klen = klen;
1723         /* FIXME change interpvar.h?  */
1724         PL_last_swash_tmps = (U8 *) tmps;
1725         PL_last_swash_slen = slen;
1726         if (klen)
1727             Copy(ptr, PL_last_swash_key, klen, U8);
1728     }
1729
1730     switch ((int)((slen << 3) / needents)) {
1731     case 1:
1732         bit = 1 << (off & 7);
1733         off >>= 3;
1734         return (tmps[off] & bit) != 0;
1735     case 8:
1736         return tmps[off];
1737     case 16:
1738         off <<= 1;
1739         return (tmps[off] << 8) + tmps[off + 1] ;
1740     case 32:
1741         off <<= 2;
1742         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1743     }
1744     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1745     NORETURN_FUNCTION_END;
1746 }
1747
1748 /* Note:
1749  * Returns a swatch (a bit vector string) for a code point sequence
1750  * that starts from the value C<start> and comprises the number C<span>.
1751  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1752  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1753  */
1754 STATIC SV*
1755 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1756 {
1757     SV *swatch;
1758     U8 *l, *lend, *x, *xend, *s;
1759     STRLEN lcur, xcur, scur;
1760
1761     HV* const hv = (HV*)SvRV(swash);
1762     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1763     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1764     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1765     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1766     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1767     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1768     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1769     const STRLEN bits  = SvUV(*bitssvp);
1770     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1771     const UV     none  = SvUV(*nonesvp);
1772     const UV     end   = start + span;
1773
1774     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1775         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1776                                                  (UV)bits);
1777     }
1778
1779     /* create and initialize $swatch */
1780     swatch = newSVpvs("");
1781     scur   = octets ? (span * octets) : (span + 7) / 8;
1782     SvGROW(swatch, scur + 1);
1783     s = (U8*)SvPVX(swatch);
1784     if (octets && none) {
1785         const U8* const e = s + scur;
1786         while (s < e) {
1787             if (bits == 8)
1788                 *s++ = (U8)(none & 0xff);
1789             else if (bits == 16) {
1790                 *s++ = (U8)((none >>  8) & 0xff);
1791                 *s++ = (U8)( none        & 0xff);
1792             }
1793             else if (bits == 32) {
1794                 *s++ = (U8)((none >> 24) & 0xff);
1795                 *s++ = (U8)((none >> 16) & 0xff);
1796                 *s++ = (U8)((none >>  8) & 0xff);
1797                 *s++ = (U8)( none        & 0xff);
1798             }
1799         }
1800         *s = '\0';
1801     }
1802     else {
1803         (void)memzero((U8*)s, scur + 1);
1804     }
1805     SvCUR_set(swatch, scur);
1806     s = (U8*)SvPVX(swatch);
1807
1808     /* read $swash->{LIST} */
1809     l = (U8*)SvPV(*listsvp, lcur);
1810     lend = l + lcur;
1811     while (l < lend) {
1812         UV min, max, val;
1813         STRLEN numlen;
1814         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1815
1816         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1817
1818         numlen = lend - l;
1819         min = grok_hex((char *)l, &numlen, &flags, NULL);
1820         if (numlen)
1821             l += numlen;
1822         else if (nl) {
1823             l = nl + 1; /* 1 is length of "\n" */
1824             continue;
1825         }
1826         else {
1827             l = lend; /* to LIST's end at which \n is not found */
1828             break;
1829         }
1830
1831         if (isBLANK(*l)) {
1832             ++l;
1833             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1834             numlen = lend - l;
1835             max = grok_hex((char *)l, &numlen, &flags, NULL);
1836             if (numlen)
1837                 l += numlen;
1838             else
1839                 max = min;
1840
1841             if (octets) {
1842                 if (isBLANK(*l)) {
1843                     ++l;
1844                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1845                             PERL_SCAN_DISALLOW_PREFIX;
1846                     numlen = lend - l;
1847                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1848                     if (numlen)
1849                         l += numlen;
1850                     else
1851                         val = 0;
1852                 }
1853                 else {
1854                     val = 0;
1855                     if (typeto) {
1856                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1857                                          typestr, l);
1858                     }
1859                 }
1860             }
1861             else
1862                 val = 0; /* bits == 1, then val should be ignored */
1863         }
1864         else {
1865             max = min;
1866             if (octets) {
1867                 val = 0;
1868                 if (typeto) {
1869                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1870                 }
1871             }
1872             else
1873                 val = 0; /* bits == 1, then val should be ignored */
1874         }
1875
1876         if (nl)
1877             l = nl + 1;
1878         else
1879             l = lend;
1880
1881         if (max < start)
1882             continue;
1883
1884         if (octets) {
1885             UV key;
1886             if (min < start) {
1887                 if (!none || val < none) {
1888                     val += start - min;
1889                 }
1890                 min = start;
1891             }
1892             for (key = min; key <= max; key++) {
1893                 STRLEN offset;
1894                 if (key >= end)
1895                     goto go_out_list;
1896                 /* offset must be non-negative (start <= min <= key < end) */
1897                 offset = octets * (key - start);
1898                 if (bits == 8)
1899                     s[offset] = (U8)(val & 0xff);
1900                 else if (bits == 16) {
1901                     s[offset    ] = (U8)((val >>  8) & 0xff);
1902                     s[offset + 1] = (U8)( val        & 0xff);
1903                 }
1904                 else if (bits == 32) {
1905                     s[offset    ] = (U8)((val >> 24) & 0xff);
1906                     s[offset + 1] = (U8)((val >> 16) & 0xff);
1907                     s[offset + 2] = (U8)((val >>  8) & 0xff);
1908                     s[offset + 3] = (U8)( val        & 0xff);
1909                 }
1910
1911                 if (!none || val < none)
1912                     ++val;
1913             }
1914         }
1915         else { /* bits == 1, then val should be ignored */
1916             UV key;
1917             if (min < start)
1918                 min = start;
1919             for (key = min; key <= max; key++) {
1920                 const STRLEN offset = (STRLEN)(key - start);
1921                 if (key >= end)
1922                     goto go_out_list;
1923                 s[offset >> 3] |= 1 << (offset & 7);
1924             }
1925         }
1926     } /* while */
1927   go_out_list:
1928
1929     /* read $swash->{EXTRAS} */
1930     x = (U8*)SvPV(*extssvp, xcur);
1931     xend = x + xcur;
1932     while (x < xend) {
1933         STRLEN namelen;
1934         U8 *namestr;
1935         SV** othersvp;
1936         HV* otherhv;
1937         STRLEN otherbits;
1938         SV **otherbitssvp, *other;
1939         U8 *s, *o, *nl;
1940         STRLEN slen, olen;
1941
1942         const U8 opc = *x++;
1943         if (opc == '\n')
1944             continue;
1945
1946         nl = (U8*)memchr(x, '\n', xend - x);
1947
1948         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1949             if (nl) {
1950                 x = nl + 1; /* 1 is length of "\n" */
1951                 continue;
1952             }
1953             else {
1954                 x = xend; /* to EXTRAS' end at which \n is not found */
1955                 break;
1956             }
1957         }
1958
1959         namestr = x;
1960         if (nl) {
1961             namelen = nl - namestr;
1962             x = nl + 1;
1963         }
1964         else {
1965             namelen = xend - namestr;
1966             x = xend;
1967         }
1968
1969         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1970         otherhv = (HV*)SvRV(*othersvp);
1971         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1972         otherbits = (STRLEN)SvUV(*otherbitssvp);
1973         if (bits < otherbits)
1974             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1975
1976         /* The "other" swatch must be destroyed after. */
1977         other = swash_get(*othersvp, start, span);
1978         o = (U8*)SvPV(other, olen);
1979
1980         if (!olen)
1981             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1982
1983         s = (U8*)SvPV(swatch, slen);
1984         if (bits == 1 && otherbits == 1) {
1985             if (slen != olen)
1986                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
1987
1988             switch (opc) {
1989             case '+':
1990                 while (slen--)
1991                     *s++ |= *o++;
1992                 break;
1993             case '!':
1994                 while (slen--)
1995                     *s++ |= ~*o++;
1996                 break;
1997             case '-':
1998                 while (slen--)
1999                     *s++ &= ~*o++;
2000                 break;
2001             case '&':
2002                 while (slen--)
2003                     *s++ &= *o++;
2004                 break;
2005             default:
2006                 break;
2007             }
2008         }
2009         else {
2010             STRLEN otheroctets = otherbits >> 3;
2011             STRLEN offset = 0;
2012             U8* const send = s + slen;
2013
2014             while (s < send) {
2015                 UV otherval = 0;
2016
2017                 if (otherbits == 1) {
2018                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2019                     ++offset;
2020                 }
2021                 else {
2022                     STRLEN vlen = otheroctets;
2023                     otherval = *o++;
2024                     while (--vlen) {
2025                         otherval <<= 8;
2026                         otherval |= *o++;
2027                     }
2028                 }
2029
2030                 if (opc == '+' && otherval)
2031                     NOOP;   /* replace with otherval */
2032                 else if (opc == '!' && !otherval)
2033                     otherval = 1;
2034                 else if (opc == '-' && otherval)
2035                     otherval = 0;
2036                 else if (opc == '&' && !otherval)
2037                     otherval = 0;
2038                 else {
2039                     s += octets; /* no replacement */
2040                     continue;
2041                 }
2042
2043                 if (bits == 8)
2044                     *s++ = (U8)( otherval & 0xff);
2045                 else if (bits == 16) {
2046                     *s++ = (U8)((otherval >>  8) & 0xff);
2047                     *s++ = (U8)( otherval        & 0xff);
2048                 }
2049                 else if (bits == 32) {
2050                     *s++ = (U8)((otherval >> 24) & 0xff);
2051                     *s++ = (U8)((otherval >> 16) & 0xff);
2052                     *s++ = (U8)((otherval >>  8) & 0xff);
2053                     *s++ = (U8)( otherval        & 0xff);
2054                 }
2055             }
2056         }
2057         sv_free(other); /* through with it! */
2058     } /* while */
2059     return swatch;
2060 }
2061
2062 /*
2063 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2064
2065 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2066 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2067 bytes available. The return value is the pointer to the byte after the
2068 end of the new character. In other words,
2069
2070     d = uvchr_to_utf8(d, uv);
2071
2072 is the recommended wide native character-aware way of saying
2073
2074     *(d++) = uv;
2075
2076 =cut
2077 */
2078
2079 /* On ASCII machines this is normally a macro but we want a
2080    real function in case XS code wants it
2081 */
2082 U8 *
2083 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2084 {
2085     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2086 }
2087
2088 U8 *
2089 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2090 {
2091     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2092 }
2093
2094 /*
2095 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2096 flags
2097
2098 Returns the native character value of the first character in the string
2099 C<s>
2100 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2101 length, in bytes, of that character.
2102
2103 Allows length and flags to be passed to low level routine.
2104
2105 =cut
2106 */
2107 /* On ASCII machines this is normally a macro but we want
2108    a real function in case XS code wants it
2109 */
2110 UV
2111 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2112 U32 flags)
2113 {
2114     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2115     return UNI_TO_NATIVE(uv);
2116 }
2117
2118 /*
2119 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2120
2121 Build to the scalar dsv a displayable version of the string spv,
2122 length len, the displayable version being at most pvlim bytes long
2123 (if longer, the rest is truncated and "..." will be appended).
2124
2125 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2126 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2127 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2128 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2129 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2130 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2131
2132 The pointer to the PV of the dsv is returned.
2133
2134 =cut */
2135 char *
2136 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2137 {
2138     int truncated = 0;
2139     const char *s, *e;
2140
2141     sv_setpvn(dsv, "", 0);
2142     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2143          UV u;
2144           /* This serves double duty as a flag and a character to print after
2145              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2146           */
2147          char ok = 0;
2148
2149          if (pvlim && SvCUR(dsv) >= pvlim) {
2150               truncated++;
2151               break;
2152          }
2153          u = utf8_to_uvchr((U8*)s, 0);
2154          if (u < 256) {
2155              const unsigned char c = (unsigned char)u & 0xFF;
2156              if (flags & UNI_DISPLAY_BACKSLASH) {
2157                  switch (c) {
2158                  case '\n':
2159                      ok = 'n'; break;
2160                  case '\r':
2161                      ok = 'r'; break;
2162                  case '\t':
2163                      ok = 't'; break;
2164                  case '\f':
2165                      ok = 'f'; break;
2166                  case '\a':
2167                      ok = 'a'; break;
2168                  case '\\':
2169                      ok = '\\'; break;
2170                  default: break;
2171                  }
2172                  if (ok) {
2173                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2174                  }
2175              }
2176              /* isPRINT() is the locale-blind version. */
2177              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2178                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2179                  ok = 1;
2180              }
2181          }
2182          if (!ok)
2183              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2184     }
2185     if (truncated)
2186          sv_catpvs(dsv, "...");
2187
2188     return SvPVX(dsv);
2189 }
2190
2191 /*
2192 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2193
2194 Build to the scalar dsv a displayable version of the scalar sv,
2195 the displayable version being at most pvlim bytes long
2196 (if longer, the rest is truncated and "..." will be appended).
2197
2198 The flags argument is as in pv_uni_display().
2199
2200 The pointer to the PV of the dsv is returned.
2201
2202 =cut
2203 */
2204 char *
2205 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2206 {
2207      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2208                                 SvCUR(ssv), pvlim, flags);
2209 }
2210
2211 /*
2212 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2213
2214 Return true if the strings s1 and s2 differ case-insensitively, false
2215 if not (if they are equal case-insensitively).  If u1 is true, the
2216 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2217 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2218 are false, the respective string is assumed to be in native 8-bit
2219 encoding.
2220
2221 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2222 in there (they will point at the beginning of the I<next> character).
2223 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2224 pointers beyond which scanning will not continue under any
2225 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2226 s2+l2 will be used as goal end pointers that will also stop the scan,
2227 and which qualify towards defining a successful match: all the scans
2228 that define an explicit length must reach their goal pointers for
2229 a match to succeed).
2230
2231 For case-insensitiveness, the "casefolding" of Unicode is used
2232 instead of upper/lowercasing both the characters, see
2233 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2234
2235 =cut */
2236 I32
2237 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2238 {
2239      dVAR;
2240      register const U8 *p1  = (const U8*)s1;
2241      register const U8 *p2  = (const U8*)s2;
2242      register const U8 *f1 = NULL;
2243      register const U8 *f2 = NULL;
2244      register U8 *e1 = NULL;
2245      register U8 *q1 = NULL;
2246      register U8 *e2 = NULL;
2247      register U8 *q2 = NULL;
2248      STRLEN n1 = 0, n2 = 0;
2249      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2250      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2251      U8 natbuf[1+1];
2252      STRLEN foldlen1, foldlen2;
2253      bool match;
2254
2255      if (pe1)
2256           e1 = *(U8**)pe1;
2257      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2258           f1 = (const U8*)s1 + l1;
2259      if (pe2)
2260           e2 = *(U8**)pe2;
2261      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2262           f2 = (const U8*)s2 + l2;
2263
2264      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2265           return 1; /* mismatch; possible infinite loop or false positive */
2266
2267      if (!u1 || !u2)
2268           natbuf[1] = 0; /* Need to terminate the buffer. */
2269
2270      while ((e1 == 0 || p1 < e1) &&
2271             (f1 == 0 || p1 < f1) &&
2272             (e2 == 0 || p2 < e2) &&
2273             (f2 == 0 || p2 < f2)) {
2274           if (n1 == 0) {
2275                if (u1)
2276                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2277                else {
2278                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2279                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2280                }
2281                q1 = foldbuf1;
2282                n1 = foldlen1;
2283           }
2284           if (n2 == 0) {
2285                if (u2)
2286                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2287                else {
2288                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2289                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2290                }
2291                q2 = foldbuf2;
2292                n2 = foldlen2;
2293           }
2294           while (n1 && n2) {
2295                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2296                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2297                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2298                    return 1; /* mismatch */
2299                n1 -= UTF8SKIP(q1);
2300                q1 += UTF8SKIP(q1);
2301                n2 -= UTF8SKIP(q2);
2302                q2 += UTF8SKIP(q2);
2303           }
2304           if (n1 == 0)
2305                p1 += u1 ? UTF8SKIP(p1) : 1;
2306           if (n2 == 0)
2307                p2 += u2 ? UTF8SKIP(p2) : 1;
2308
2309      }
2310
2311      /* A match is defined by all the scans that specified
2312       * an explicit length reaching their final goals. */
2313      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2314
2315      if (match) {
2316           if (pe1)
2317                *pe1 = (char*)p1;
2318           if (pe2)
2319                *pe2 = (char*)p2;
2320      }
2321
2322      return match ? 0 : 1; /* 0 match, 1 mismatch */
2323 }
2324
2325 /*
2326  * Local variables:
2327  * c-indentation-style: bsd
2328  * c-basic-offset: 4
2329  * indent-tabs-mode: t
2330  * End:
2331  *
2332  * ex: set ts=8 sts=4 sw=4 noet:
2333  */