utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 /*
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 This is the "slow" version as opposed to the "fast" version which is
 178 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 179 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 181 you should use the _slow().  In practice this means that the _slow()
 182 will be used very rarely, since the maximum Unicode code point (as of
 183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 185 five bytes or more.
 186
 187 =cut */
 188 STATIC STRLEN
 189 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 190 {
 191     U8 u = *s;
 192     STRLEN slen;
 193     UV uv, ouv;
 194
 195     if (UTF8_IS_INVARIANT(u))
 196         return 1;
 197
 198     if (!UTF8_IS_START(u))
 199         return 0;
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206 #ifdef EBCDIC
 207     u = NATIVE_TO_UTF(u);
 208 #endif
 209     u &= UTF_START_MASK(len);
 210     uv  = u;
 211     ouv = uv;
 212     while (slen--) {
 213         if (!UTF8_IS_CONTINUATION(*s))
 214             return 0;
 215         uv = UTF8_ACCUMULATE(uv, *s);
 216         if (uv < ouv)
 217             return 0;
 218         ouv = uv;
 219         s++;
 220     }
 221
 222     if ((STRLEN)UNISKIP(uv) < len)
 223         return 0;
 224
 225     return len;
 226 }
 227
 228 /*
 229 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 230
 231 Tests if some arbitrary number of bytes begins in a valid UTF-8
 232 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 233 UTF-8 character.  The actual number of bytes in the UTF-8 character
 234 will be returned if it is valid, otherwise 0.
 235
 236 =cut */
 237 STRLEN
 238 Perl_is_utf8_char(pTHX_ const U8 *s)
 239 {
 240     const STRLEN len = UTF8SKIP(s);
 241     PERL_UNUSED_CONTEXT;
 242 #ifdef IS_UTF8_CHAR
 243     if (IS_UTF8_CHAR_FAST(len))
 244         return IS_UTF8_CHAR(s, len) ? len : 0;
 245 #endif /* #ifdef IS_UTF8_CHAR */
 246     return is_utf8_char_slow(s, len);
 247 }
 248
 249 /*
 250 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 251
 252 Returns true if first C<len> bytes of the given string form a valid
 253 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 254 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 255 because a valid ASCII string is a valid UTF-8 string.
 256
 257 See also is_utf8_string_loclen() and is_utf8_string_loc().
 258
 259 =cut
 260 */
 261
 262 bool
 263 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 264 {
 265     const U8* const send = s + (len ? len : strlen((const char *)s));
 266     const U8* x = s;
 267
 268     PERL_UNUSED_CONTEXT;
 269
 270     while (x < send) {
 271         STRLEN c;
 272          /* Inline the easy bits of is_utf8_char() here for speed... */
 273          if (UTF8_IS_INVARIANT(*x))
 274               c = 1;
 275          else if (!UTF8_IS_START(*x))
 276              goto out;
 277          else {
 278               /* ... and call is_utf8_char() only if really needed. */
 279 #ifdef IS_UTF8_CHAR
 280              c = UTF8SKIP(x);
 281              if (IS_UTF8_CHAR_FAST(c)) {
 282                  if (!IS_UTF8_CHAR(x, c))
 283                      c = 0;
 284              }
 285              else
 286                 c = is_utf8_char_slow(x, c);
 287 #else
 288              c = is_utf8_char(x);
 289 #endif /* #ifdef IS_UTF8_CHAR */
 290               if (!c)
 291                   goto out;
 292          }
 293         x += c;
 294     }
 295
 296  out:
 297     if (x != send)
 298         return FALSE;
 299
 300     return TRUE;
 301 }
 302
 303 /*
 304 Implemented as a macro in utf8.h
 305
 306 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 307
 308 Like is_utf8_string() but stores the location of the failure (in the
 309 case of "utf8ness failure") or the location s+len (in the case of
 310 "utf8ness success") in the C<ep>.
 311
 312 See also is_utf8_string_loclen() and is_utf8_string().
 313
 314 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 315
 316 Like is_utf8_string() but stores the location of the failure (in the
 317 case of "utf8ness failure") or the location s+len (in the case of
 318 "utf8ness success") in the C<ep>, and the number of UTF-8
 319 encoded characters in the C<el>.
 320
 321 See also is_utf8_string_loc() and is_utf8_string().
 322
 323 =cut
 324 */
 325
 326 bool
 327 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 328 {
 329     const U8* const send = s + (len ? len : strlen((const char *)s));
 330     const U8* x = s;
 331     STRLEN c;
 332     STRLEN outlen = 0;
 333     PERL_UNUSED_CONTEXT;
 334
 335     while (x < send) {
 336          /* Inline the easy bits of is_utf8_char() here for speed... */
 337          if (UTF8_IS_INVARIANT(*x))
 338              c = 1;
 339          else if (!UTF8_IS_START(*x))
 340              goto out;
 341          else {
 342              /* ... and call is_utf8_char() only if really needed. */
 343 #ifdef IS_UTF8_CHAR
 344              c = UTF8SKIP(x);
 345              if (IS_UTF8_CHAR_FAST(c)) {
 346                  if (!IS_UTF8_CHAR(x, c))
 347                      c = 0;
 348              } else
 349                  c = is_utf8_char_slow(x, c);
 350 #else
 351              c = is_utf8_char(x);
 352 #endif /* #ifdef IS_UTF8_CHAR */
 353              if (!c)
 354                  goto out;
 355          }
 356          x += c;
 357          outlen++;
 358     }
 359
 360  out:
 361     if (el)
 362         *el = outlen;
 363
 364     if (ep)
 365         *ep = x;
 366     return (x == send);
 367 }
 368
 369 /*
 370
 371 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 372
 373 Bottom level UTF-8 decode routine.
 374 Returns the unicode code point value of the first character in the string C<s>
 375 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 376 C<retlen> will be set to the length, in bytes, of that character.
 377
 378 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 379 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 380 it is assumed that the caller will raise a warning, and this function
 381 will silently just set C<retlen> to C<-1> and return zero.  If the
 382 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 383 malformations will be given, C<retlen> will be set to the expected
 384 length of the UTF-8 character in bytes, and zero will be returned.
 385
 386 The C<flags> can also contain various flags to allow deviations from
 387 the strict UTF-8 encoding (see F<utf8.h>).
 388
 389 Most code should use utf8_to_uvchr() rather than call this directly.
 390
 391 =cut
 392 */
 393
 394 UV
 395 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 396 {
 397     dVAR;
 398     const U8 * const s0 = s;
 399     UV uv = *s, ouv = 0;
 400     STRLEN len = 1;
 401     const bool dowarn = ckWARN_d(WARN_UTF8);
 402     const UV startbyte = *s;
 403     STRLEN expectlen = 0;
 404     U32 warning = 0;
 405
 406 /* This list is a superset of the UTF8_ALLOW_XXX. */
 407
 408 #define UTF8_WARN_EMPTY                          1
 409 #define UTF8_WARN_CONTINUATION                   2
 410 #define UTF8_WARN_NON_CONTINUATION               3
 411 #define UTF8_WARN_FE_FF                          4
 412 #define UTF8_WARN_SHORT                          5
 413 #define UTF8_WARN_OVERFLOW                       6
 414 #define UTF8_WARN_SURROGATE                      7
 415 #define UTF8_WARN_LONG                           8
 416 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 417
 418     if (curlen == 0 &&
 419         !(flags & UTF8_ALLOW_EMPTY)) {
 420         warning = UTF8_WARN_EMPTY;
 421         goto malformed;
 422     }
 423
 424     if (UTF8_IS_INVARIANT(uv)) {
 425         if (retlen)
 426             *retlen = 1;
 427         return (UV) (NATIVE_TO_UTF(*s));
 428     }
 429
 430     if (UTF8_IS_CONTINUATION(uv) &&
 431         !(flags & UTF8_ALLOW_CONTINUATION)) {
 432         warning = UTF8_WARN_CONTINUATION;
 433         goto malformed;
 434     }
 435
 436     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 437         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 438         warning = UTF8_WARN_NON_CONTINUATION;
 439         goto malformed;
 440     }
 441
 442 #ifdef EBCDIC
 443     uv = NATIVE_TO_UTF(uv);
 444 #else
 445     if ((uv == 0xfe || uv == 0xff) &&
 446         !(flags & UTF8_ALLOW_FE_FF)) {
 447         warning = UTF8_WARN_FE_FF;
 448         goto malformed;
 449     }
 450 #endif
 451
 452     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 453     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 454     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 455     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 456 #ifdef EBCDIC
 457     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 458     else                        { len =  7; uv &= 0x01; }
 459 #else
 460     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 461     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 462     else                        { len = 13; uv = 0; } /* whoa! */
 463 #endif
 464
 465     if (retlen)
 466         *retlen = len;
 467
 468     expectlen = len;
 469
 470     if ((curlen < expectlen) &&
 471         !(flags & UTF8_ALLOW_SHORT)) {
 472         warning = UTF8_WARN_SHORT;
 473         goto malformed;
 474     }
 475
 476     len--;
 477     s++;
 478     ouv = uv;
 479
 480     while (len--) {
 481         if (!UTF8_IS_CONTINUATION(*s) &&
 482             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 483             s--;
 484             warning = UTF8_WARN_NON_CONTINUATION;
 485             goto malformed;
 486         }
 487         else
 488             uv = UTF8_ACCUMULATE(uv, *s);
 489         if (!(uv > ouv)) {
 490             /* These cannot be allowed. */
 491             if (uv == ouv) {
 492                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 493                     warning = UTF8_WARN_LONG;
 494                     goto malformed;
 495                 }
 496             }
 497             else { /* uv < ouv */
 498                 /* This cannot be allowed. */
 499                 warning = UTF8_WARN_OVERFLOW;
 500                 goto malformed;
 501             }
 502         }
 503         s++;
 504         ouv = uv;
 505     }
 506
 507     if (UNICODE_IS_SURROGATE(uv) &&
 508         !(flags & UTF8_ALLOW_SURROGATE)) {
 509         warning = UTF8_WARN_SURROGATE;
 510         goto malformed;
 511     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 512                !(flags & UTF8_ALLOW_LONG)) {
 513         warning = UTF8_WARN_LONG;
 514         goto malformed;
 515     } else if (UNICODE_IS_ILLEGAL(uv) &&
 516                !(flags & UTF8_ALLOW_FFFF)) {
 517         warning = UTF8_WARN_FFFF;
 518         goto malformed;
 519     }
 520
 521     return uv;
 522
 523 malformed:
 524
 525     if (flags & UTF8_CHECK_ONLY) {
 526         if (retlen)
 527             *retlen = -1;
 528         return 0;
 529     }
 530
 531     if (dowarn) {
 532         SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
 533
 534         switch (warning) {
 535         case 0: /* Intentionally empty. */ break;
 536         case UTF8_WARN_EMPTY:
 537             sv_catpvs(sv, "(empty string)");
 538             break;
 539         case UTF8_WARN_CONTINUATION:
 540             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 541             break;
 542         case UTF8_WARN_NON_CONTINUATION:
 543             if (s == s0)
 544                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 545                            (UV)s[1], startbyte);
 546             else {
 547                 const int len = (int)(s-s0);
 548                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 549                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 550             }
 551
 552             break;
 553         case UTF8_WARN_FE_FF:
 554             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 555             break;
 556         case UTF8_WARN_SHORT:
 557             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 558                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 559             expectlen = curlen;         /* distance for caller to skip */
 560             break;
 561         case UTF8_WARN_OVERFLOW:
 562             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 563                            ouv, *s, startbyte);
 564             break;
 565         case UTF8_WARN_SURROGATE:
 566             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 567             break;
 568         case UTF8_WARN_LONG:
 569             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 570                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 571             break;
 572         case UTF8_WARN_FFFF:
 573             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 574             break;
 575         default:
 576             sv_catpvs(sv, "(unknown reason)");
 577             break;
 578         }
 579
 580         if (warning) {
 581             const char * const s = SvPVX_const(sv);
 582
 583             if (PL_op)
 584                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 585                             "%s in %s", s,  OP_DESC(PL_op));
 586             else
 587                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 588         }
 589     }
 590
 591     if (retlen)
 592         *retlen = expectlen ? expectlen : len;
 593
 594     return 0;
 595 }
 596
 597 /*
 598 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 599
 600 Returns the native character value of the first character in the string C<s>
 601 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 602 length, in bytes, of that character.
 603
 604 If C<s> does not point to a well-formed UTF-8 character, zero is
 605 returned and retlen is set, if possible, to -1.
 606
 607 =cut
 608 */
 609
 610 UV
 611 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 612 {
 613     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 614                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 615 }
 616
 617 /*
 618 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 619
 620 Returns the Unicode code point of the first character in the string C<s>
 621 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 622 length, in bytes, of that character.
 623
 624 This function should only be used when returned UV is considered
 625 an index into the Unicode semantic tables (e.g. swashes).
 626
 627 If C<s> does not point to a well-formed UTF-8 character, zero is
 628 returned and retlen is set, if possible, to -1.
 629
 630 =cut
 631 */
 632
 633 UV
 634 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 635 {
 636     /* Call the low level routine asking for checks */
 637     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 638                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 639 }
 640
 641 /*
 642 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 643
 644 Return the length of the UTF-8 char encoded string C<s> in characters.
 645 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 646 up past C<e>, croaks.
 647
 648 =cut
 649 */
 650
 651 STRLEN
 652 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 653 {
 654     dVAR;
 655     STRLEN len = 0;
 656
 657     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 658      * the bitops (especially ~) can create illegal UTF-8.
 659      * In other words: in Perl UTF-8 is not just for Unicode. */
 660
 661     if (e < s)
 662         goto warn_and_return;
 663     while (s < e) {
 664         const U8 t = UTF8SKIP(s);
 665         if (e - s < t) {
 666             warn_and_return:
 667             if (ckWARN_d(WARN_UTF8)) {
 668                 if (PL_op)
 669                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 670                             "%s in %s", unees, OP_DESC(PL_op));
 671                 else
 672                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 673             }
 674             return len;
 675         }
 676         s += t;
 677         len++;
 678     }
 679
 680     return len;
 681 }
 682
 683 /*
 684 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 685
 686 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 687 and C<b>.
 688
 689 WARNING: use only if you *know* that the pointers point inside the
 690 same UTF-8 buffer.
 691
 692 =cut
 693 */
 694
 695 IV
 696 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 697 {
 698     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 699 }
 700
 701 /*
 702 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 703
 704 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 705 forward or backward.
 706
 707 WARNING: do not use the following unless you *know* C<off> is within
 708 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 709 on the first byte of character or just after the last byte of a character.
 710
 711 =cut
 712 */
 713
 714 U8 *
 715 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 716 {
 717     PERL_UNUSED_CONTEXT;
 718     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 719      * the bitops (especially ~) can create illegal UTF-8.
 720      * In other words: in Perl UTF-8 is not just for Unicode. */
 721
 722     if (off >= 0) {
 723         while (off--)
 724             s += UTF8SKIP(s);
 725     }
 726     else {
 727         while (off++) {
 728             s--;
 729             while (UTF8_IS_CONTINUATION(*s))
 730                 s--;
 731         }
 732     }
 733     return (U8 *)s;
 734 }
 735
 736 /*
 737 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 738
 739 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 740 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 741 updates len to contain the new length.
 742 Returns zero on failure, setting C<len> to -1.
 743
 744 If you need a copy of the string, see C<bytes_from_utf8>.
 745
 746 =cut
 747 */
 748
 749 U8 *
 750 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 751 {
 752     U8 * const save = s;
 753     U8 * const send = s + *len;
 754     U8 *d;
 755
 756     /* ensure valid UTF-8 and chars < 256 before updating string */
 757     while (s < send) {
 758         U8 c = *s++;
 759
 760         if (!UTF8_IS_INVARIANT(c) &&
 761             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 762              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 763             *len = -1;
 764             return 0;
 765         }
 766     }
 767
 768     d = s = save;
 769     while (s < send) {
 770         STRLEN ulen;
 771         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 772         s += ulen;
 773     }
 774     *d = '\0';
 775     *len = d - save;
 776     return save;
 777 }
 778
 779 /*
 780 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 781
 782 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 783 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 784 the newly-created string, and updates C<len> to contain the new
 785 length.  Returns the original string if no conversion occurs, C<len>
 786 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 787 0 if C<s> is converted or contains all 7bit characters.
 788
 789 =cut
 790 */
 791
 792 U8 *
 793 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 794 {
 795     U8 *d;
 796     const U8 *start = s;
 797     const U8 *send;
 798     I32 count = 0;
 799
 800     PERL_UNUSED_CONTEXT;
 801     if (!*is_utf8)
 802         return (U8 *)start;
 803
 804     /* ensure valid UTF-8 and chars < 256 before converting string */
 805     for (send = s + *len; s < send;) {
 806         U8 c = *s++;
 807         if (!UTF8_IS_INVARIANT(c)) {
 808             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 809                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 810                 count++;
 811             else
 812                 return (U8 *)start;
 813         }
 814     }
 815
 816     *is_utf8 = FALSE;
 817
 818     Newx(d, (*len) - count + 1, U8);
 819     s = start; start = d;
 820     while (s < send) {
 821         U8 c = *s++;
 822         if (!UTF8_IS_INVARIANT(c)) {
 823             /* Then it is two-byte encoded */
 824             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 825             c = ASCII_TO_NATIVE(c);
 826         }
 827         *d++ = c;
 828     }
 829     *d = '\0';
 830     *len = d - start;
 831     return (U8 *)start;
 832 }
 833
 834 /*
 835 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 836
 837 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 838 Returns a pointer to the newly-created string, and sets C<len> to
 839 reflect the new length.
 840
 841 If you want to convert to UTF-8 from other encodings than ASCII,
 842 see sv_recode_to_utf8().
 843
 844 =cut
 845 */
 846
 847 U8*
 848 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 849 {
 850     const U8 * const send = s + (*len);
 851     U8 *d;
 852     U8 *dst;
 853     PERL_UNUSED_CONTEXT;
 854
 855     Newx(d, (*len) * 2 + 1, U8);
 856     dst = d;
 857
 858     while (s < send) {
 859         const UV uv = NATIVE_TO_ASCII(*s++);
 860         if (UNI_IS_INVARIANT(uv))
 861             *d++ = (U8)UTF_TO_NATIVE(uv);
 862         else {
 863             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 864             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 865         }
 866     }
 867     *d = '\0';
 868     *len = d-dst;
 869     return dst;
 870 }
 871
 872 /*
 873  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 874  *
 875  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 876  * We optimize for native, for obvious reasons. */
 877
 878 U8*
 879 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 880 {
 881     U8* pend;
 882     U8* dstart = d;
 883
 884     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 885          d[0] = 0;
 886          *newlen = 1;
 887          return d;
 888     }
 889
 890     if (bytelen & 1)
 891         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 892
 893     pend = p + bytelen;
 894
 895     while (p < pend) {
 896         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 897         p += 2;
 898         if (uv < 0x80) {
 899 #ifdef EBCDIC
 900             *d++ = UNI_TO_NATIVE(uv);
 901 #else
 902             *d++ = (U8)uv;
 903 #endif
 904             continue;
 905         }
 906         if (uv < 0x800) {
 907             *d++ = (U8)(( uv >>  6)         | 0xc0);
 908             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 909             continue;
 910         }
 911         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 912             UV low = (p[0] << 8) + p[1];
 913             p += 2;
 914             if (low < 0xdc00 || low >= 0xdfff)
 915                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 916             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 917         }
 918         if (uv < 0x10000) {
 919             *d++ = (U8)(( uv >> 12)         | 0xe0);
 920             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 921             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 922             continue;
 923         }
 924         else {
 925             *d++ = (U8)(( uv >> 18)         | 0xf0);
 926             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 927             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 928             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 929             continue;
 930         }
 931     }
 932     *newlen = d - dstart;
 933     return d;
 934 }
 935
 936 /* Note: this one is slightly destructive of the source. */
 937
 938 U8*
 939 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 940 {
 941     U8* s = (U8*)p;
 942     U8* const send = s + bytelen;
 943     while (s < send) {
 944         const U8 tmp = s[0];
 945         s[0] = s[1];
 946         s[1] = tmp;
 947         s += 2;
 948     }
 949     return utf16_to_utf8(p, d, bytelen, newlen);
 950 }
 951
 952 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 953
 954 bool
 955 Perl_is_uni_alnum(pTHX_ UV c)
 956 {
 957     U8 tmpbuf[UTF8_MAXBYTES+1];
 958     uvchr_to_utf8(tmpbuf, c);
 959     return is_utf8_alnum(tmpbuf);
 960 }
 961
 962 bool
 963 Perl_is_uni_alnumc(pTHX_ UV c)
 964 {
 965     U8 tmpbuf[UTF8_MAXBYTES+1];
 966     uvchr_to_utf8(tmpbuf, c);
 967     return is_utf8_alnumc(tmpbuf);
 968 }
 969
 970 bool
 971 Perl_is_uni_idfirst(pTHX_ UV c)
 972 {
 973     U8 tmpbuf[UTF8_MAXBYTES+1];
 974     uvchr_to_utf8(tmpbuf, c);
 975     return is_utf8_idfirst(tmpbuf);
 976 }
 977
 978 bool
 979 Perl_is_uni_alpha(pTHX_ UV c)
 980 {
 981     U8 tmpbuf[UTF8_MAXBYTES+1];
 982     uvchr_to_utf8(tmpbuf, c);
 983     return is_utf8_alpha(tmpbuf);
 984 }
 985
 986 bool
 987 Perl_is_uni_ascii(pTHX_ UV c)
 988 {
 989     U8 tmpbuf[UTF8_MAXBYTES+1];
 990     uvchr_to_utf8(tmpbuf, c);
 991     return is_utf8_ascii(tmpbuf);
 992 }
 993
 994 bool
 995 Perl_is_uni_space(pTHX_ UV c)
 996 {
 997     U8 tmpbuf[UTF8_MAXBYTES+1];
 998     uvchr_to_utf8(tmpbuf, c);
 999     return is_utf8_space(tmpbuf);
1000 }
1001
1002 bool
1003 Perl_is_uni_digit(pTHX_ UV c)
1004 {
1005     U8 tmpbuf[UTF8_MAXBYTES+1];
1006     uvchr_to_utf8(tmpbuf, c);
1007     return is_utf8_digit(tmpbuf);
1008 }
1009
1010 bool
1011 Perl_is_uni_upper(pTHX_ UV c)
1012 {
1013     U8 tmpbuf[UTF8_MAXBYTES+1];
1014     uvchr_to_utf8(tmpbuf, c);
1015     return is_utf8_upper(tmpbuf);
1016 }
1017
1018 bool
1019 Perl_is_uni_lower(pTHX_ UV c)
1020 {
1021     U8 tmpbuf[UTF8_MAXBYTES+1];
1022     uvchr_to_utf8(tmpbuf, c);
1023     return is_utf8_lower(tmpbuf);
1024 }
1025
1026 bool
1027 Perl_is_uni_cntrl(pTHX_ UV c)
1028 {
1029     U8 tmpbuf[UTF8_MAXBYTES+1];
1030     uvchr_to_utf8(tmpbuf, c);
1031     return is_utf8_cntrl(tmpbuf);
1032 }
1033
1034 bool
1035 Perl_is_uni_graph(pTHX_ UV c)
1036 {
1037     U8 tmpbuf[UTF8_MAXBYTES+1];
1038     uvchr_to_utf8(tmpbuf, c);
1039     return is_utf8_graph(tmpbuf);
1040 }
1041
1042 bool
1043 Perl_is_uni_print(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_print(tmpbuf);
1048 }
1049
1050 bool
1051 Perl_is_uni_punct(pTHX_ UV c)
1052 {
1053     U8 tmpbuf[UTF8_MAXBYTES+1];
1054     uvchr_to_utf8(tmpbuf, c);
1055     return is_utf8_punct(tmpbuf);
1056 }
1057
1058 bool
1059 Perl_is_uni_xdigit(pTHX_ UV c)
1060 {
1061     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1062     uvchr_to_utf8(tmpbuf, c);
1063     return is_utf8_xdigit(tmpbuf);
1064 }
1065
1066 UV
1067 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1068 {
1069     uvchr_to_utf8(p, c);
1070     return to_utf8_upper(p, p, lenp);
1071 }
1072
1073 UV
1074 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1075 {
1076     uvchr_to_utf8(p, c);
1077     return to_utf8_title(p, p, lenp);
1078 }
1079
1080 UV
1081 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1082 {
1083     uvchr_to_utf8(p, c);
1084     return to_utf8_lower(p, p, lenp);
1085 }
1086
1087 UV
1088 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1089 {
1090     uvchr_to_utf8(p, c);
1091     return to_utf8_fold(p, p, lenp);
1092 }
1093
1094 /* for now these all assume no locale info available for Unicode > 255 */
1095
1096 bool
1097 Perl_is_uni_alnum_lc(pTHX_ UV c)
1098 {
1099     return is_uni_alnum(c);     /* XXX no locale support yet */
1100 }
1101
1102 bool
1103 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1104 {
1105     return is_uni_alnumc(c);    /* XXX no locale support yet */
1106 }
1107
1108 bool
1109 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1110 {
1111     return is_uni_idfirst(c);   /* XXX no locale support yet */
1112 }
1113
1114 bool
1115 Perl_is_uni_alpha_lc(pTHX_ UV c)
1116 {
1117     return is_uni_alpha(c);     /* XXX no locale support yet */
1118 }
1119
1120 bool
1121 Perl_is_uni_ascii_lc(pTHX_ UV c)
1122 {
1123     return is_uni_ascii(c);     /* XXX no locale support yet */
1124 }
1125
1126 bool
1127 Perl_is_uni_space_lc(pTHX_ UV c)
1128 {
1129     return is_uni_space(c);     /* XXX no locale support yet */
1130 }
1131
1132 bool
1133 Perl_is_uni_digit_lc(pTHX_ UV c)
1134 {
1135     return is_uni_digit(c);     /* XXX no locale support yet */
1136 }
1137
1138 bool
1139 Perl_is_uni_upper_lc(pTHX_ UV c)
1140 {
1141     return is_uni_upper(c);     /* XXX no locale support yet */
1142 }
1143
1144 bool
1145 Perl_is_uni_lower_lc(pTHX_ UV c)
1146 {
1147     return is_uni_lower(c);     /* XXX no locale support yet */
1148 }
1149
1150 bool
1151 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1152 {
1153     return is_uni_cntrl(c);     /* XXX no locale support yet */
1154 }
1155
1156 bool
1157 Perl_is_uni_graph_lc(pTHX_ UV c)
1158 {
1159     return is_uni_graph(c);     /* XXX no locale support yet */
1160 }
1161
1162 bool
1163 Perl_is_uni_print_lc(pTHX_ UV c)
1164 {
1165     return is_uni_print(c);     /* XXX no locale support yet */
1166 }
1167
1168 bool
1169 Perl_is_uni_punct_lc(pTHX_ UV c)
1170 {
1171     return is_uni_punct(c);     /* XXX no locale support yet */
1172 }
1173
1174 bool
1175 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1176 {
1177     return is_uni_xdigit(c);    /* XXX no locale support yet */
1178 }
1179
1180 U32
1181 Perl_to_uni_upper_lc(pTHX_ U32 c)
1182 {
1183     /* XXX returns only the first character -- do not use XXX */
1184     /* XXX no locale support yet */
1185     STRLEN len;
1186     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1187     return (U32)to_uni_upper(c, tmpbuf, &len);
1188 }
1189
1190 U32
1191 Perl_to_uni_title_lc(pTHX_ U32 c)
1192 {
1193     /* XXX returns only the first character XXX -- do not use XXX */
1194     /* XXX no locale support yet */
1195     STRLEN len;
1196     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1197     return (U32)to_uni_title(c, tmpbuf, &len);
1198 }
1199
1200 U32
1201 Perl_to_uni_lower_lc(pTHX_ U32 c)
1202 {
1203     /* XXX returns only the first character -- do not use XXX */
1204     /* XXX no locale support yet */
1205     STRLEN len;
1206     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1207     return (U32)to_uni_lower(c, tmpbuf, &len);
1208 }
1209
1210 static bool
1211 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1212                  const char *const swashname)
1213 {
1214     dVAR;
1215     if (!is_utf8_char(p))
1216         return FALSE;
1217     if (!*swash)
1218         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1219     return swash_fetch(*swash, p, TRUE) != 0;
1220 }
1221
1222 bool
1223 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1224 {
1225     dVAR;
1226     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1227      * descendant of isalnum(3), in other words, it doesn't
1228      * contain the '_'. --jhi */
1229     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1230 }
1231
1232 bool
1233 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1234 {
1235     dVAR;
1236     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1237 }
1238
1239 bool
1240 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1241 {
1242     dVAR;
1243     if (*p == '_')
1244         return TRUE;
1245     /* is_utf8_idstart would be more logical. */
1246     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1247 }
1248
1249 bool
1250 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1251 {
1252     dVAR;
1253     if (*p == '_')
1254         return TRUE;
1255     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1256 }
1257
1258 bool
1259 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1260 {
1261     dVAR;
1262     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1263 }
1264
1265 bool
1266 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1267 {
1268     dVAR;
1269     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1270 }
1271
1272 bool
1273 Perl_is_utf8_space(pTHX_ const U8 *p)
1274 {
1275     dVAR;
1276     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1277 }
1278
1279 bool
1280 Perl_is_utf8_digit(pTHX_ const U8 *p)
1281 {
1282     dVAR;
1283     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1284 }
1285
1286 bool
1287 Perl_is_utf8_upper(pTHX_ const U8 *p)
1288 {
1289     dVAR;
1290     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1291 }
1292
1293 bool
1294 Perl_is_utf8_lower(pTHX_ const U8 *p)
1295 {
1296     dVAR;
1297     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1298 }
1299
1300 bool
1301 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1302 {
1303     dVAR;
1304     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1305 }
1306
1307 bool
1308 Perl_is_utf8_graph(pTHX_ const U8 *p)
1309 {
1310     dVAR;
1311     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1312 }
1313
1314 bool
1315 Perl_is_utf8_print(pTHX_ const U8 *p)
1316 {
1317     dVAR;
1318     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1319 }
1320
1321 bool
1322 Perl_is_utf8_punct(pTHX_ const U8 *p)
1323 {
1324     dVAR;
1325     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1326 }
1327
1328 bool
1329 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1330 {
1331     dVAR;
1332     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1333 }
1334
1335 bool
1336 Perl_is_utf8_mark(pTHX_ const U8 *p)
1337 {
1338     dVAR;
1339     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1340 }
1341
1342 /*
1343 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1344
1345 The "p" contains the pointer to the UTF-8 string encoding
1346 the character that is being converted.
1347
1348 The "ustrp" is a pointer to the character buffer to put the
1349 conversion result to.  The "lenp" is a pointer to the length
1350 of the result.
1351
1352 The "swashp" is a pointer to the swash to use.
1353
1354 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1355 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1356 but not always, a multicharacter mapping), is tried first.
1357
1358 The "special" is a string like "utf8::ToSpecLower", which means the
1359 hash %utf8::ToSpecLower.  The access to the hash is through
1360 Perl_to_utf8_case().
1361
1362 The "normal" is a string like "ToLower" which means the swash
1363 %utf8::ToLower.
1364
1365 =cut */
1366
1367 UV
1368 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1369                         SV **swashp, const char *normal, const char *special)
1370 {
1371     dVAR;
1372     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1373     STRLEN len = 0;
1374
1375     const UV uv0 = utf8_to_uvchr(p, NULL);
1376     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1377      * are necessary in EBCDIC, they are redundant no-ops
1378      * in ASCII-ish platforms, and hopefully optimized away. */
1379     const UV uv1 = NATIVE_TO_UNI(uv0);
1380     uvuni_to_utf8(tmpbuf, uv1);
1381
1382     if (!*swashp) /* load on-demand */
1383          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1384
1385     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1386     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1387          /* It might be "special" (sometimes, but not always,
1388           * a multicharacter mapping) */
1389          HV * const hv = get_hv(special, FALSE);
1390          SV **svp;
1391
1392          if (hv &&
1393              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1394              (*svp)) {
1395              const char *s;
1396
1397               s = SvPV_const(*svp, len);
1398               if (len == 1)
1399                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1400               else {
1401 #ifdef EBCDIC
1402                    /* If we have EBCDIC we need to remap the characters
1403                     * since any characters in the low 256 are Unicode
1404                     * code points, not EBCDIC. */
1405                    U8 *t = (U8*)s, *tend = t + len, *d;
1406
1407                    d = tmpbuf;
1408                    if (SvUTF8(*svp)) {
1409                         STRLEN tlen = 0;
1410
1411                         while (t < tend) {
1412                              const UV c = utf8_to_uvchr(t, &tlen);
1413                              if (tlen > 0) {
1414                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1415                                   t += tlen;
1416                              }
1417                              else
1418                                   break;
1419                         }
1420                    }
1421                    else {
1422                         while (t < tend) {
1423                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1424                              t++;
1425                         }
1426                    }
1427                    len = d - tmpbuf;
1428                    Copy(tmpbuf, ustrp, len, U8);
1429 #else
1430                    Copy(s, ustrp, len, U8);
1431 #endif
1432               }
1433          }
1434     }
1435
1436     if (!len && *swashp) {
1437         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1438
1439          if (uv2) {
1440               /* It was "normal" (a single character mapping). */
1441               const UV uv3 = UNI_TO_NATIVE(uv2);
1442               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1443          }
1444     }
1445
1446     if (!len) /* Neither: just copy. */
1447          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1448
1449     if (lenp)
1450          *lenp = len;
1451
1452     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1453 }
1454
1455 /*
1456 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1457
1458 Convert the UTF-8 encoded character at p to its uppercase version and
1459 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1460 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1461 the uppercase version may be longer than the original character.
1462
1463 The first character of the uppercased version is returned
1464 (but note, as explained above, that there may be more.)
1465
1466 =cut */
1467
1468 UV
1469 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1470 {
1471     dVAR;
1472     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1473                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1474 }
1475
1476 /*
1477 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1478
1479 Convert the UTF-8 encoded character at p to its titlecase version and
1480 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1481 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1482 titlecase version may be longer than the original character.
1483
1484 The first character of the titlecased version is returned
1485 (but note, as explained above, that there may be more.)
1486
1487 =cut */
1488
1489 UV
1490 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1491 {
1492     dVAR;
1493     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1494                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1495 }
1496
1497 /*
1498 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1499
1500 Convert the UTF-8 encoded character at p to its lowercase version and
1501 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1502 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1503 lowercase version may be longer than the original character.
1504
1505 The first character of the lowercased version is returned
1506 (but note, as explained above, that there may be more.)
1507
1508 =cut */
1509
1510 UV
1511 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1512 {
1513     dVAR;
1514     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1515                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1516 }
1517
1518 /*
1519 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1520
1521 Convert the UTF-8 encoded character at p to its foldcase version and
1522 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1523 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1524 foldcase version may be longer than the original character (up to
1525 three characters).
1526
1527 The first character of the foldcased version is returned
1528 (but note, as explained above, that there may be more.)
1529
1530 =cut */
1531
1532 UV
1533 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1534 {
1535     dVAR;
1536     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1537                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1538 }
1539
1540 /* Note:
1541  * A "swash" is a swatch hash.
1542  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1543  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1544  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1545  */
1546 SV*
1547 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1548 {
1549     dVAR;
1550     SV* retval;
1551     SV* const tokenbufsv = sv_newmortal();
1552     dSP;
1553     const size_t pkg_len = strlen(pkg);
1554     const size_t name_len = strlen(name);
1555     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1556     SV* errsv_save;
1557
1558     PUSHSTACKi(PERLSI_MAGIC);
1559     ENTER;
1560     SAVEI32(PL_hints);
1561     PL_hints = 0;
1562     save_re_context();
1563     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1564         ENTER;
1565         errsv_save = newSVsv(ERRSV);
1566         /* It is assumed that callers of this routine are not passing in any
1567            user derived data.  */
1568         /* Need to do this after save_re_context() as it will set PL_tainted to
1569            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1570            Even line to create errsv_save can turn on PL_tainted.  */
1571         SAVEBOOL(PL_tainted);
1572         PL_tainted = 0;
1573         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1574                          NULL);
1575         if (!SvTRUE(ERRSV))
1576             sv_setsv(ERRSV, errsv_save);
1577         SvREFCNT_dec(errsv_save);
1578         LEAVE;
1579     }
1580     SPAGAIN;
1581     PUSHMARK(SP);
1582     EXTEND(SP,5);
1583     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1584     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1585     PUSHs(listsv);
1586     PUSHs(sv_2mortal(newSViv(minbits)));
1587     PUSHs(sv_2mortal(newSViv(none)));
1588     PUTBACK;
1589     if (IN_PERL_COMPILETIME) {
1590         /* XXX ought to be handled by lex_start */
1591         SAVEI32(PL_in_my);
1592         PL_in_my = 0;
1593         sv_setpv(tokenbufsv, PL_tokenbuf);
1594     }
1595     errsv_save = newSVsv(ERRSV);
1596     if (call_method("SWASHNEW", G_SCALAR))
1597         retval = newSVsv(*PL_stack_sp--);
1598     else
1599         retval = &PL_sv_undef;
1600     if (!SvTRUE(ERRSV))
1601         sv_setsv(ERRSV, errsv_save);
1602     SvREFCNT_dec(errsv_save);
1603     LEAVE;
1604     POPSTACK;
1605     if (IN_PERL_COMPILETIME) {
1606         STRLEN len;
1607         const char* const pv = SvPV_const(tokenbufsv, len);
1608
1609         Copy(pv, PL_tokenbuf, len+1, char);
1610         CopHINTS_set(PL_curcop, PL_hints);
1611     }
1612     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1613         if (SvPOK(retval))
1614             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1615                        (void*)retval);
1616         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1617     }
1618     return retval;
1619 }
1620
1621
1622 /* This API is wrong for special case conversions since we may need to
1623  * return several Unicode characters for a single Unicode character
1624  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1625  * the lower-level routine, and it is similarly broken for returning
1626  * multiple values.  --jhi */
1627 /* Now SWASHGET is recasted into S_swash_get in this file. */
1628
1629 /* Note:
1630  * Returns the value of property/mapping C<swash> for the first character
1631  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1632  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1633  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1634  */
1635 UV
1636 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1637 {
1638     dVAR;
1639     HV* const hv = (HV*)SvRV(swash);
1640     U32 klen;
1641     U32 off;
1642     STRLEN slen;
1643     STRLEN needents;
1644     const U8 *tmps = NULL;
1645     U32 bit;
1646     SV *swatch;
1647     U8 tmputf8[2];
1648     const UV c = NATIVE_TO_ASCII(*ptr);
1649
1650     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1651         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1652         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1653         ptr = tmputf8;
1654     }
1655     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1656      * then the "swatch" is a vec() for al the chars which start
1657      * with 0xAA..0xYY
1658      * So the key in the hash (klen) is length of encoded char -1
1659      */
1660     klen = UTF8SKIP(ptr) - 1;
1661     off  = ptr[klen];
1662
1663     if (klen == 0) {
1664       /* If char in invariant then swatch is for all the invariant chars
1665        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1666        */
1667         needents = UTF_CONTINUATION_MARK;
1668         off      = NATIVE_TO_UTF(ptr[klen]);
1669     }
1670     else {
1671       /* If char is encoded then swatch is for the prefix */
1672         needents = (1 << UTF_ACCUMULATION_SHIFT);
1673         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1674     }
1675
1676     /*
1677      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1678      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1679      * it's nothing to sniff at.)  Pity we usually come through at least
1680      * two function calls to get here...
1681      *
1682      * NB: this code assumes that swatches are never modified, once generated!
1683      */
1684
1685     if (hv   == PL_last_swash_hv &&
1686         klen == PL_last_swash_klen &&
1687         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1688     {
1689         tmps = PL_last_swash_tmps;
1690         slen = PL_last_swash_slen;
1691     }
1692     else {
1693         /* Try our second-level swatch cache, kept in a hash. */
1694         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1695
1696         /* If not cached, generate it via swash_get */
1697         if (!svp || !SvPOK(*svp)
1698                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1699             /* We use utf8n_to_uvuni() as we want an index into
1700                Unicode tables, not a native character number.
1701              */
1702             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1703                                            ckWARN(WARN_UTF8) ?
1704                                            0 : UTF8_ALLOW_ANY);
1705             swatch = swash_get(swash,
1706                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1707                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1708                                 needents);
1709
1710             if (IN_PERL_COMPILETIME)
1711                 CopHINTS_set(PL_curcop, PL_hints);
1712
1713             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1714
1715             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1716                      || (slen << 3) < needents)
1717                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1718         }
1719
1720         PL_last_swash_hv = hv;
1721         PL_last_swash_klen = klen;
1722         /* FIXME change interpvar.h?  */
1723         PL_last_swash_tmps = (U8 *) tmps;
1724         PL_last_swash_slen = slen;
1725         if (klen)
1726             Copy(ptr, PL_last_swash_key, klen, U8);
1727     }
1728
1729     switch ((int)((slen << 3) / needents)) {
1730     case 1:
1731         bit = 1 << (off & 7);
1732         off >>= 3;
1733         return (tmps[off] & bit) != 0;
1734     case 8:
1735         return tmps[off];
1736     case 16:
1737         off <<= 1;
1738         return (tmps[off] << 8) + tmps[off + 1] ;
1739     case 32:
1740         off <<= 2;
1741         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1742     }
1743     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1744     NORETURN_FUNCTION_END;
1745 }
1746
1747 /* Note:
1748  * Returns a swatch (a bit vector string) for a code point sequence
1749  * that starts from the value C<start> and comprises the number C<span>.
1750  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1751  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1752  */
1753 STATIC SV*
1754 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1755 {
1756     SV *swatch;
1757     U8 *l, *lend, *x, *xend, *s;
1758     STRLEN lcur, xcur, scur;
1759
1760     HV* const hv = (HV*)SvRV(swash);
1761     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1762     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1763     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1764     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1765     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1766     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1767     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1768     const STRLEN bits  = SvUV(*bitssvp);
1769     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1770     const UV     none  = SvUV(*nonesvp);
1771     const UV     end   = start + span;
1772
1773     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1774         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1775                                                  (UV)bits);
1776     }
1777
1778     /* create and initialize $swatch */
1779     swatch = newSVpvs("");
1780     scur   = octets ? (span * octets) : (span + 7) / 8;
1781     SvGROW(swatch, scur + 1);
1782     s = (U8*)SvPVX(swatch);
1783     if (octets && none) {
1784         const U8* const e = s + scur;
1785         while (s < e) {
1786             if (bits == 8)
1787                 *s++ = (U8)(none & 0xff);
1788             else if (bits == 16) {
1789                 *s++ = (U8)((none >>  8) & 0xff);
1790                 *s++ = (U8)( none        & 0xff);
1791             }
1792             else if (bits == 32) {
1793                 *s++ = (U8)((none >> 24) & 0xff);
1794                 *s++ = (U8)((none >> 16) & 0xff);
1795                 *s++ = (U8)((none >>  8) & 0xff);
1796                 *s++ = (U8)( none        & 0xff);
1797             }
1798         }
1799         *s = '\0';
1800     }
1801     else {
1802         (void)memzero((U8*)s, scur + 1);
1803     }
1804     SvCUR_set(swatch, scur);
1805     s = (U8*)SvPVX(swatch);
1806
1807     /* read $swash->{LIST} */
1808     l = (U8*)SvPV(*listsvp, lcur);
1809     lend = l + lcur;
1810     while (l < lend) {
1811         UV min, max, val;
1812         STRLEN numlen;
1813         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1814
1815         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1816
1817         numlen = lend - l;
1818         min = grok_hex((char *)l, &numlen, &flags, NULL);
1819         if (numlen)
1820             l += numlen;
1821         else if (nl) {
1822             l = nl + 1; /* 1 is length of "\n" */
1823             continue;
1824         }
1825         else {
1826             l = lend; /* to LIST's end at which \n is not found */
1827             break;
1828         }
1829
1830         if (isBLANK(*l)) {
1831             ++l;
1832             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1833             numlen = lend - l;
1834             max = grok_hex((char *)l, &numlen, &flags, NULL);
1835             if (numlen)
1836                 l += numlen;
1837             else
1838                 max = min;
1839
1840             if (octets) {
1841                 if (isBLANK(*l)) {
1842                     ++l;
1843                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1844                             PERL_SCAN_DISALLOW_PREFIX;
1845                     numlen = lend - l;
1846                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1847                     if (numlen)
1848                         l += numlen;
1849                     else
1850                         val = 0;
1851                 }
1852                 else {
1853                     val = 0;
1854                     if (typeto) {
1855                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1856                                          typestr, l);
1857                     }
1858                 }
1859             }
1860             else
1861                 val = 0; /* bits == 1, then val should be ignored */
1862         }
1863         else {
1864             max = min;
1865             if (octets) {
1866                 val = 0;
1867                 if (typeto) {
1868                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1869                 }
1870             }
1871             else
1872                 val = 0; /* bits == 1, then val should be ignored */
1873         }
1874
1875         if (nl)
1876             l = nl + 1;
1877         else
1878             l = lend;
1879
1880         if (max < start)
1881             continue;
1882
1883         if (octets) {
1884             UV key;
1885             if (min < start) {
1886                 if (!none || val < none) {
1887                     val += start - min;
1888                 }
1889                 min = start;
1890             }
1891             for (key = min; key <= max; key++) {
1892                 STRLEN offset;
1893                 if (key >= end)
1894                     goto go_out_list;
1895                 /* offset must be non-negative (start <= min <= key < end) */
1896                 offset = octets * (key - start);
1897                 if (bits == 8)
1898                     s[offset] = (U8)(val & 0xff);
1899                 else if (bits == 16) {
1900                     s[offset    ] = (U8)((val >>  8) & 0xff);
1901                     s[offset + 1] = (U8)( val        & 0xff);
1902                 }
1903                 else if (bits == 32) {
1904                     s[offset    ] = (U8)((val >> 24) & 0xff);
1905                     s[offset + 1] = (U8)((val >> 16) & 0xff);
1906                     s[offset + 2] = (U8)((val >>  8) & 0xff);
1907                     s[offset + 3] = (U8)( val        & 0xff);
1908                 }
1909
1910                 if (!none || val < none)
1911                     ++val;
1912             }
1913         }
1914         else { /* bits == 1, then val should be ignored */
1915             UV key;
1916             if (min < start)
1917                 min = start;
1918             for (key = min; key <= max; key++) {
1919                 const STRLEN offset = (STRLEN)(key - start);
1920                 if (key >= end)
1921                     goto go_out_list;
1922                 s[offset >> 3] |= 1 << (offset & 7);
1923             }
1924         }
1925     } /* while */
1926   go_out_list:
1927
1928     /* read $swash->{EXTRAS} */
1929     x = (U8*)SvPV(*extssvp, xcur);
1930     xend = x + xcur;
1931     while (x < xend) {
1932         STRLEN namelen;
1933         U8 *namestr;
1934         SV** othersvp;
1935         HV* otherhv;
1936         STRLEN otherbits;
1937         SV **otherbitssvp, *other;
1938         U8 *s, *o, *nl;
1939         STRLEN slen, olen;
1940
1941         const U8 opc = *x++;
1942         if (opc == '\n')
1943             continue;
1944
1945         nl = (U8*)memchr(x, '\n', xend - x);
1946
1947         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1948             if (nl) {
1949                 x = nl + 1; /* 1 is length of "\n" */
1950                 continue;
1951             }
1952             else {
1953                 x = xend; /* to EXTRAS' end at which \n is not found */
1954                 break;
1955             }
1956         }
1957
1958         namestr = x;
1959         if (nl) {
1960             namelen = nl - namestr;
1961             x = nl + 1;
1962         }
1963         else {
1964             namelen = xend - namestr;
1965             x = xend;
1966         }
1967
1968         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1969         otherhv = (HV*)SvRV(*othersvp);
1970         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1971         otherbits = (STRLEN)SvUV(*otherbitssvp);
1972         if (bits < otherbits)
1973             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1974
1975         /* The "other" swatch must be destroyed after. */
1976         other = swash_get(*othersvp, start, span);
1977         o = (U8*)SvPV(other, olen);
1978
1979         if (!olen)
1980             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1981
1982         s = (U8*)SvPV(swatch, slen);
1983         if (bits == 1 && otherbits == 1) {
1984             if (slen != olen)
1985                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
1986
1987             switch (opc) {
1988             case '+':
1989                 while (slen--)
1990                     *s++ |= *o++;
1991                 break;
1992             case '!':
1993                 while (slen--)
1994                     *s++ |= ~*o++;
1995                 break;
1996             case '-':
1997                 while (slen--)
1998                     *s++ &= ~*o++;
1999                 break;
2000             case '&':
2001                 while (slen--)
2002                     *s++ &= *o++;
2003                 break;
2004             default:
2005                 break;
2006             }
2007         }
2008         else {
2009             STRLEN otheroctets = otherbits >> 3;
2010             STRLEN offset = 0;
2011             U8* const send = s + slen;
2012
2013             while (s < send) {
2014                 UV otherval = 0;
2015
2016                 if (otherbits == 1) {
2017                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2018                     ++offset;
2019                 }
2020                 else {
2021                     STRLEN vlen = otheroctets;
2022                     otherval = *o++;
2023                     while (--vlen) {
2024                         otherval <<= 8;
2025                         otherval |= *o++;
2026                     }
2027                 }
2028
2029                 if (opc == '+' && otherval)
2030                     NOOP;   /* replace with otherval */
2031                 else if (opc == '!' && !otherval)
2032                     otherval = 1;
2033                 else if (opc == '-' && otherval)
2034                     otherval = 0;
2035                 else if (opc == '&' && !otherval)
2036                     otherval = 0;
2037                 else {
2038                     s += octets; /* no replacement */
2039                     continue;
2040                 }
2041
2042                 if (bits == 8)
2043                     *s++ = (U8)( otherval & 0xff);
2044                 else if (bits == 16) {
2045                     *s++ = (U8)((otherval >>  8) & 0xff);
2046                     *s++ = (U8)( otherval        & 0xff);
2047                 }
2048                 else if (bits == 32) {
2049                     *s++ = (U8)((otherval >> 24) & 0xff);
2050                     *s++ = (U8)((otherval >> 16) & 0xff);
2051                     *s++ = (U8)((otherval >>  8) & 0xff);
2052                     *s++ = (U8)( otherval        & 0xff);
2053                 }
2054             }
2055         }
2056         sv_free(other); /* through with it! */
2057     } /* while */
2058     return swatch;
2059 }
2060
2061 /*
2062 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2063
2064 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2065 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2066 bytes available. The return value is the pointer to the byte after the
2067 end of the new character. In other words,
2068
2069     d = uvchr_to_utf8(d, uv);
2070
2071 is the recommended wide native character-aware way of saying
2072
2073     *(d++) = uv;
2074
2075 =cut
2076 */
2077
2078 /* On ASCII machines this is normally a macro but we want a
2079    real function in case XS code wants it
2080 */
2081 U8 *
2082 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2083 {
2084     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2085 }
2086
2087 U8 *
2088 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2089 {
2090     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2091 }
2092
2093 /*
2094 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2095 flags
2096
2097 Returns the native character value of the first character in the string
2098 C<s>
2099 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2100 length, in bytes, of that character.
2101
2102 Allows length and flags to be passed to low level routine.
2103
2104 =cut
2105 */
2106 /* On ASCII machines this is normally a macro but we want
2107    a real function in case XS code wants it
2108 */
2109 UV
2110 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2111 U32 flags)
2112 {
2113     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2114     return UNI_TO_NATIVE(uv);
2115 }
2116
2117 /*
2118 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2119
2120 Build to the scalar dsv a displayable version of the string spv,
2121 length len, the displayable version being at most pvlim bytes long
2122 (if longer, the rest is truncated and "..." will be appended).
2123
2124 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2125 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2126 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2127 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2128 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2129 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2130
2131 The pointer to the PV of the dsv is returned.
2132
2133 =cut */
2134 char *
2135 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2136 {
2137     int truncated = 0;
2138     const char *s, *e;
2139
2140     sv_setpvn(dsv, "", 0);
2141     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2142          UV u;
2143           /* This serves double duty as a flag and a character to print after
2144              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2145           */
2146          char ok = 0;
2147
2148          if (pvlim && SvCUR(dsv) >= pvlim) {
2149               truncated++;
2150               break;
2151          }
2152          u = utf8_to_uvchr((U8*)s, 0);
2153          if (u < 256) {
2154              const unsigned char c = (unsigned char)u & 0xFF;
2155              if (flags & UNI_DISPLAY_BACKSLASH) {
2156                  switch (c) {
2157                  case '\n':
2158                      ok = 'n'; break;
2159                  case '\r':
2160                      ok = 'r'; break;
2161                  case '\t':
2162                      ok = 't'; break;
2163                  case '\f':
2164                      ok = 'f'; break;
2165                  case '\a':
2166                      ok = 'a'; break;
2167                  case '\\':
2168                      ok = '\\'; break;
2169                  default: break;
2170                  }
2171                  if (ok) {
2172                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2173                  }
2174              }
2175              /* isPRINT() is the locale-blind version. */
2176              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2177                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2178                  ok = 1;
2179              }
2180          }
2181          if (!ok)
2182              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2183     }
2184     if (truncated)
2185          sv_catpvs(dsv, "...");
2186
2187     return SvPVX(dsv);
2188 }
2189
2190 /*
2191 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2192
2193 Build to the scalar dsv a displayable version of the scalar sv,
2194 the displayable version being at most pvlim bytes long
2195 (if longer, the rest is truncated and "..." will be appended).
2196
2197 The flags argument is as in pv_uni_display().
2198
2199 The pointer to the PV of the dsv is returned.
2200
2201 =cut
2202 */
2203 char *
2204 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2205 {
2206      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2207                                 SvCUR(ssv), pvlim, flags);
2208 }
2209
2210 /*
2211 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2212
2213 Return true if the strings s1 and s2 differ case-insensitively, false
2214 if not (if they are equal case-insensitively).  If u1 is true, the
2215 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2216 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2217 are false, the respective string is assumed to be in native 8-bit
2218 encoding.
2219
2220 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2221 in there (they will point at the beginning of the I<next> character).
2222 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2223 pointers beyond which scanning will not continue under any
2224 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2225 s2+l2 will be used as goal end pointers that will also stop the scan,
2226 and which qualify towards defining a successful match: all the scans
2227 that define an explicit length must reach their goal pointers for
2228 a match to succeed).
2229
2230 For case-insensitiveness, the "casefolding" of Unicode is used
2231 instead of upper/lowercasing both the characters, see
2232 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2233
2234 =cut */
2235 I32
2236 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2237 {
2238      dVAR;
2239      register const U8 *p1  = (const U8*)s1;
2240      register const U8 *p2  = (const U8*)s2;
2241      register const U8 *f1 = NULL;
2242      register const U8 *f2 = NULL;
2243      register U8 *e1 = NULL;
2244      register U8 *q1 = NULL;
2245      register U8 *e2 = NULL;
2246      register U8 *q2 = NULL;
2247      STRLEN n1 = 0, n2 = 0;
2248      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2249      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2250      U8 natbuf[1+1];
2251      STRLEN foldlen1, foldlen2;
2252      bool match;
2253
2254      if (pe1)
2255           e1 = *(U8**)pe1;
2256      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2257           f1 = (const U8*)s1 + l1;
2258      if (pe2)
2259           e2 = *(U8**)pe2;
2260      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2261           f2 = (const U8*)s2 + l2;
2262
2263      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2264           return 1; /* mismatch; possible infinite loop or false positive */
2265
2266      if (!u1 || !u2)
2267           natbuf[1] = 0; /* Need to terminate the buffer. */
2268
2269      while ((e1 == 0 || p1 < e1) &&
2270             (f1 == 0 || p1 < f1) &&
2271             (e2 == 0 || p2 < e2) &&
2272             (f2 == 0 || p2 < f2)) {
2273           if (n1 == 0) {
2274                if (u1)
2275                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2276                else {
2277                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2278                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2279                }
2280                q1 = foldbuf1;
2281                n1 = foldlen1;
2282           }
2283           if (n2 == 0) {
2284                if (u2)
2285                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2286                else {
2287                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2288                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2289                }
2290                q2 = foldbuf2;
2291                n2 = foldlen2;
2292           }
2293           while (n1 && n2) {
2294                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2295                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2296                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2297                    return 1; /* mismatch */
2298                n1 -= UTF8SKIP(q1);
2299                q1 += UTF8SKIP(q1);
2300                n2 -= UTF8SKIP(q2);
2301                q2 += UTF8SKIP(q2);
2302           }
2303           if (n1 == 0)
2304                p1 += u1 ? UTF8SKIP(p1) : 1;
2305           if (n2 == 0)
2306                p2 += u2 ? UTF8SKIP(p2) : 1;
2307
2308      }
2309
2310      /* A match is defined by all the scans that specified
2311       * an explicit length reaching their final goals. */
2312      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2313
2314      if (match) {
2315           if (pe1)
2316                *pe1 = (char*)p1;
2317           if (pe2)
2318                *pe2 = (char*)p2;
2319      }
2320
2321      return match ? 0 : 1; /* 0 match, 1 mismatch */
2322 }
2323
2324 /*
2325  * Local variables:
2326  * c-indentation-style: bsd
2327  * c-basic-offset: 4
2328  * indent-tabs-mode: t
2329  * End:
2330  *
2331  * ex: set ts=8 sts=4 sw=4 noet:
2332  */