utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 /*
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 This is the "slow" version as opposed to the "fast" version which is
 178 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 179 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 181 you should use the _slow().  In practice this means that the _slow()
 182 will be used very rarely, since the maximum Unicode code point (as of
 183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 185 five bytes or more.
 186
 187 =cut */
 188 STATIC STRLEN
 189 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 190 {
 191     U8 u = *s;
 192     STRLEN slen;
 193     UV uv, ouv;
 194
 195     if (UTF8_IS_INVARIANT(u))
 196         return 1;
 197
 198     if (!UTF8_IS_START(u))
 199         return 0;
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206 #ifdef EBCDIC
 207     u = NATIVE_TO_UTF(u);
 208 #endif
 209     u &= UTF_START_MASK(len);
 210     uv  = u;
 211     ouv = uv;
 212     while (slen--) {
 213         if (!UTF8_IS_CONTINUATION(*s))
 214             return 0;
 215         uv = UTF8_ACCUMULATE(uv, *s);
 216         if (uv < ouv)
 217             return 0;
 218         ouv = uv;
 219         s++;
 220     }
 221
 222     if ((STRLEN)UNISKIP(uv) < len)
 223         return 0;
 224
 225     return len;
 226 }
 227
 228 /*
 229 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 230
 231 Tests if some arbitrary number of bytes begins in a valid UTF-8
 232 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 233 UTF-8 character.  The actual number of bytes in the UTF-8 character
 234 will be returned if it is valid, otherwise 0.
 235
 236 =cut */
 237 STRLEN
 238 Perl_is_utf8_char(pTHX_ const U8 *s)
 239 {
 240     const STRLEN len = UTF8SKIP(s);
 241     PERL_UNUSED_CONTEXT;
 242 #ifdef IS_UTF8_CHAR
 243     if (IS_UTF8_CHAR_FAST(len))
 244         return IS_UTF8_CHAR(s, len) ? len : 0;
 245 #endif /* #ifdef IS_UTF8_CHAR */
 246     return is_utf8_char_slow(s, len);
 247 }
 248
 249 /*
 250 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 251
 252 Returns true if first C<len> bytes of the given string form a valid
 253 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 254 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 255 because a valid ASCII string is a valid UTF-8 string.
 256
 257 See also is_utf8_string_loclen() and is_utf8_string_loc().
 258
 259 =cut
 260 */
 261
 262 bool
 263 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 264 {
 265     const U8* x = s;
 266     const U8* send;
 267
 268     PERL_UNUSED_CONTEXT;
 269     if (!len)
 270         len = strlen((const char *)s);
 271     send = s + len;
 272
 273     while (x < send) {
 274         STRLEN c;
 275          /* Inline the easy bits of is_utf8_char() here for speed... */
 276          if (UTF8_IS_INVARIANT(*x))
 277               c = 1;
 278          else if (!UTF8_IS_START(*x))
 279              goto out;
 280          else {
 281               /* ... and call is_utf8_char() only if really needed. */
 282 #ifdef IS_UTF8_CHAR
 283              c = UTF8SKIP(x);
 284              if (IS_UTF8_CHAR_FAST(c)) {
 285                  if (!IS_UTF8_CHAR(x, c))
 286                      goto out;
 287              } else if (!is_utf8_char_slow(x, c))
 288                  goto out;
 289 #else
 290              c = is_utf8_char(x);
 291 #endif /* #ifdef IS_UTF8_CHAR */
 292               if (!c)
 293                   goto out;
 294          }
 295         x += c;
 296     }
 297
 298  out:
 299     if (x != send)
 300         return FALSE;
 301
 302     return TRUE;
 303 }
 304
 305 /*
 306 Implemented as a macro in utf8.h
 307
 308 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 309
 310 Like is_utf8_string() but stores the location of the failure (in the
 311 case of "utf8ness failure") or the location s+len (in the case of
 312 "utf8ness success") in the C<ep>.
 313
 314 See also is_utf8_string_loclen() and is_utf8_string().
 315
 316 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 317
 318 Like is_utf8_string() but stores the location of the failure (in the
 319 case of "utf8ness failure") or the location s+len (in the case of
 320 "utf8ness success") in the C<ep>, and the number of UTF-8
 321 encoded characters in the C<el>.
 322
 323 See also is_utf8_string_loc() and is_utf8_string().
 324
 325 =cut
 326 */
 327
 328 bool
 329 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 330 {
 331     const U8* x = s;
 332     const U8* send;
 333     STRLEN c;
 334     PERL_UNUSED_CONTEXT;
 335
 336     if (!len)
 337         len = strlen((const char *)s);
 338     send = s + len;
 339     if (el)
 340         *el = 0;
 341
 342     while (x < send) {
 343          /* Inline the easy bits of is_utf8_char() here for speed... */
 344          if (UTF8_IS_INVARIANT(*x))
 345              c = 1;
 346          else if (!UTF8_IS_START(*x))
 347              goto out;
 348          else {
 349              /* ... and call is_utf8_char() only if really needed. */
 350 #ifdef IS_UTF8_CHAR
 351              c = UTF8SKIP(x);
 352              if (IS_UTF8_CHAR_FAST(c)) {
 353                  if (!IS_UTF8_CHAR(x, c))
 354                      c = 0;
 355              } else
 356                  c = is_utf8_char_slow(x, c);
 357 #else
 358              c = is_utf8_char(x);
 359 #endif /* #ifdef IS_UTF8_CHAR */
 360              if (!c)
 361                  goto out;
 362          }
 363          x += c;
 364          if (el)
 365              (*el)++;
 366     }
 367
 368  out:
 369     if (ep)
 370         *ep = x;
 371     if (x != send)
 372         return FALSE;
 373
 374     return TRUE;
 375 }
 376
 377 /*
 378
 379 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 380
 381 Bottom level UTF-8 decode routine.
 382 Returns the unicode code point value of the first character in the string C<s>
 383 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 384 C<retlen> will be set to the length, in bytes, of that character.
 385
 386 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 387 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 388 it is assumed that the caller will raise a warning, and this function
 389 will silently just set C<retlen> to C<-1> and return zero.  If the
 390 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 391 malformations will be given, C<retlen> will be set to the expected
 392 length of the UTF-8 character in bytes, and zero will be returned.
 393
 394 The C<flags> can also contain various flags to allow deviations from
 395 the strict UTF-8 encoding (see F<utf8.h>).
 396
 397 Most code should use utf8_to_uvchr() rather than call this directly.
 398
 399 =cut
 400 */
 401
 402 UV
 403 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 404 {
 405     dVAR;
 406     const U8 *s0 = s;
 407     UV uv = *s, ouv = 0;
 408     STRLEN len = 1;
 409     const bool dowarn = ckWARN_d(WARN_UTF8);
 410     const UV startbyte = *s;
 411     STRLEN expectlen = 0;
 412     U32 warning = 0;
 413
 414 /* This list is a superset of the UTF8_ALLOW_XXX. */
 415
 416 #define UTF8_WARN_EMPTY                          1
 417 #define UTF8_WARN_CONTINUATION                   2
 418 #define UTF8_WARN_NON_CONTINUATION               3
 419 #define UTF8_WARN_FE_FF                          4
 420 #define UTF8_WARN_SHORT                          5
 421 #define UTF8_WARN_OVERFLOW                       6
 422 #define UTF8_WARN_SURROGATE                      7
 423 #define UTF8_WARN_LONG                           8
 424 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 425
 426     if (curlen == 0 &&
 427         !(flags & UTF8_ALLOW_EMPTY)) {
 428         warning = UTF8_WARN_EMPTY;
 429         goto malformed;
 430     }
 431
 432     if (UTF8_IS_INVARIANT(uv)) {
 433         if (retlen)
 434             *retlen = 1;
 435         return (UV) (NATIVE_TO_UTF(*s));
 436     }
 437
 438     if (UTF8_IS_CONTINUATION(uv) &&
 439         !(flags & UTF8_ALLOW_CONTINUATION)) {
 440         warning = UTF8_WARN_CONTINUATION;
 441         goto malformed;
 442     }
 443
 444     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 445         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 446         warning = UTF8_WARN_NON_CONTINUATION;
 447         goto malformed;
 448     }
 449
 450 #ifdef EBCDIC
 451     uv = NATIVE_TO_UTF(uv);
 452 #else
 453     if ((uv == 0xfe || uv == 0xff) &&
 454         !(flags & UTF8_ALLOW_FE_FF)) {
 455         warning = UTF8_WARN_FE_FF;
 456         goto malformed;
 457     }
 458 #endif
 459
 460     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 461     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 462     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 463     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 464 #ifdef EBCDIC
 465     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 466     else                        { len =  7; uv &= 0x01; }
 467 #else
 468     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 469     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 470     else                        { len = 13; uv = 0; } /* whoa! */
 471 #endif
 472
 473     if (retlen)
 474         *retlen = len;
 475
 476     expectlen = len;
 477
 478     if ((curlen < expectlen) &&
 479         !(flags & UTF8_ALLOW_SHORT)) {
 480         warning = UTF8_WARN_SHORT;
 481         goto malformed;
 482     }
 483
 484     len--;
 485     s++;
 486     ouv = uv;
 487
 488     while (len--) {
 489         if (!UTF8_IS_CONTINUATION(*s) &&
 490             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 491             s--;
 492             warning = UTF8_WARN_NON_CONTINUATION;
 493             goto malformed;
 494         }
 495         else
 496             uv = UTF8_ACCUMULATE(uv, *s);
 497         if (!(uv > ouv)) {
 498             /* These cannot be allowed. */
 499             if (uv == ouv) {
 500                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 501                     warning = UTF8_WARN_LONG;
 502                     goto malformed;
 503                 }
 504             }
 505             else { /* uv < ouv */
 506                 /* This cannot be allowed. */
 507                 warning = UTF8_WARN_OVERFLOW;
 508                 goto malformed;
 509             }
 510         }
 511         s++;
 512         ouv = uv;
 513     }
 514
 515     if (UNICODE_IS_SURROGATE(uv) &&
 516         !(flags & UTF8_ALLOW_SURROGATE)) {
 517         warning = UTF8_WARN_SURROGATE;
 518         goto malformed;
 519     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 520                !(flags & UTF8_ALLOW_LONG)) {
 521         warning = UTF8_WARN_LONG;
 522         goto malformed;
 523     } else if (UNICODE_IS_ILLEGAL(uv) &&
 524                !(flags & UTF8_ALLOW_FFFF)) {
 525         warning = UTF8_WARN_FFFF;
 526         goto malformed;
 527     }
 528
 529     return uv;
 530
 531 malformed:
 532
 533     if (flags & UTF8_CHECK_ONLY) {
 534         if (retlen)
 535             *retlen = -1;
 536         return 0;
 537     }
 538
 539     if (dowarn) {
 540         SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
 541
 542         switch (warning) {
 543         case 0: /* Intentionally empty. */ break;
 544         case UTF8_WARN_EMPTY:
 545             sv_catpvs(sv, "(empty string)");
 546             break;
 547         case UTF8_WARN_CONTINUATION:
 548             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 549             break;
 550         case UTF8_WARN_NON_CONTINUATION:
 551             if (s == s0)
 552                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 553                            (UV)s[1], startbyte);
 554             else {
 555                 const int len = (int)(s-s0);
 556                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 557                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 558             }
 559
 560             break;
 561         case UTF8_WARN_FE_FF:
 562             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 563             break;
 564         case UTF8_WARN_SHORT:
 565             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 566                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 567             expectlen = curlen;         /* distance for caller to skip */
 568             break;
 569         case UTF8_WARN_OVERFLOW:
 570             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 571                            ouv, *s, startbyte);
 572             break;
 573         case UTF8_WARN_SURROGATE:
 574             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 575             break;
 576         case UTF8_WARN_LONG:
 577             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 578                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 579             break;
 580         case UTF8_WARN_FFFF:
 581             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 582             break;
 583         default:
 584             sv_catpvs(sv, "(unknown reason)");
 585             break;
 586         }
 587
 588         if (warning) {
 589             const char * const s = SvPVX_const(sv);
 590
 591             if (PL_op)
 592                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 593                             "%s in %s", s,  OP_DESC(PL_op));
 594             else
 595                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 596         }
 597     }
 598
 599     if (retlen)
 600         *retlen = expectlen ? expectlen : len;
 601
 602     return 0;
 603 }
 604
 605 /*
 606 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 607
 608 Returns the native character value of the first character in the string C<s>
 609 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 610 length, in bytes, of that character.
 611
 612 If C<s> does not point to a well-formed UTF-8 character, zero is
 613 returned and retlen is set, if possible, to -1.
 614
 615 =cut
 616 */
 617
 618 UV
 619 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 620 {
 621     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 622                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 623 }
 624
 625 /*
 626 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 627
 628 Returns the Unicode code point of the first character in the string C<s>
 629 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 630 length, in bytes, of that character.
 631
 632 This function should only be used when returned UV is considered
 633 an index into the Unicode semantic tables (e.g. swashes).
 634
 635 If C<s> does not point to a well-formed UTF-8 character, zero is
 636 returned and retlen is set, if possible, to -1.
 637
 638 =cut
 639 */
 640
 641 UV
 642 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 643 {
 644     /* Call the low level routine asking for checks */
 645     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 646                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 647 }
 648
 649 /*
 650 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 651
 652 Return the length of the UTF-8 char encoded string C<s> in characters.
 653 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 654 up past C<e>, croaks.
 655
 656 =cut
 657 */
 658
 659 STRLEN
 660 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 661 {
 662     dVAR;
 663     STRLEN len = 0;
 664
 665     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 666      * the bitops (especially ~) can create illegal UTF-8.
 667      * In other words: in Perl UTF-8 is not just for Unicode. */
 668
 669     if (e < s)
 670         goto warn_and_return;
 671     while (s < e) {
 672         const U8 t = UTF8SKIP(s);
 673         if (e - s < t) {
 674             warn_and_return:
 675             if (ckWARN_d(WARN_UTF8)) {
 676                 if (PL_op)
 677                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 678                             "%s in %s", unees, OP_DESC(PL_op));
 679                 else
 680                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 681             }
 682             return len;
 683         }
 684         s += t;
 685         len++;
 686     }
 687
 688     return len;
 689 }
 690
 691 /*
 692 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 693
 694 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 695 and C<b>.
 696
 697 WARNING: use only if you *know* that the pointers point inside the
 698 same UTF-8 buffer.
 699
 700 =cut
 701 */
 702
 703 IV
 704 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 705 {
 706     dVAR;
 707     IV off = 0;
 708     IV sign = 1;
 709
 710     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 711      * the bitops (especially ~) can create illegal UTF-8.
 712      * In other words: in Perl UTF-8 is not just for Unicode. */
 713
 714     if (a < b) {
 715         const U8 *const temp = a;
 716         a = b;
 717         b = temp;
 718         sign = -1;
 719     }
 720
 721     while (b < a) {
 722         const U8 c = UTF8SKIP(b);
 723
 724         if (a - b < c) {
 725             if (ckWARN_d(WARN_UTF8)) {
 726                 if (PL_op)
 727                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 728                                 "%s in %s", unees, OP_DESC(PL_op));
 729                 else
 730                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 731             }
 732             return off * sign;
 733         }
 734         b += c;
 735         off++;
 736     }
 737
 738     return off * sign;
 739 }
 740
 741 /*
 742 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 743
 744 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 745 forward or backward.
 746
 747 WARNING: do not use the following unless you *know* C<off> is within
 748 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 749 on the first byte of character or just after the last byte of a character.
 750
 751 =cut
 752 */
 753
 754 U8 *
 755 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 756 {
 757     PERL_UNUSED_CONTEXT;
 758     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 759      * the bitops (especially ~) can create illegal UTF-8.
 760      * In other words: in Perl UTF-8 is not just for Unicode. */
 761
 762     if (off >= 0) {
 763         while (off--)
 764             s += UTF8SKIP(s);
 765     }
 766     else {
 767         while (off++) {
 768             s--;
 769             while (UTF8_IS_CONTINUATION(*s))
 770                 s--;
 771         }
 772     }
 773     return (U8 *)s;
 774 }
 775
 776 /*
 777 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 778
 779 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 780 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 781 updates len to contain the new length.
 782 Returns zero on failure, setting C<len> to -1.
 783
 784 =cut
 785 */
 786
 787 U8 *
 788 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 789 {
 790     U8 *send;
 791     U8 *d;
 792     U8 *save = s;
 793
 794     /* ensure valid UTF-8 and chars < 256 before updating string */
 795     for (send = s + *len; s < send; ) {
 796         U8 c = *s++;
 797
 798         if (!UTF8_IS_INVARIANT(c) &&
 799             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 800              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 801             *len = -1;
 802             return 0;
 803         }
 804     }
 805
 806     d = s = save;
 807     while (s < send) {
 808         STRLEN ulen;
 809         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 810         s += ulen;
 811     }
 812     *d = '\0';
 813     *len = d - save;
 814     return save;
 815 }
 816
 817 /*
 818 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 819
 820 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 821 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 822 the newly-created string, and updates C<len> to contain the new
 823 length.  Returns the original string if no conversion occurs, C<len>
 824 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 825 0 if C<s> is converted or contains all 7bit characters.
 826
 827 =cut
 828 */
 829
 830 U8 *
 831 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 832 {
 833     U8 *d;
 834     const U8 *start = s;
 835     const U8 *send;
 836     I32 count = 0;
 837
 838     PERL_UNUSED_CONTEXT;
 839     if (!*is_utf8)
 840         return (U8 *)start;
 841
 842     /* ensure valid UTF-8 and chars < 256 before converting string */
 843     for (send = s + *len; s < send;) {
 844         U8 c = *s++;
 845         if (!UTF8_IS_INVARIANT(c)) {
 846             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 847                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 848                 count++;
 849             else
 850                 return (U8 *)start;
 851         }
 852     }
 853
 854     *is_utf8 = 0;
 855
 856     Newx(d, (*len) - count + 1, U8);
 857     s = start; start = d;
 858     while (s < send) {
 859         U8 c = *s++;
 860         if (!UTF8_IS_INVARIANT(c)) {
 861             /* Then it is two-byte encoded */
 862             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 863             c = ASCII_TO_NATIVE(c);
 864         }
 865         *d++ = c;
 866     }
 867     *d = '\0';
 868     *len = d - start;
 869     return (U8 *)start;
 870 }
 871
 872 /*
 873 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 874
 875 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 876 Returns a pointer to the newly-created string, and sets C<len> to
 877 reflect the new length.
 878
 879 If you want to convert to UTF-8 from other encodings than ASCII,
 880 see sv_recode_to_utf8().
 881
 882 =cut
 883 */
 884
 885 U8*
 886 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 887 {
 888     const U8 * const send = s + (*len);
 889     U8 *d;
 890     U8 *dst;
 891     PERL_UNUSED_CONTEXT;
 892
 893     Newx(d, (*len) * 2 + 1, U8);
 894     dst = d;
 895
 896     while (s < send) {
 897         const UV uv = NATIVE_TO_ASCII(*s++);
 898         if (UNI_IS_INVARIANT(uv))
 899             *d++ = (U8)UTF_TO_NATIVE(uv);
 900         else {
 901             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 902             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 903         }
 904     }
 905     *d = '\0';
 906     *len = d-dst;
 907     return dst;
 908 }
 909
 910 /*
 911  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 912  *
 913  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 914  * We optimize for native, for obvious reasons. */
 915
 916 U8*
 917 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 918 {
 919     U8* pend;
 920     U8* dstart = d;
 921
 922     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 923          d[0] = 0;
 924          *newlen = 1;
 925          return d;
 926     }
 927
 928     if (bytelen & 1)
 929         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 930
 931     pend = p + bytelen;
 932
 933     while (p < pend) {
 934         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 935         p += 2;
 936         if (uv < 0x80) {
 937             *d++ = (U8)uv;
 938             continue;
 939         }
 940         if (uv < 0x800) {
 941             *d++ = (U8)(( uv >>  6)         | 0xc0);
 942             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 943             continue;
 944         }
 945         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 946             UV low = (p[0] << 8) + p[1];
 947             p += 2;
 948             if (low < 0xdc00 || low >= 0xdfff)
 949                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 950             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 951         }
 952         if (uv < 0x10000) {
 953             *d++ = (U8)(( uv >> 12)         | 0xe0);
 954             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 955             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 956             continue;
 957         }
 958         else {
 959             *d++ = (U8)(( uv >> 18)         | 0xf0);
 960             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 961             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 962             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 963             continue;
 964         }
 965     }
 966     *newlen = d - dstart;
 967     return d;
 968 }
 969
 970 /* Note: this one is slightly destructive of the source. */
 971
 972 U8*
 973 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 974 {
 975     U8* s = (U8*)p;
 976     U8* send = s + bytelen;
 977     while (s < send) {
 978         U8 tmp = s[0];
 979         s[0] = s[1];
 980         s[1] = tmp;
 981         s += 2;
 982     }
 983     return utf16_to_utf8(p, d, bytelen, newlen);
 984 }
 985
 986 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 987
 988 bool
 989 Perl_is_uni_alnum(pTHX_ UV c)
 990 {
 991     U8 tmpbuf[UTF8_MAXBYTES+1];
 992     uvchr_to_utf8(tmpbuf, c);
 993     return is_utf8_alnum(tmpbuf);
 994 }
 995
 996 bool
 997 Perl_is_uni_alnumc(pTHX_ UV c)
 998 {
 999     U8 tmpbuf[UTF8_MAXBYTES+1];
1000     uvchr_to_utf8(tmpbuf, c);
1001     return is_utf8_alnumc(tmpbuf);
1002 }
1003
1004 bool
1005 Perl_is_uni_idfirst(pTHX_ UV c)
1006 {
1007     U8 tmpbuf[UTF8_MAXBYTES+1];
1008     uvchr_to_utf8(tmpbuf, c);
1009     return is_utf8_idfirst(tmpbuf);
1010 }
1011
1012 bool
1013 Perl_is_uni_alpha(pTHX_ UV c)
1014 {
1015     U8 tmpbuf[UTF8_MAXBYTES+1];
1016     uvchr_to_utf8(tmpbuf, c);
1017     return is_utf8_alpha(tmpbuf);
1018 }
1019
1020 bool
1021 Perl_is_uni_ascii(pTHX_ UV c)
1022 {
1023     U8 tmpbuf[UTF8_MAXBYTES+1];
1024     uvchr_to_utf8(tmpbuf, c);
1025     return is_utf8_ascii(tmpbuf);
1026 }
1027
1028 bool
1029 Perl_is_uni_space(pTHX_ UV c)
1030 {
1031     U8 tmpbuf[UTF8_MAXBYTES+1];
1032     uvchr_to_utf8(tmpbuf, c);
1033     return is_utf8_space(tmpbuf);
1034 }
1035
1036 bool
1037 Perl_is_uni_digit(pTHX_ UV c)
1038 {
1039     U8 tmpbuf[UTF8_MAXBYTES+1];
1040     uvchr_to_utf8(tmpbuf, c);
1041     return is_utf8_digit(tmpbuf);
1042 }
1043
1044 bool
1045 Perl_is_uni_upper(pTHX_ UV c)
1046 {
1047     U8 tmpbuf[UTF8_MAXBYTES+1];
1048     uvchr_to_utf8(tmpbuf, c);
1049     return is_utf8_upper(tmpbuf);
1050 }
1051
1052 bool
1053 Perl_is_uni_lower(pTHX_ UV c)
1054 {
1055     U8 tmpbuf[UTF8_MAXBYTES+1];
1056     uvchr_to_utf8(tmpbuf, c);
1057     return is_utf8_lower(tmpbuf);
1058 }
1059
1060 bool
1061 Perl_is_uni_cntrl(pTHX_ UV c)
1062 {
1063     U8 tmpbuf[UTF8_MAXBYTES+1];
1064     uvchr_to_utf8(tmpbuf, c);
1065     return is_utf8_cntrl(tmpbuf);
1066 }
1067
1068 bool
1069 Perl_is_uni_graph(pTHX_ UV c)
1070 {
1071     U8 tmpbuf[UTF8_MAXBYTES+1];
1072     uvchr_to_utf8(tmpbuf, c);
1073     return is_utf8_graph(tmpbuf);
1074 }
1075
1076 bool
1077 Perl_is_uni_print(pTHX_ UV c)
1078 {
1079     U8 tmpbuf[UTF8_MAXBYTES+1];
1080     uvchr_to_utf8(tmpbuf, c);
1081     return is_utf8_print(tmpbuf);
1082 }
1083
1084 bool
1085 Perl_is_uni_punct(pTHX_ UV c)
1086 {
1087     U8 tmpbuf[UTF8_MAXBYTES+1];
1088     uvchr_to_utf8(tmpbuf, c);
1089     return is_utf8_punct(tmpbuf);
1090 }
1091
1092 bool
1093 Perl_is_uni_xdigit(pTHX_ UV c)
1094 {
1095     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1096     uvchr_to_utf8(tmpbuf, c);
1097     return is_utf8_xdigit(tmpbuf);
1098 }
1099
1100 UV
1101 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1102 {
1103     uvchr_to_utf8(p, c);
1104     return to_utf8_upper(p, p, lenp);
1105 }
1106
1107 UV
1108 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1109 {
1110     uvchr_to_utf8(p, c);
1111     return to_utf8_title(p, p, lenp);
1112 }
1113
1114 UV
1115 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1116 {
1117     uvchr_to_utf8(p, c);
1118     return to_utf8_lower(p, p, lenp);
1119 }
1120
1121 UV
1122 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1123 {
1124     uvchr_to_utf8(p, c);
1125     return to_utf8_fold(p, p, lenp);
1126 }
1127
1128 /* for now these all assume no locale info available for Unicode > 255 */
1129
1130 bool
1131 Perl_is_uni_alnum_lc(pTHX_ UV c)
1132 {
1133     return is_uni_alnum(c);     /* XXX no locale support yet */
1134 }
1135
1136 bool
1137 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1138 {
1139     return is_uni_alnumc(c);    /* XXX no locale support yet */
1140 }
1141
1142 bool
1143 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1144 {
1145     return is_uni_idfirst(c);   /* XXX no locale support yet */
1146 }
1147
1148 bool
1149 Perl_is_uni_alpha_lc(pTHX_ UV c)
1150 {
1151     return is_uni_alpha(c);     /* XXX no locale support yet */
1152 }
1153
1154 bool
1155 Perl_is_uni_ascii_lc(pTHX_ UV c)
1156 {
1157     return is_uni_ascii(c);     /* XXX no locale support yet */
1158 }
1159
1160 bool
1161 Perl_is_uni_space_lc(pTHX_ UV c)
1162 {
1163     return is_uni_space(c);     /* XXX no locale support yet */
1164 }
1165
1166 bool
1167 Perl_is_uni_digit_lc(pTHX_ UV c)
1168 {
1169     return is_uni_digit(c);     /* XXX no locale support yet */
1170 }
1171
1172 bool
1173 Perl_is_uni_upper_lc(pTHX_ UV c)
1174 {
1175     return is_uni_upper(c);     /* XXX no locale support yet */
1176 }
1177
1178 bool
1179 Perl_is_uni_lower_lc(pTHX_ UV c)
1180 {
1181     return is_uni_lower(c);     /* XXX no locale support yet */
1182 }
1183
1184 bool
1185 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1186 {
1187     return is_uni_cntrl(c);     /* XXX no locale support yet */
1188 }
1189
1190 bool
1191 Perl_is_uni_graph_lc(pTHX_ UV c)
1192 {
1193     return is_uni_graph(c);     /* XXX no locale support yet */
1194 }
1195
1196 bool
1197 Perl_is_uni_print_lc(pTHX_ UV c)
1198 {
1199     return is_uni_print(c);     /* XXX no locale support yet */
1200 }
1201
1202 bool
1203 Perl_is_uni_punct_lc(pTHX_ UV c)
1204 {
1205     return is_uni_punct(c);     /* XXX no locale support yet */
1206 }
1207
1208 bool
1209 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1210 {
1211     return is_uni_xdigit(c);    /* XXX no locale support yet */
1212 }
1213
1214 U32
1215 Perl_to_uni_upper_lc(pTHX_ U32 c)
1216 {
1217     /* XXX returns only the first character -- do not use XXX */
1218     /* XXX no locale support yet */
1219     STRLEN len;
1220     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1221     return (U32)to_uni_upper(c, tmpbuf, &len);
1222 }
1223
1224 U32
1225 Perl_to_uni_title_lc(pTHX_ U32 c)
1226 {
1227     /* XXX returns only the first character XXX -- do not use XXX */
1228     /* XXX no locale support yet */
1229     STRLEN len;
1230     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1231     return (U32)to_uni_title(c, tmpbuf, &len);
1232 }
1233
1234 U32
1235 Perl_to_uni_lower_lc(pTHX_ U32 c)
1236 {
1237     /* XXX returns only the first character -- do not use XXX */
1238     /* XXX no locale support yet */
1239     STRLEN len;
1240     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1241     return (U32)to_uni_lower(c, tmpbuf, &len);
1242 }
1243
1244 static bool
1245 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1246                  const char *const swashname)
1247 {
1248     dVAR;
1249     if (!is_utf8_char(p))
1250         return FALSE;
1251     if (!*swash)
1252         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1253     return swash_fetch(*swash, p, TRUE) != 0;
1254 }
1255
1256 bool
1257 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1258 {
1259     dVAR;
1260     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1261      * descendant of isalnum(3), in other words, it doesn't
1262      * contain the '_'. --jhi */
1263     return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
1264 }
1265
1266 bool
1267 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1268 {
1269     dVAR;
1270     return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
1271 }
1272
1273 bool
1274 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1275 {
1276     dVAR;
1277     if (*p == '_')
1278         return TRUE;
1279     /* is_utf8_idstart would be more logical. */
1280     return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
1281 }
1282
1283 bool
1284 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1285 {
1286     dVAR;
1287     if (*p == '_')
1288         return TRUE;
1289     return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
1290 }
1291
1292 bool
1293 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1294 {
1295     dVAR;
1296     return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
1297 }
1298
1299 bool
1300 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1301 {
1302     dVAR;
1303     return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
1304 }
1305
1306 bool
1307 Perl_is_utf8_space(pTHX_ const U8 *p)
1308 {
1309     dVAR;
1310     return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
1311 }
1312
1313 bool
1314 Perl_is_utf8_digit(pTHX_ const U8 *p)
1315 {
1316     dVAR;
1317     return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
1318 }
1319
1320 bool
1321 Perl_is_utf8_upper(pTHX_ const U8 *p)
1322 {
1323     dVAR;
1324     return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
1325 }
1326
1327 bool
1328 Perl_is_utf8_lower(pTHX_ const U8 *p)
1329 {
1330     dVAR;
1331     return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
1332 }
1333
1334 bool
1335 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1336 {
1337     dVAR;
1338     return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
1339 }
1340
1341 bool
1342 Perl_is_utf8_graph(pTHX_ const U8 *p)
1343 {
1344     dVAR;
1345     return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
1346 }
1347
1348 bool
1349 Perl_is_utf8_print(pTHX_ const U8 *p)
1350 {
1351     dVAR;
1352     return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
1353 }
1354
1355 bool
1356 Perl_is_utf8_punct(pTHX_ const U8 *p)
1357 {
1358     dVAR;
1359     return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
1360 }
1361
1362 bool
1363 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1364 {
1365     dVAR;
1366     return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
1367 }
1368
1369 bool
1370 Perl_is_utf8_mark(pTHX_ const U8 *p)
1371 {
1372     dVAR;
1373     return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
1374 }
1375
1376 /*
1377 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1378
1379 The "p" contains the pointer to the UTF-8 string encoding
1380 the character that is being converted.
1381
1382 The "ustrp" is a pointer to the character buffer to put the
1383 conversion result to.  The "lenp" is a pointer to the length
1384 of the result.
1385
1386 The "swashp" is a pointer to the swash to use.
1387
1388 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1389 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1390 but not always, a multicharacter mapping), is tried first.
1391
1392 The "special" is a string like "utf8::ToSpecLower", which means the
1393 hash %utf8::ToSpecLower.  The access to the hash is through
1394 Perl_to_utf8_case().
1395
1396 The "normal" is a string like "ToLower" which means the swash
1397 %utf8::ToLower.
1398
1399 =cut */
1400
1401 UV
1402 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1403                         SV **swashp, const char *normal, const char *special)
1404 {
1405     dVAR;
1406     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1407     STRLEN len = 0;
1408
1409     const UV uv0 = utf8_to_uvchr(p, NULL);
1410     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1411      * are necessary in EBCDIC, they are redundant no-ops
1412      * in ASCII-ish platforms, and hopefully optimized away. */
1413     const UV uv1 = NATIVE_TO_UNI(uv0);
1414     uvuni_to_utf8(tmpbuf, uv1);
1415
1416     if (!*swashp) /* load on-demand */
1417          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1418
1419     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1420     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1421          /* It might be "special" (sometimes, but not always,
1422           * a multicharacter mapping) */
1423          HV *hv;
1424          SV **svp;
1425
1426          if ((hv  = get_hv(special, FALSE)) &&
1427              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1428              (*svp)) {
1429              const char *s;
1430
1431               s = SvPV_const(*svp, len);
1432               if (len == 1)
1433                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1434               else {
1435 #ifdef EBCDIC
1436                    /* If we have EBCDIC we need to remap the characters
1437                     * since any characters in the low 256 are Unicode
1438                     * code points, not EBCDIC. */
1439                    U8 *t = (U8*)s, *tend = t + len, *d;
1440
1441                    d = tmpbuf;
1442                    if (SvUTF8(*svp)) {
1443                         STRLEN tlen = 0;
1444
1445                         while (t < tend) {
1446                              UV c = utf8_to_uvchr(t, &tlen);
1447                              if (tlen > 0) {
1448                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1449                                   t += tlen;
1450                              }
1451                              else
1452                                   break;
1453                         }
1454                    }
1455                    else {
1456                         while (t < tend) {
1457                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1458                              t++;
1459                         }
1460                    }
1461                    len = d - tmpbuf;
1462                    Copy(tmpbuf, ustrp, len, U8);
1463 #else
1464                    Copy(s, ustrp, len, U8);
1465 #endif
1466               }
1467          }
1468     }
1469
1470     if (!len && *swashp) {
1471          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1472
1473          if (uv2) {
1474               /* It was "normal" (a single character mapping). */
1475               UV uv3 = UNI_TO_NATIVE(uv2);
1476
1477               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1478          }
1479     }
1480
1481     if (!len) /* Neither: just copy. */
1482          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1483
1484     if (lenp)
1485          *lenp = len;
1486
1487     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1488 }
1489
1490 /*
1491 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1492
1493 Convert the UTF-8 encoded character at p to its uppercase version and
1494 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1495 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1496 the uppercase version may be longer than the original character.
1497
1498 The first character of the uppercased version is returned
1499 (but note, as explained above, that there may be more.)
1500
1501 =cut */
1502
1503 UV
1504 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1505 {
1506     dVAR;
1507     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1508                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1509 }
1510
1511 /*
1512 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1513
1514 Convert the UTF-8 encoded character at p to its titlecase version and
1515 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1516 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1517 titlecase version may be longer than the original character.
1518
1519 The first character of the titlecased version is returned
1520 (but note, as explained above, that there may be more.)
1521
1522 =cut */
1523
1524 UV
1525 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1526 {
1527     dVAR;
1528     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1529                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1530 }
1531
1532 /*
1533 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1534
1535 Convert the UTF-8 encoded character at p to its lowercase version and
1536 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1537 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1538 lowercase version may be longer than the original character.
1539
1540 The first character of the lowercased version is returned
1541 (but note, as explained above, that there may be more.)
1542
1543 =cut */
1544
1545 UV
1546 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1547 {
1548     dVAR;
1549     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1550                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1551 }
1552
1553 /*
1554 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1555
1556 Convert the UTF-8 encoded character at p to its foldcase version and
1557 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1558 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1559 foldcase version may be longer than the original character (up to
1560 three characters).
1561
1562 The first character of the foldcased version is returned
1563 (but note, as explained above, that there may be more.)
1564
1565 =cut */
1566
1567 UV
1568 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1569 {
1570     dVAR;
1571     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1572                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1573 }
1574
1575 /* Note:
1576  * A "swash" is a swatch hash.
1577  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1578  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1579  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1580  */
1581 SV*
1582 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1583 {
1584     dVAR;
1585     SV* retval;
1586     SV* const tokenbufsv = sv_newmortal();
1587     dSP;
1588     const size_t pkg_len = strlen(pkg);
1589     const size_t name_len = strlen(name);
1590     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1591     SV* errsv_save;
1592
1593     PUSHSTACKi(PERLSI_MAGIC);
1594     ENTER;
1595     SAVEI32(PL_hints);
1596     PL_hints = 0;
1597     save_re_context();
1598     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1599         ENTER;
1600         errsv_save = newSVsv(ERRSV);
1601         /* It is assumed that callers of this routine are not passing in any
1602            user derived data.  */
1603         /* Need to do this after save_re_context() as it will set PL_tainted to
1604            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1605            Even line to create errsv_save can turn on PL_tainted.  */
1606         SAVEBOOL(PL_tainted);
1607         PL_tainted = 0;
1608         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1609                          NULL);
1610         if (!SvTRUE(ERRSV))
1611             sv_setsv(ERRSV, errsv_save);
1612         SvREFCNT_dec(errsv_save);
1613         LEAVE;
1614     }
1615     SPAGAIN;
1616     PUSHMARK(SP);
1617     EXTEND(SP,5);
1618     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1619     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1620     PUSHs(listsv);
1621     PUSHs(sv_2mortal(newSViv(minbits)));
1622     PUSHs(sv_2mortal(newSViv(none)));
1623     PUTBACK;
1624     if (IN_PERL_COMPILETIME) {
1625         /* XXX ought to be handled by lex_start */
1626         SAVEI32(PL_in_my);
1627         PL_in_my = 0;
1628         sv_setpv(tokenbufsv, PL_tokenbuf);
1629     }
1630     errsv_save = newSVsv(ERRSV);
1631     if (call_method("SWASHNEW", G_SCALAR))
1632         retval = newSVsv(*PL_stack_sp--);
1633     else
1634         retval = &PL_sv_undef;
1635     if (!SvTRUE(ERRSV))
1636         sv_setsv(ERRSV, errsv_save);
1637     SvREFCNT_dec(errsv_save);
1638     LEAVE;
1639     POPSTACK;
1640     if (IN_PERL_COMPILETIME) {
1641         STRLEN len;
1642         const char* const pv = SvPV_const(tokenbufsv, len);
1643
1644         Copy(pv, PL_tokenbuf, len+1, char);
1645         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1646     }
1647     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1648         if (SvPOK(retval))
1649             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1650                        retval);
1651         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1652     }
1653     return retval;
1654 }
1655
1656
1657 /* This API is wrong for special case conversions since we may need to
1658  * return several Unicode characters for a single Unicode character
1659  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1660  * the lower-level routine, and it is similarly broken for returning
1661  * multiple values.  --jhi */
1662 /* Now SWASHGET is recasted into S_swash_get in this file. */
1663
1664 /* Note:
1665  * Returns the value of property/mapping C<swash> for the first character
1666  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1667  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1668  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1669  */
1670 UV
1671 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1672 {
1673     dVAR;
1674     HV* const hv = (HV*)SvRV(swash);
1675     U32 klen;
1676     U32 off;
1677     STRLEN slen;
1678     STRLEN needents;
1679     const U8 *tmps = NULL;
1680     U32 bit;
1681     SV *swatch;
1682     U8 tmputf8[2];
1683     UV c = NATIVE_TO_ASCII(*ptr);
1684
1685     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1686         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1687         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1688         ptr = tmputf8;
1689     }
1690     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1691      * then the "swatch" is a vec() for al the chars which start
1692      * with 0xAA..0xYY
1693      * So the key in the hash (klen) is length of encoded char -1
1694      */
1695     klen = UTF8SKIP(ptr) - 1;
1696     off  = ptr[klen];
1697
1698     if (klen == 0) {
1699       /* If char in invariant then swatch is for all the invariant chars
1700        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1701        */
1702         needents = UTF_CONTINUATION_MARK;
1703         off      = NATIVE_TO_UTF(ptr[klen]);
1704     }
1705     else {
1706       /* If char is encoded then swatch is for the prefix */
1707         needents = (1 << UTF_ACCUMULATION_SHIFT);
1708         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1709     }
1710
1711     /*
1712      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1713      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1714      * it's nothing to sniff at.)  Pity we usually come through at least
1715      * two function calls to get here...
1716      *
1717      * NB: this code assumes that swatches are never modified, once generated!
1718      */
1719
1720     if (hv   == PL_last_swash_hv &&
1721         klen == PL_last_swash_klen &&
1722         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1723     {
1724         tmps = PL_last_swash_tmps;
1725         slen = PL_last_swash_slen;
1726     }
1727     else {
1728         /* Try our second-level swatch cache, kept in a hash. */
1729         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1730
1731         /* If not cached, generate it via swash_get */
1732         if (!svp || !SvPOK(*svp)
1733                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1734             /* We use utf8n_to_uvuni() as we want an index into
1735                Unicode tables, not a native character number.
1736              */
1737             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1738                                            ckWARN(WARN_UTF8) ?
1739                                            0 : UTF8_ALLOW_ANY);
1740             swatch = swash_get(swash,
1741                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1742                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1743                                 needents);
1744
1745             if (IN_PERL_COMPILETIME)
1746                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1747
1748             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1749
1750             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1751                      || (slen << 3) < needents)
1752                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1753         }
1754
1755         PL_last_swash_hv = hv;
1756         PL_last_swash_klen = klen;
1757         /* FIXME change interpvar.h?  */
1758         PL_last_swash_tmps = (U8 *) tmps;
1759         PL_last_swash_slen = slen;
1760         if (klen)
1761             Copy(ptr, PL_last_swash_key, klen, U8);
1762     }
1763
1764     switch ((int)((slen << 3) / needents)) {
1765     case 1:
1766         bit = 1 << (off & 7);
1767         off >>= 3;
1768         return (tmps[off] & bit) != 0;
1769     case 8:
1770         return tmps[off];
1771     case 16:
1772         off <<= 1;
1773         return (tmps[off] << 8) + tmps[off + 1] ;
1774     case 32:
1775         off <<= 2;
1776         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1777     }
1778     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1779     return 0;
1780 }
1781
1782 /* Note:
1783  * Returns a swatch (a bit vector string) for a code point sequence
1784  * that starts from the value C<start> and comprises the number C<span>.
1785  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1786  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1787  */
1788 STATIC SV*
1789 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1790 {
1791     SV *swatch;
1792     U8 *l, *lend, *x, *xend, *s;
1793     STRLEN lcur, xcur, scur;
1794
1795     HV* const hv = (HV*)SvRV(swash);
1796     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1797     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1798     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1799     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1800     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1801     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1802     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1803     const STRLEN bits  = SvUV(*bitssvp);
1804     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1805     const UV     none  = SvUV(*nonesvp);
1806     const UV     end   = start + span;
1807
1808     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1809         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1810                                                  (UV)bits);
1811     }
1812
1813     /* create and initialize $swatch */
1814     swatch = newSVpvs("");
1815     scur   = octets ? (span * octets) : (span + 7) / 8;
1816     SvGROW(swatch, scur + 1);
1817     s = (U8*)SvPVX(swatch);
1818     if (octets && none) {
1819         const U8* const e = s + scur;
1820         while (s < e) {
1821             if (bits == 8)
1822                 *s++ = (U8)(none & 0xff);
1823             else if (bits == 16) {
1824                 *s++ = (U8)((none >>  8) & 0xff);
1825                 *s++ = (U8)( none        & 0xff);
1826             }
1827             else if (bits == 32) {
1828                 *s++ = (U8)((none >> 24) & 0xff);
1829                 *s++ = (U8)((none >> 16) & 0xff);
1830                 *s++ = (U8)((none >>  8) & 0xff);
1831                 *s++ = (U8)( none        & 0xff);
1832             }
1833         }
1834         *s = '\0';
1835     }
1836     else {
1837         (void)memzero((U8*)s, scur + 1);
1838     }
1839     SvCUR_set(swatch, scur);
1840     s = (U8*)SvPVX(swatch);
1841
1842     /* read $swash->{LIST} */
1843     l = (U8*)SvPV(*listsvp, lcur);
1844     lend = l + lcur;
1845     while (l < lend) {
1846         UV min, max, val, key;
1847         STRLEN numlen;
1848         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1849
1850         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1851
1852         numlen = lend - l;
1853         min = grok_hex((char *)l, &numlen, &flags, NULL);
1854         if (numlen)
1855             l += numlen;
1856         else if (nl) {
1857             l = nl + 1; /* 1 is length of "\n" */
1858             continue;
1859         }
1860         else {
1861             l = lend; /* to LIST's end at which \n is not found */
1862             break;
1863         }
1864
1865         if (isBLANK(*l)) {
1866             ++l;
1867             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1868             numlen = lend - l;
1869             max = grok_hex((char *)l, &numlen, &flags, NULL);
1870             if (numlen)
1871                 l += numlen;
1872             else
1873                 max = min;
1874
1875             if (octets) {
1876                 if (isBLANK(*l)) {
1877                     ++l;
1878                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1879                             PERL_SCAN_DISALLOW_PREFIX;
1880                     numlen = lend - l;
1881                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1882                     if (numlen)
1883                         l += numlen;
1884                     else
1885                         val = 0;
1886                 }
1887                 else {
1888                     val = 0;
1889                     if (typeto) {
1890                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1891                                          typestr, l);
1892                     }
1893                 }
1894             }
1895             else
1896                 val = 0; /* bits == 1, then val should be ignored */
1897         }
1898         else {
1899             max = min;
1900             if (octets) {
1901                 val = 0;
1902                 if (typeto) {
1903                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1904                 }
1905             }
1906             else
1907                 val = 0; /* bits == 1, then val should be ignored */
1908         }
1909
1910         if (nl)
1911             l = nl + 1;
1912         else
1913             l = lend;
1914
1915         if (max < start)
1916             continue;
1917
1918         if (octets) {
1919             if (min < start) {
1920                 if (!none || val < none) {
1921                     val += start - min;
1922                 }
1923                 min = start;
1924             }
1925             for (key = min; key <= max; key++) {
1926                 STRLEN offset;
1927                 if (key >= end)
1928                     goto go_out_list;
1929                 /* offset must be non-negative (start <= min <= key < end) */
1930                 offset = octets * (key - start);
1931                 if (bits == 8)
1932                     s[offset] = (U8)(val & 0xff);
1933                 else if (bits == 16) {
1934                     s[offset    ] = (U8)((val >>  8) & 0xff);
1935                     s[offset + 1] = (U8)( val        & 0xff);
1936                 }
1937                 else if (bits == 32) {
1938                     s[offset    ] = (U8)((val >> 24) & 0xff);
1939                     s[offset + 1] = (U8)((val >> 16) & 0xff);
1940                     s[offset + 2] = (U8)((val >>  8) & 0xff);
1941                     s[offset + 3] = (U8)( val        & 0xff);
1942                 }
1943
1944                 if (!none || val < none)
1945                     ++val;
1946             }
1947         }
1948         else { /* bits == 1, then val should be ignored */
1949             if (min < start)
1950                 min = start;
1951             for (key = min; key <= max; key++) {
1952                 const STRLEN offset = (STRLEN)(key - start);
1953                 if (key >= end)
1954                     goto go_out_list;
1955                 s[offset >> 3] |= 1 << (offset & 7);
1956             }
1957         }
1958     } /* while */
1959   go_out_list:
1960
1961     /* read $swash->{EXTRAS} */
1962     x = (U8*)SvPV(*extssvp, xcur);
1963     xend = x + xcur;
1964     while (x < xend) {
1965         STRLEN namelen;
1966         U8 *namestr;
1967         SV** othersvp;
1968         HV* otherhv;
1969         STRLEN otherbits;
1970         SV **otherbitssvp, *other;
1971         U8 *s, *o, *nl;
1972         STRLEN slen, olen;
1973
1974         U8 opc = *x++;
1975         if (opc == '\n')
1976             continue;
1977
1978         nl = (U8*)memchr(x, '\n', xend - x);
1979
1980         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1981             if (nl) {
1982                 x = nl + 1; /* 1 is length of "\n" */
1983                 continue;
1984             }
1985             else {
1986                 x = xend; /* to EXTRAS' end at which \n is not found */
1987                 break;
1988             }
1989         }
1990
1991         namestr = x;
1992         if (nl) {
1993             namelen = nl - namestr;
1994             x = nl + 1;
1995         }
1996         else {
1997             namelen = xend - namestr;
1998             x = xend;
1999         }
2000
2001         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2002         otherhv = (HV*)SvRV(*othersvp);
2003         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2004         otherbits = (STRLEN)SvUV(*otherbitssvp);
2005         if (bits < otherbits)
2006             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2007
2008         /* The "other" swatch must be destroyed after. */
2009         other = swash_get(*othersvp, start, span);
2010         o = (U8*)SvPV(other, olen);
2011
2012         if (!olen)
2013             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2014
2015         s = (U8*)SvPV(swatch, slen);
2016         if (bits == 1 && otherbits == 1) {
2017             if (slen != olen)
2018                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2019
2020             switch (opc) {
2021             case '+':
2022                 while (slen--)
2023                     *s++ |= *o++;
2024                 break;
2025             case '!':
2026                 while (slen--)
2027                     *s++ |= ~*o++;
2028                 break;
2029             case '-':
2030                 while (slen--)
2031                     *s++ &= ~*o++;
2032                 break;
2033             case '&':
2034                 while (slen--)
2035                     *s++ &= *o++;
2036                 break;
2037             default:
2038                 break;
2039             }
2040         }
2041         else {
2042             STRLEN otheroctets = otherbits >> 3;
2043             STRLEN offset = 0;
2044             U8* send = s + slen;
2045
2046             while (s < send) {
2047                 UV otherval = 0;
2048
2049                 if (otherbits == 1) {
2050                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2051                     ++offset;
2052                 }
2053                 else {
2054                     STRLEN vlen = otheroctets;
2055                     otherval = *o++;
2056                     while (--vlen) {
2057                         otherval <<= 8;
2058                         otherval |= *o++;
2059                     }
2060                 }
2061
2062                 if (opc == '+' && otherval)
2063                     /*EMPTY*/;   /* replace with otherval */
2064                 else if (opc == '!' && !otherval)
2065                     otherval = 1;
2066                 else if (opc == '-' && otherval)
2067                     otherval = 0;
2068                 else if (opc == '&' && !otherval)
2069                     otherval = 0;
2070                 else {
2071                     s += octets; /* no replacement */
2072                     continue;
2073                 }
2074
2075                 if (bits == 8)
2076                     *s++ = (U8)( otherval & 0xff);
2077                 else if (bits == 16) {
2078                     *s++ = (U8)((otherval >>  8) & 0xff);
2079                     *s++ = (U8)( otherval        & 0xff);
2080                 }
2081                 else if (bits == 32) {
2082                     *s++ = (U8)((otherval >> 24) & 0xff);
2083                     *s++ = (U8)((otherval >> 16) & 0xff);
2084                     *s++ = (U8)((otherval >>  8) & 0xff);
2085                     *s++ = (U8)( otherval        & 0xff);
2086                 }
2087             }
2088         }
2089         sv_free(other); /* through with it! */
2090     } /* while */
2091     return swatch;
2092 }
2093
2094 /*
2095 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2096
2097 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2098 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2099 bytes available. The return value is the pointer to the byte after the
2100 end of the new character. In other words,
2101
2102     d = uvchr_to_utf8(d, uv);
2103
2104 is the recommended wide native character-aware way of saying
2105
2106     *(d++) = uv;
2107
2108 =cut
2109 */
2110
2111 /* On ASCII machines this is normally a macro but we want a
2112    real function in case XS code wants it
2113 */
2114 U8 *
2115 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2116 {
2117     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2118 }
2119
2120 U8 *
2121 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2122 {
2123     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2124 }
2125
2126 /*
2127 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2128 flags
2129
2130 Returns the native character value of the first character in the string
2131 C<s>
2132 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2133 length, in bytes, of that character.
2134
2135 Allows length and flags to be passed to low level routine.
2136
2137 =cut
2138 */
2139 /* On ASCII machines this is normally a macro but we want
2140    a real function in case XS code wants it
2141 */
2142 UV
2143 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2144 U32 flags)
2145 {
2146     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2147     return UNI_TO_NATIVE(uv);
2148 }
2149
2150 /*
2151 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2152
2153 Build to the scalar dsv a displayable version of the string spv,
2154 length len, the displayable version being at most pvlim bytes long
2155 (if longer, the rest is truncated and "..." will be appended).
2156
2157 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2158 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2159 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2160 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2161 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2162 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2163
2164 The pointer to the PV of the dsv is returned.
2165
2166 =cut */
2167 char *
2168 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2169 {
2170     int truncated = 0;
2171     const char *s, *e;
2172
2173     sv_setpvn(dsv, "", 0);
2174     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2175          UV u;
2176           /* This serves double duty as a flag and a character to print after
2177              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2178           */
2179          char ok = 0;
2180
2181          if (pvlim && SvCUR(dsv) >= pvlim) {
2182               truncated++;
2183               break;
2184          }
2185          u = utf8_to_uvchr((U8*)s, 0);
2186          if (u < 256) {
2187              const unsigned char c = (unsigned char)u & 0xFF;
2188              if (flags & UNI_DISPLAY_BACKSLASH) {
2189                  switch (c) {
2190                  case '\n':
2191                      ok = 'n'; break;
2192                  case '\r':
2193                      ok = 'r'; break;
2194                  case '\t':
2195                      ok = 't'; break;
2196                  case '\f':
2197                      ok = 'f'; break;
2198                  case '\a':
2199                      ok = 'a'; break;
2200                  case '\\':
2201                      ok = '\\'; break;
2202                  default: break;
2203                  }
2204                  if (ok) {
2205                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2206                  }
2207              }
2208              /* isPRINT() is the locale-blind version. */
2209              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2210                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2211                  ok = 1;
2212              }
2213          }
2214          if (!ok)
2215              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2216     }
2217     if (truncated)
2218          sv_catpvs(dsv, "...");
2219
2220     return SvPVX(dsv);
2221 }
2222
2223 /*
2224 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2225
2226 Build to the scalar dsv a displayable version of the scalar sv,
2227 the displayable version being at most pvlim bytes long
2228 (if longer, the rest is truncated and "..." will be appended).
2229
2230 The flags argument is as in pv_uni_display().
2231
2232 The pointer to the PV of the dsv is returned.
2233
2234 =cut */
2235 char *
2236 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2237 {
2238      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2239                                 SvCUR(ssv), pvlim, flags);
2240 }
2241
2242 /*
2243 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2244
2245 Return true if the strings s1 and s2 differ case-insensitively, false
2246 if not (if they are equal case-insensitively).  If u1 is true, the
2247 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2248 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2249 are false, the respective string is assumed to be in native 8-bit
2250 encoding.
2251
2252 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2253 in there (they will point at the beginning of the I<next> character).
2254 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2255 pointers beyond which scanning will not continue under any
2256 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2257 s2+l2 will be used as goal end pointers that will also stop the scan,
2258 and which qualify towards defining a successful match: all the scans
2259 that define an explicit length must reach their goal pointers for
2260 a match to succeed).
2261
2262 For case-insensitiveness, the "casefolding" of Unicode is used
2263 instead of upper/lowercasing both the characters, see
2264 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2265
2266 =cut */
2267 I32
2268 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2269 {
2270      dVAR;
2271      register const U8 *p1  = (const U8*)s1;
2272      register const U8 *p2  = (const U8*)s2;
2273      register const U8 *f1 = NULL;
2274      register const U8 *f2 = NULL;
2275      register U8 *e1 = NULL;
2276      register U8 *q1 = NULL;
2277      register U8 *e2 = NULL;
2278      register U8 *q2 = NULL;
2279      STRLEN n1 = 0, n2 = 0;
2280      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2281      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2282      U8 natbuf[1+1];
2283      STRLEN foldlen1, foldlen2;
2284      bool match;
2285
2286      if (pe1)
2287           e1 = *(U8**)pe1;
2288      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2289           f1 = (const U8*)s1 + l1;
2290      if (pe2)
2291           e2 = *(U8**)pe2;
2292      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2293           f2 = (const U8*)s2 + l2;
2294
2295      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2296           return 1; /* mismatch; possible infinite loop or false positive */
2297
2298      if (!u1 || !u2)
2299           natbuf[1] = 0; /* Need to terminate the buffer. */
2300
2301      while ((e1 == 0 || p1 < e1) &&
2302             (f1 == 0 || p1 < f1) &&
2303             (e2 == 0 || p2 < e2) &&
2304             (f2 == 0 || p2 < f2)) {
2305           if (n1 == 0) {
2306                if (u1)
2307                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2308                else {
2309                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2310                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2311                }
2312                q1 = foldbuf1;
2313                n1 = foldlen1;
2314           }
2315           if (n2 == 0) {
2316                if (u2)
2317                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2318                else {
2319                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2320                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2321                }
2322                q2 = foldbuf2;
2323                n2 = foldlen2;
2324           }
2325           while (n1 && n2) {
2326                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2327                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2328                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2329                    return 1; /* mismatch */
2330                n1 -= UTF8SKIP(q1);
2331                q1 += UTF8SKIP(q1);
2332                n2 -= UTF8SKIP(q2);
2333                q2 += UTF8SKIP(q2);
2334           }
2335           if (n1 == 0)
2336                p1 += u1 ? UTF8SKIP(p1) : 1;
2337           if (n2 == 0)
2338                p2 += u2 ? UTF8SKIP(p2) : 1;
2339
2340      }
2341
2342      /* A match is defined by all the scans that specified
2343       * an explicit length reaching their final goals. */
2344      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2345
2346      if (match) {
2347           if (pe1)
2348                *pe1 = (char*)p1;
2349           if (pe2)
2350                *pe2 = (char*)p2;
2351      }
2352
2353      return match ? 0 : 1; /* 0 match, 1 mismatch */
2354 }
2355
2356 /*
2357  * Local variables:
2358  * c-indentation-style: bsd
2359  * c-basic-offset: 4
2360  * indent-tabs-mode: t
2361  * End:
2362  *
2363  * ex: set ts=8 sts=4 sw=4 noet:
2364  */