utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
   4  *    others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 U8 *
 171 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 172 {
 173     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 174 }
 175
 176 /*
 177
 178 Tests if some arbitrary number of bytes begins in a valid UTF-8
 179 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 180 UTF-8 character.  The actual number of bytes in the UTF-8 character
 181 will be returned if it is valid, otherwise 0.
 182
 183 This is the "slow" version as opposed to the "fast" version which is
 184 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 185 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 186 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 187 you should use the _slow().  In practice this means that the _slow()
 188 will be used very rarely, since the maximum Unicode code point (as of
 189 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 190 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 191 five bytes or more.
 192
 193 =cut */
 194 STATIC STRLEN
 195 S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
 196 {
 197     U8 u = *s;
 198     STRLEN slen;
 199     UV uv, ouv;
 200
 201     if (UTF8_IS_INVARIANT(u))
 202         return 1;
 203
 204     if (!UTF8_IS_START(u))
 205         return 0;
 206
 207     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 208         return 0;
 209
 210     slen = len - 1;
 211     s++;
 212 #ifdef EBCDIC
 213     u = NATIVE_TO_UTF(u);
 214 #endif
 215     u &= UTF_START_MASK(len);
 216     uv  = u;
 217     ouv = uv;
 218     while (slen--) {
 219         if (!UTF8_IS_CONTINUATION(*s))
 220             return 0;
 221         uv = UTF8_ACCUMULATE(uv, *s);
 222         if (uv < ouv)
 223             return 0;
 224         ouv = uv;
 225         s++;
 226     }
 227
 228     if ((STRLEN)UNISKIP(uv) < len)
 229         return 0;
 230
 231     return len;
 232 }
 233
 234 /*
 235 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 236
 237 Tests if some arbitrary number of bytes begins in a valid UTF-8
 238 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 239 UTF-8 character.  The actual number of bytes in the UTF-8 character
 240 will be returned if it is valid, otherwise 0.
 241
 242 =cut */
 243 STRLEN
 244 Perl_is_utf8_char(pTHX_ const U8 *s)
 245 {
 246     const STRLEN len = UTF8SKIP(s);
 247 #ifdef IS_UTF8_CHAR
 248     if (IS_UTF8_CHAR_FAST(len))
 249         return IS_UTF8_CHAR(s, len) ? len : 0;
 250 #endif /* #ifdef IS_UTF8_CHAR */
 251     return is_utf8_char_slow(s, len);
 252 }
 253
 254 /*
 255 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 256
 257 Returns true if first C<len> bytes of the given string form a valid
 258 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 259 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 260 because a valid ASCII string is a valid UTF-8 string.
 261
 262 See also is_utf8_string_loclen() and is_utf8_string_loc().
 263
 264 =cut
 265 */
 266
 267 bool
 268 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 269 {
 270     const U8* x = s;
 271     const U8* send;
 272
 273     if (!len)
 274         len = strlen((const char *)s);
 275     send = s + len;
 276
 277     while (x < send) {
 278         STRLEN c;
 279          /* Inline the easy bits of is_utf8_char() here for speed... */
 280          if (UTF8_IS_INVARIANT(*x))
 281               c = 1;
 282          else if (!UTF8_IS_START(*x))
 283              goto out;
 284          else {
 285               /* ... and call is_utf8_char() only if really needed. */
 286 #ifdef IS_UTF8_CHAR
 287              c = UTF8SKIP(x);
 288              if (IS_UTF8_CHAR_FAST(c)) {
 289                  if (!IS_UTF8_CHAR(x, c))
 290                      goto out;
 291              } else if (!is_utf8_char_slow(x, c))
 292                  goto out;
 293 #else
 294              c = is_utf8_char(x);
 295 #endif /* #ifdef IS_UTF8_CHAR */
 296               if (!c)
 297                   goto out;
 298          }
 299         x += c;
 300     }
 301
 302  out:
 303     if (x != send)
 304         return FALSE;
 305
 306     return TRUE;
 307 }
 308
 309 /*
 310 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 311
 312 Like is_ut8_string() but stores the location of the failure (in the
 313 case of "utf8ness failure") or the location s+len (in the case of
 314 "utf8ness success") in the C<ep>, and the number of UTF-8
 315 encoded characters in the C<el>.
 316
 317 See also is_utf8_string_loc() and is_utf8_string().
 318
 319 =cut
 320 */
 321
 322 bool
 323 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 324 {
 325     const U8* x = s;
 326     const U8* send;
 327     STRLEN c;
 328
 329     if (!len)
 330         len = strlen((const char *)s);
 331     send = s + len;
 332     if (el)
 333         *el = 0;
 334
 335     while (x < send) {
 336          /* Inline the easy bits of is_utf8_char() here for speed... */
 337          if (UTF8_IS_INVARIANT(*x))
 338              c = 1;
 339          else if (!UTF8_IS_START(*x))
 340              goto out;
 341          else {
 342              /* ... and call is_utf8_char() only if really needed. */
 343 #ifdef IS_UTF8_CHAR
 344              c = UTF8SKIP(x);
 345              if (IS_UTF8_CHAR_FAST(c)) {
 346                  if (!IS_UTF8_CHAR(x, c))
 347                      c = 0;
 348              } else
 349                  c = is_utf8_char_slow(x, c);
 350 #else
 351              c = is_utf8_char(x);
 352 #endif /* #ifdef IS_UTF8_CHAR */
 353              if (!c)
 354                  goto out;
 355          }
 356          x += c;
 357          if (el)
 358              (*el)++;
 359     }
 360
 361  out:
 362     if (ep)
 363         *ep = x;
 364     if (x != send)
 365         return FALSE;
 366
 367     return TRUE;
 368 }
 369
 370 /*
 371 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 372
 373 Like is_ut8_string() but stores the location of the failure (in the
 374 case of "utf8ness failure") or the location s+len (in the case of
 375 "utf8ness success") in the C<ep>.
 376
 377 See also is_utf8_string_loclen() and is_utf8_string().
 378
 379 =cut
 380 */
 381
 382 bool
 383 Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **ep)
 384 {
 385     return is_utf8_string_loclen(s, len, ep, 0);
 386 }
 387
 388 /*
 389 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 390
 391 Bottom level UTF-8 decode routine.
 392 Returns the unicode code point value of the first character in the string C<s>
 393 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 394 C<retlen> will be set to the length, in bytes, of that character.
 395
 396 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 397 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 398 it is assumed that the caller will raise a warning, and this function
 399 will silently just set C<retlen> to C<-1> and return zero.  If the
 400 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 401 malformations will be given, C<retlen> will be set to the expected
 402 length of the UTF-8 character in bytes, and zero will be returned.
 403
 404 The C<flags> can also contain various flags to allow deviations from
 405 the strict UTF-8 encoding (see F<utf8.h>).
 406
 407 Most code should use utf8_to_uvchr() rather than call this directly.
 408
 409 =cut
 410 */
 411
 412 UV
 413 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 414 {
 415     const U8 *s0 = s;
 416     UV uv = *s, ouv = 0;
 417     STRLEN len = 1;
 418     const bool dowarn = ckWARN_d(WARN_UTF8);
 419     const UV startbyte = *s;
 420     STRLEN expectlen = 0;
 421     U32 warning = 0;
 422
 423 /* This list is a superset of the UTF8_ALLOW_XXX. */
 424
 425 #define UTF8_WARN_EMPTY                          1
 426 #define UTF8_WARN_CONTINUATION                   2
 427 #define UTF8_WARN_NON_CONTINUATION               3
 428 #define UTF8_WARN_FE_FF                          4
 429 #define UTF8_WARN_SHORT                          5
 430 #define UTF8_WARN_OVERFLOW                       6
 431 #define UTF8_WARN_SURROGATE                      7
 432 #define UTF8_WARN_LONG                           8
 433 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 434
 435     if (curlen == 0 &&
 436         !(flags & UTF8_ALLOW_EMPTY)) {
 437         warning = UTF8_WARN_EMPTY;
 438         goto malformed;
 439     }
 440
 441     if (UTF8_IS_INVARIANT(uv)) {
 442         if (retlen)
 443             *retlen = 1;
 444         return (UV) (NATIVE_TO_UTF(*s));
 445     }
 446
 447     if (UTF8_IS_CONTINUATION(uv) &&
 448         !(flags & UTF8_ALLOW_CONTINUATION)) {
 449         warning = UTF8_WARN_CONTINUATION;
 450         goto malformed;
 451     }
 452
 453     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 454         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 455         warning = UTF8_WARN_NON_CONTINUATION;
 456         goto malformed;
 457     }
 458
 459 #ifdef EBCDIC
 460     uv = NATIVE_TO_UTF(uv);
 461 #else
 462     if ((uv == 0xfe || uv == 0xff) &&
 463         !(flags & UTF8_ALLOW_FE_FF)) {
 464         warning = UTF8_WARN_FE_FF;
 465         goto malformed;
 466     }
 467 #endif
 468
 469     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 470     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 471     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 472     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 473 #ifdef EBCDIC
 474     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 475     else                        { len =  7; uv &= 0x01; }
 476 #else
 477     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 478     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 479     else                        { len = 13; uv = 0; } /* whoa! */
 480 #endif
 481
 482     if (retlen)
 483         *retlen = len;
 484
 485     expectlen = len;
 486
 487     if ((curlen < expectlen) &&
 488         !(flags & UTF8_ALLOW_SHORT)) {
 489         warning = UTF8_WARN_SHORT;
 490         goto malformed;
 491     }
 492
 493     len--;
 494     s++;
 495     ouv = uv;
 496
 497     while (len--) {
 498         if (!UTF8_IS_CONTINUATION(*s) &&
 499             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 500             s--;
 501             warning = UTF8_WARN_NON_CONTINUATION;
 502             goto malformed;
 503         }
 504         else
 505             uv = UTF8_ACCUMULATE(uv, *s);
 506         if (!(uv > ouv)) {
 507             /* These cannot be allowed. */
 508             if (uv == ouv) {
 509                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 510                     warning = UTF8_WARN_LONG;
 511                     goto malformed;
 512                 }
 513             }
 514             else { /* uv < ouv */
 515                 /* This cannot be allowed. */
 516                 warning = UTF8_WARN_OVERFLOW;
 517                 goto malformed;
 518             }
 519         }
 520         s++;
 521         ouv = uv;
 522     }
 523
 524     if (UNICODE_IS_SURROGATE(uv) &&
 525         !(flags & UTF8_ALLOW_SURROGATE)) {
 526         warning = UTF8_WARN_SURROGATE;
 527         goto malformed;
 528     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 529                !(flags & UTF8_ALLOW_LONG)) {
 530         warning = UTF8_WARN_LONG;
 531         goto malformed;
 532     } else if (UNICODE_IS_ILLEGAL(uv) &&
 533                !(flags & UTF8_ALLOW_FFFF)) {
 534         warning = UTF8_WARN_FFFF;
 535         goto malformed;
 536     }
 537
 538     return uv;
 539
 540 malformed:
 541
 542     if (flags & UTF8_CHECK_ONLY) {
 543         if (retlen)
 544             *retlen = -1;
 545         return 0;
 546     }
 547
 548     if (dowarn) {
 549         SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 550
 551         switch (warning) {
 552         case 0: /* Intentionally empty. */ break;
 553         case UTF8_WARN_EMPTY:
 554             Perl_sv_catpv(aTHX_ sv, "(empty string)");
 555             break;
 556         case UTF8_WARN_CONTINUATION:
 557             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 558             break;
 559         case UTF8_WARN_NON_CONTINUATION:
 560             if (s == s0)
 561                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 562                            (UV)s[1], startbyte);
 563             else {
 564                 const int len = (int)(s-s0);
 565                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 566                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 567             }
 568
 569             break;
 570         case UTF8_WARN_FE_FF:
 571             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 572             break;
 573         case UTF8_WARN_SHORT:
 574             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 575                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 576             expectlen = curlen;         /* distance for caller to skip */
 577             break;
 578         case UTF8_WARN_OVERFLOW:
 579             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 580                            ouv, *s, startbyte);
 581             break;
 582         case UTF8_WARN_SURROGATE:
 583             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 584             break;
 585         case UTF8_WARN_LONG:
 586             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 587                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 588             break;
 589         case UTF8_WARN_FFFF:
 590             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 591             break;
 592         default:
 593             Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
 594             break;
 595         }
 596
 597         if (warning) {
 598             const char * const s = SvPVX_const(sv);
 599
 600             if (PL_op)
 601                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 602                             "%s in %s", s,  OP_DESC(PL_op));
 603             else
 604                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 605         }
 606     }
 607
 608     if (retlen)
 609         *retlen = expectlen ? expectlen : len;
 610
 611     return 0;
 612 }
 613
 614 /*
 615 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 616
 617 Returns the native character value of the first character in the string C<s>
 618 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 619 length, in bytes, of that character.
 620
 621 If C<s> does not point to a well-formed UTF-8 character, zero is
 622 returned and retlen is set, if possible, to -1.
 623
 624 =cut
 625 */
 626
 627 UV
 628 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 629 {
 630     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
 631                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 632 }
 633
 634 /*
 635 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 636
 637 Returns the Unicode code point of the first character in the string C<s>
 638 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 639 length, in bytes, of that character.
 640
 641 This function should only be used when returned UV is considered
 642 an index into the Unicode semantic tables (e.g. swashes).
 643
 644 If C<s> does not point to a well-formed UTF-8 character, zero is
 645 returned and retlen is set, if possible, to -1.
 646
 647 =cut
 648 */
 649
 650 UV
 651 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 652 {
 653     /* Call the low level routine asking for checks */
 654     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 655                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 656 }
 657
 658 /*
 659 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 660
 661 Return the length of the UTF-8 char encoded string C<s> in characters.
 662 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 663 up past C<e>, croaks.
 664
 665 =cut
 666 */
 667
 668 STRLEN
 669 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 670 {
 671     STRLEN len = 0;
 672
 673     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 674      * the bitops (especially ~) can create illegal UTF-8.
 675      * In other words: in Perl UTF-8 is not just for Unicode. */
 676
 677     if (e < s)
 678         goto warn_and_return;
 679     while (s < e) {
 680         const U8 t = UTF8SKIP(s);
 681         if (e - s < t) {
 682             warn_and_return:
 683             if (ckWARN_d(WARN_UTF8)) {
 684                 if (PL_op)
 685                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 686                             "%s in %s", unees, OP_DESC(PL_op));
 687                 else
 688                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 689             }
 690             return len;
 691         }
 692         s += t;
 693         len++;
 694     }
 695
 696     return len;
 697 }
 698
 699 /*
 700 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 701
 702 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 703 and C<b>.
 704
 705 WARNING: use only if you *know* that the pointers point inside the
 706 same UTF-8 buffer.
 707
 708 =cut
 709 */
 710
 711 IV
 712 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 713 {
 714     IV off = 0;
 715
 716     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 717      * the bitops (especially ~) can create illegal UTF-8.
 718      * In other words: in Perl UTF-8 is not just for Unicode. */
 719
 720     if (a < b) {
 721         while (a < b) {
 722             const U8 c = UTF8SKIP(a);
 723             if (b - a < c)
 724                 goto warn_and_return;
 725             a += c;
 726             off--;
 727         }
 728     }
 729     else {
 730         while (b < a) {
 731             const U8 c = UTF8SKIP(b);
 732
 733             if (a - b < c) {
 734                 warn_and_return:
 735                 if (ckWARN_d(WARN_UTF8)) {
 736                     if (PL_op)
 737                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 738                                     "%s in %s", unees, OP_DESC(PL_op));
 739                     else
 740                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 741                 }
 742                 return off;
 743             }
 744             b += c;
 745             off++;
 746         }
 747     }
 748
 749     return off;
 750 }
 751
 752 /*
 753 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 754
 755 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 756 forward or backward.
 757
 758 WARNING: do not use the following unless you *know* C<off> is within
 759 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 760 on the first byte of character or just after the last byte of a character.
 761
 762 =cut
 763 */
 764
 765 U8 *
 766 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 767 {
 768     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 769      * the bitops (especially ~) can create illegal UTF-8.
 770      * In other words: in Perl UTF-8 is not just for Unicode. */
 771
 772     if (off >= 0) {
 773         while (off--)
 774             s += UTF8SKIP(s);
 775     }
 776     else {
 777         while (off++) {
 778             s--;
 779             while (UTF8_IS_CONTINUATION(*s))
 780                 s--;
 781         }
 782     }
 783     return (U8 *)s;
 784 }
 785
 786 /*
 787 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 788
 789 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 790 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 791 updates len to contain the new length.
 792 Returns zero on failure, setting C<len> to -1.
 793
 794 =cut
 795 */
 796
 797 U8 *
 798 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 799 {
 800     U8 *send;
 801     U8 *d;
 802     U8 *save = s;
 803
 804     /* ensure valid UTF-8 and chars < 256 before updating string */
 805     for (send = s + *len; s < send; ) {
 806         U8 c = *s++;
 807
 808         if (!UTF8_IS_INVARIANT(c) &&
 809             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 810              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 811             *len = -1;
 812             return 0;
 813         }
 814     }
 815
 816     d = s = save;
 817     while (s < send) {
 818         STRLEN ulen;
 819         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 820         s += ulen;
 821     }
 822     *d = '\0';
 823     *len = d - save;
 824     return save;
 825 }
 826
 827 /*
 828 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 829
 830 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 831 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 832 the newly-created string, and updates C<len> to contain the new
 833 length.  Returns the original string if no conversion occurs, C<len>
 834 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 835 0 if C<s> is converted or contains all 7bit characters.
 836
 837 =cut
 838 */
 839
 840 U8 *
 841 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 842 {
 843     U8 *d;
 844     const U8 *start = s;
 845     const U8 *send;
 846     I32 count = 0;
 847
 848     if (!*is_utf8)
 849         return (U8 *)start;
 850
 851     /* ensure valid UTF-8 and chars < 256 before converting string */
 852     for (send = s + *len; s < send;) {
 853         U8 c = *s++;
 854         if (!UTF8_IS_INVARIANT(c)) {
 855             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 856                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 857                 count++;
 858             else
 859                 return (U8 *)start;
 860         }
 861     }
 862
 863     *is_utf8 = 0;
 864
 865     Newxz(d, (*len) - count + 1, U8);
 866     s = start; start = d;
 867     while (s < send) {
 868         U8 c = *s++;
 869         if (!UTF8_IS_INVARIANT(c)) {
 870             /* Then it is two-byte encoded */
 871             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 872             c = ASCII_TO_NATIVE(c);
 873         }
 874         *d++ = c;
 875     }
 876     *d = '\0';
 877     *len = d - start;
 878     return (U8 *)start;
 879 }
 880
 881 /*
 882 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 883
 884 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 885 Returns a pointer to the newly-created string, and sets C<len> to
 886 reflect the new length.
 887
 888 If you want to convert to UTF-8 from other encodings than ASCII,
 889 see sv_recode_to_utf8().
 890
 891 =cut
 892 */
 893
 894 U8*
 895 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 896 {
 897     const U8 * const send = s + (*len);
 898     U8 *d;
 899     U8 *dst;
 900
 901     Newxz(d, (*len) * 2 + 1, U8);
 902     dst = d;
 903
 904     while (s < send) {
 905         const UV uv = NATIVE_TO_ASCII(*s++);
 906         if (UNI_IS_INVARIANT(uv))
 907             *d++ = (U8)UTF_TO_NATIVE(uv);
 908         else {
 909             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 910             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 911         }
 912     }
 913     *d = '\0';
 914     *len = d-dst;
 915     return dst;
 916 }
 917
 918 /*
 919  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 920  *
 921  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 922  * We optimize for native, for obvious reasons. */
 923
 924 U8*
 925 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 926 {
 927     U8* pend;
 928     U8* dstart = d;
 929
 930     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 931          d[0] = 0;
 932          *newlen = 1;
 933          return d;
 934     }
 935
 936     if (bytelen & 1)
 937         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 938
 939     pend = p + bytelen;
 940
 941     while (p < pend) {
 942         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 943         p += 2;
 944         if (uv < 0x80) {
 945             *d++ = (U8)uv;
 946             continue;
 947         }
 948         if (uv < 0x800) {
 949             *d++ = (U8)(( uv >>  6)         | 0xc0);
 950             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 951             continue;
 952         }
 953         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 954             UV low = (p[0] << 8) + p[1];
 955             p += 2;
 956             if (low < 0xdc00 || low >= 0xdfff)
 957                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 958             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 959         }
 960         if (uv < 0x10000) {
 961             *d++ = (U8)(( uv >> 12)         | 0xe0);
 962             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 963             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 964             continue;
 965         }
 966         else {
 967             *d++ = (U8)(( uv >> 18)         | 0xf0);
 968             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 969             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 970             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 971             continue;
 972         }
 973     }
 974     *newlen = d - dstart;
 975     return d;
 976 }
 977
 978 /* Note: this one is slightly destructive of the source. */
 979
 980 U8*
 981 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 982 {
 983     U8* s = (U8*)p;
 984     U8* send = s + bytelen;
 985     while (s < send) {
 986         U8 tmp = s[0];
 987         s[0] = s[1];
 988         s[1] = tmp;
 989         s += 2;
 990     }
 991     return utf16_to_utf8(p, d, bytelen, newlen);
 992 }
 993
 994 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 995
 996 bool
 997 Perl_is_uni_alnum(pTHX_ UV c)
 998 {
 999     U8 tmpbuf[UTF8_MAXBYTES+1];
1000     uvchr_to_utf8(tmpbuf, c);
1001     return is_utf8_alnum(tmpbuf);
1002 }
1003
1004 bool
1005 Perl_is_uni_alnumc(pTHX_ UV c)
1006 {
1007     U8 tmpbuf[UTF8_MAXBYTES+1];
1008     uvchr_to_utf8(tmpbuf, c);
1009     return is_utf8_alnumc(tmpbuf);
1010 }
1011
1012 bool
1013 Perl_is_uni_idfirst(pTHX_ UV c)
1014 {
1015     U8 tmpbuf[UTF8_MAXBYTES+1];
1016     uvchr_to_utf8(tmpbuf, c);
1017     return is_utf8_idfirst(tmpbuf);
1018 }
1019
1020 bool
1021 Perl_is_uni_alpha(pTHX_ UV c)
1022 {
1023     U8 tmpbuf[UTF8_MAXBYTES+1];
1024     uvchr_to_utf8(tmpbuf, c);
1025     return is_utf8_alpha(tmpbuf);
1026 }
1027
1028 bool
1029 Perl_is_uni_ascii(pTHX_ UV c)
1030 {
1031     U8 tmpbuf[UTF8_MAXBYTES+1];
1032     uvchr_to_utf8(tmpbuf, c);
1033     return is_utf8_ascii(tmpbuf);
1034 }
1035
1036 bool
1037 Perl_is_uni_space(pTHX_ UV c)
1038 {
1039     U8 tmpbuf[UTF8_MAXBYTES+1];
1040     uvchr_to_utf8(tmpbuf, c);
1041     return is_utf8_space(tmpbuf);
1042 }
1043
1044 bool
1045 Perl_is_uni_digit(pTHX_ UV c)
1046 {
1047     U8 tmpbuf[UTF8_MAXBYTES+1];
1048     uvchr_to_utf8(tmpbuf, c);
1049     return is_utf8_digit(tmpbuf);
1050 }
1051
1052 bool
1053 Perl_is_uni_upper(pTHX_ UV c)
1054 {
1055     U8 tmpbuf[UTF8_MAXBYTES+1];
1056     uvchr_to_utf8(tmpbuf, c);
1057     return is_utf8_upper(tmpbuf);
1058 }
1059
1060 bool
1061 Perl_is_uni_lower(pTHX_ UV c)
1062 {
1063     U8 tmpbuf[UTF8_MAXBYTES+1];
1064     uvchr_to_utf8(tmpbuf, c);
1065     return is_utf8_lower(tmpbuf);
1066 }
1067
1068 bool
1069 Perl_is_uni_cntrl(pTHX_ UV c)
1070 {
1071     U8 tmpbuf[UTF8_MAXBYTES+1];
1072     uvchr_to_utf8(tmpbuf, c);
1073     return is_utf8_cntrl(tmpbuf);
1074 }
1075
1076 bool
1077 Perl_is_uni_graph(pTHX_ UV c)
1078 {
1079     U8 tmpbuf[UTF8_MAXBYTES+1];
1080     uvchr_to_utf8(tmpbuf, c);
1081     return is_utf8_graph(tmpbuf);
1082 }
1083
1084 bool
1085 Perl_is_uni_print(pTHX_ UV c)
1086 {
1087     U8 tmpbuf[UTF8_MAXBYTES+1];
1088     uvchr_to_utf8(tmpbuf, c);
1089     return is_utf8_print(tmpbuf);
1090 }
1091
1092 bool
1093 Perl_is_uni_punct(pTHX_ UV c)
1094 {
1095     U8 tmpbuf[UTF8_MAXBYTES+1];
1096     uvchr_to_utf8(tmpbuf, c);
1097     return is_utf8_punct(tmpbuf);
1098 }
1099
1100 bool
1101 Perl_is_uni_xdigit(pTHX_ UV c)
1102 {
1103     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1104     uvchr_to_utf8(tmpbuf, c);
1105     return is_utf8_xdigit(tmpbuf);
1106 }
1107
1108 UV
1109 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1110 {
1111     uvchr_to_utf8(p, c);
1112     return to_utf8_upper(p, p, lenp);
1113 }
1114
1115 UV
1116 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1117 {
1118     uvchr_to_utf8(p, c);
1119     return to_utf8_title(p, p, lenp);
1120 }
1121
1122 UV
1123 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1124 {
1125     uvchr_to_utf8(p, c);
1126     return to_utf8_lower(p, p, lenp);
1127 }
1128
1129 UV
1130 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1131 {
1132     uvchr_to_utf8(p, c);
1133     return to_utf8_fold(p, p, lenp);
1134 }
1135
1136 /* for now these all assume no locale info available for Unicode > 255 */
1137
1138 bool
1139 Perl_is_uni_alnum_lc(pTHX_ UV c)
1140 {
1141     return is_uni_alnum(c);     /* XXX no locale support yet */
1142 }
1143
1144 bool
1145 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1146 {
1147     return is_uni_alnumc(c);    /* XXX no locale support yet */
1148 }
1149
1150 bool
1151 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1152 {
1153     return is_uni_idfirst(c);   /* XXX no locale support yet */
1154 }
1155
1156 bool
1157 Perl_is_uni_alpha_lc(pTHX_ UV c)
1158 {
1159     return is_uni_alpha(c);     /* XXX no locale support yet */
1160 }
1161
1162 bool
1163 Perl_is_uni_ascii_lc(pTHX_ UV c)
1164 {
1165     return is_uni_ascii(c);     /* XXX no locale support yet */
1166 }
1167
1168 bool
1169 Perl_is_uni_space_lc(pTHX_ UV c)
1170 {
1171     return is_uni_space(c);     /* XXX no locale support yet */
1172 }
1173
1174 bool
1175 Perl_is_uni_digit_lc(pTHX_ UV c)
1176 {
1177     return is_uni_digit(c);     /* XXX no locale support yet */
1178 }
1179
1180 bool
1181 Perl_is_uni_upper_lc(pTHX_ UV c)
1182 {
1183     return is_uni_upper(c);     /* XXX no locale support yet */
1184 }
1185
1186 bool
1187 Perl_is_uni_lower_lc(pTHX_ UV c)
1188 {
1189     return is_uni_lower(c);     /* XXX no locale support yet */
1190 }
1191
1192 bool
1193 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1194 {
1195     return is_uni_cntrl(c);     /* XXX no locale support yet */
1196 }
1197
1198 bool
1199 Perl_is_uni_graph_lc(pTHX_ UV c)
1200 {
1201     return is_uni_graph(c);     /* XXX no locale support yet */
1202 }
1203
1204 bool
1205 Perl_is_uni_print_lc(pTHX_ UV c)
1206 {
1207     return is_uni_print(c);     /* XXX no locale support yet */
1208 }
1209
1210 bool
1211 Perl_is_uni_punct_lc(pTHX_ UV c)
1212 {
1213     return is_uni_punct(c);     /* XXX no locale support yet */
1214 }
1215
1216 bool
1217 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1218 {
1219     return is_uni_xdigit(c);    /* XXX no locale support yet */
1220 }
1221
1222 U32
1223 Perl_to_uni_upper_lc(pTHX_ U32 c)
1224 {
1225     /* XXX returns only the first character -- do not use XXX */
1226     /* XXX no locale support yet */
1227     STRLEN len;
1228     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1229     return (U32)to_uni_upper(c, tmpbuf, &len);
1230 }
1231
1232 U32
1233 Perl_to_uni_title_lc(pTHX_ U32 c)
1234 {
1235     /* XXX returns only the first character XXX -- do not use XXX */
1236     /* XXX no locale support yet */
1237     STRLEN len;
1238     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1239     return (U32)to_uni_title(c, tmpbuf, &len);
1240 }
1241
1242 U32
1243 Perl_to_uni_lower_lc(pTHX_ U32 c)
1244 {
1245     /* XXX returns only the first character -- do not use XXX */
1246     /* XXX no locale support yet */
1247     STRLEN len;
1248     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1249     return (U32)to_uni_lower(c, tmpbuf, &len);
1250 }
1251
1252 bool
1253 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1254 {
1255     if (!is_utf8_char(p))
1256         return FALSE;
1257     if (!PL_utf8_alnum)
1258         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1259          * descendant of isalnum(3), in other words, it doesn't
1260          * contain the '_'. --jhi */
1261         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1262     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1263 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1264 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1265     if (!PL_utf8_alnum)
1266         PL_utf8_alnum = swash_init("utf8", "",
1267             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1268     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1269 #endif
1270 }
1271
1272 bool
1273 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1274 {
1275     if (!is_utf8_char(p))
1276         return FALSE;
1277     if (!PL_utf8_alnum)
1278         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1279     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1280 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1281 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1282     if (!PL_utf8_alnum)
1283         PL_utf8_alnum = swash_init("utf8", "",
1284             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1285     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1286 #endif
1287 }
1288
1289 bool
1290 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1291 {
1292     if (*p == '_')
1293         return TRUE;
1294     if (!is_utf8_char(p))
1295         return FALSE;
1296     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1297         PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1298     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1299 }
1300
1301 bool
1302 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1303 {
1304     if (*p == '_')
1305         return TRUE;
1306     if (!is_utf8_char(p))
1307         return FALSE;
1308     if (!PL_utf8_idcont)
1309         PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1310     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1311 }
1312
1313 bool
1314 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1315 {
1316     if (!is_utf8_char(p))
1317         return FALSE;
1318     if (!PL_utf8_alpha)
1319         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1320     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1321 }
1322
1323 bool
1324 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1325 {
1326     if (!is_utf8_char(p))
1327         return FALSE;
1328     if (!PL_utf8_ascii)
1329         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1330     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1331 }
1332
1333 bool
1334 Perl_is_utf8_space(pTHX_ const U8 *p)
1335 {
1336     if (!is_utf8_char(p))
1337         return FALSE;
1338     if (!PL_utf8_space)
1339         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1340     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1341 }
1342
1343 bool
1344 Perl_is_utf8_digit(pTHX_ const U8 *p)
1345 {
1346     if (!is_utf8_char(p))
1347         return FALSE;
1348     if (!PL_utf8_digit)
1349         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1350     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1351 }
1352
1353 bool
1354 Perl_is_utf8_upper(pTHX_ const U8 *p)
1355 {
1356     if (!is_utf8_char(p))
1357         return FALSE;
1358     if (!PL_utf8_upper)
1359         PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1360     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1361 }
1362
1363 bool
1364 Perl_is_utf8_lower(pTHX_ const U8 *p)
1365 {
1366     if (!is_utf8_char(p))
1367         return FALSE;
1368     if (!PL_utf8_lower)
1369         PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1370     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1371 }
1372
1373 bool
1374 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1375 {
1376     if (!is_utf8_char(p))
1377         return FALSE;
1378     if (!PL_utf8_cntrl)
1379         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1380     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1381 }
1382
1383 bool
1384 Perl_is_utf8_graph(pTHX_ const U8 *p)
1385 {
1386     if (!is_utf8_char(p))
1387         return FALSE;
1388     if (!PL_utf8_graph)
1389         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1390     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1391 }
1392
1393 bool
1394 Perl_is_utf8_print(pTHX_ const U8 *p)
1395 {
1396     if (!is_utf8_char(p))
1397         return FALSE;
1398     if (!PL_utf8_print)
1399         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1400     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1401 }
1402
1403 bool
1404 Perl_is_utf8_punct(pTHX_ const U8 *p)
1405 {
1406     if (!is_utf8_char(p))
1407         return FALSE;
1408     if (!PL_utf8_punct)
1409         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1410     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1411 }
1412
1413 bool
1414 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1415 {
1416     if (!is_utf8_char(p))
1417         return FALSE;
1418     if (!PL_utf8_xdigit)
1419         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1420     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1421 }
1422
1423 bool
1424 Perl_is_utf8_mark(pTHX_ const U8 *p)
1425 {
1426     if (!is_utf8_char(p))
1427         return FALSE;
1428     if (!PL_utf8_mark)
1429         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1430     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1431 }
1432
1433 /*
1434 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1435
1436 The "p" contains the pointer to the UTF-8 string encoding
1437 the character that is being converted.
1438
1439 The "ustrp" is a pointer to the character buffer to put the
1440 conversion result to.  The "lenp" is a pointer to the length
1441 of the result.
1442
1443 The "swashp" is a pointer to the swash to use.
1444
1445 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1446 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1447 but not always, a multicharacter mapping), is tried first.
1448
1449 The "special" is a string like "utf8::ToSpecLower", which means the
1450 hash %utf8::ToSpecLower.  The access to the hash is through
1451 Perl_to_utf8_case().
1452
1453 The "normal" is a string like "ToLower" which means the swash
1454 %utf8::ToLower.
1455
1456 =cut */
1457
1458 UV
1459 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1460                         SV **swashp, const char *normal, const char *special)
1461 {
1462     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1463     STRLEN len = 0;
1464
1465     const UV uv0 = utf8_to_uvchr(p, NULL);
1466     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1467      * are necessary in EBCDIC, they are redundant no-ops
1468      * in ASCII-ish platforms, and hopefully optimized away. */
1469     const UV uv1 = NATIVE_TO_UNI(uv0);
1470     uvuni_to_utf8(tmpbuf, uv1);
1471
1472     if (!*swashp) /* load on-demand */
1473          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1474
1475     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1476     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1477          /* It might be "special" (sometimes, but not always,
1478           * a multicharacter mapping) */
1479          HV *hv;
1480          SV **svp;
1481
1482          if ((hv  = get_hv(special, FALSE)) &&
1483              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1484              (*svp)) {
1485              const char *s;
1486
1487               s = SvPV_const(*svp, len);
1488               if (len == 1)
1489                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1490               else {
1491 #ifdef EBCDIC
1492                    /* If we have EBCDIC we need to remap the characters
1493                     * since any characters in the low 256 are Unicode
1494                     * code points, not EBCDIC. */
1495                    U8 *t = (U8*)s, *tend = t + len, *d;
1496
1497                    d = tmpbuf;
1498                    if (SvUTF8(*svp)) {
1499                         STRLEN tlen = 0;
1500
1501                         while (t < tend) {
1502                              UV c = utf8_to_uvchr(t, &tlen);
1503                              if (tlen > 0) {
1504                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1505                                   t += tlen;
1506                              }
1507                              else
1508                                   break;
1509                         }
1510                    }
1511                    else {
1512                         while (t < tend) {
1513                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1514                              t++;
1515                         }
1516                    }
1517                    len = d - tmpbuf;
1518                    Copy(tmpbuf, ustrp, len, U8);
1519 #else
1520                    Copy(s, ustrp, len, U8);
1521 #endif
1522               }
1523          }
1524     }
1525
1526     if (!len && *swashp) {
1527          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1528
1529          if (uv2) {
1530               /* It was "normal" (a single character mapping). */
1531               UV uv3 = UNI_TO_NATIVE(uv2);
1532
1533               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1534          }
1535     }
1536
1537     if (!len) /* Neither: just copy. */
1538          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1539
1540     if (lenp)
1541          *lenp = len;
1542
1543     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1544 }
1545
1546 /*
1547 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1548
1549 Convert the UTF-8 encoded character at p to its uppercase version and
1550 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1551 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1552 the uppercase version may be longer than the original character.
1553
1554 The first character of the uppercased version is returned
1555 (but note, as explained above, that there may be more.)
1556
1557 =cut */
1558
1559 UV
1560 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1561 {
1562     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1563                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1564 }
1565
1566 /*
1567 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1568
1569 Convert the UTF-8 encoded character at p to its titlecase version and
1570 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1571 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1572 titlecase version may be longer than the original character.
1573
1574 The first character of the titlecased version is returned
1575 (but note, as explained above, that there may be more.)
1576
1577 =cut */
1578
1579 UV
1580 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1581 {
1582     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1583                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1584 }
1585
1586 /*
1587 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1588
1589 Convert the UTF-8 encoded character at p to its lowercase version and
1590 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1591 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1592 lowercase version may be longer than the original character.
1593
1594 The first character of the lowercased version is returned
1595 (but note, as explained above, that there may be more.)
1596
1597 =cut */
1598
1599 UV
1600 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1601 {
1602     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1603                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1604 }
1605
1606 /*
1607 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1608
1609 Convert the UTF-8 encoded character at p to its foldcase version and
1610 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1611 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1612 foldcase version may be longer than the original character (up to
1613 three characters).
1614
1615 The first character of the foldcased version is returned
1616 (but note, as explained above, that there may be more.)
1617
1618 =cut */
1619
1620 UV
1621 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1622 {
1623     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1624                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1625 }
1626
1627 /* a "swash" is a swatch hash */
1628
1629 SV*
1630 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1631 {
1632     dVAR;
1633     SV* retval;
1634     SV* const tokenbufsv = sv_newmortal();
1635     dSP;
1636     const size_t pkg_len = strlen(pkg);
1637     const size_t name_len = strlen(name);
1638     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1639     SV* errsv_save;
1640
1641     PUSHSTACKi(PERLSI_MAGIC);
1642     ENTER;
1643     SAVEI32(PL_hints);
1644     PL_hints = 0;
1645     save_re_context();
1646     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1647         ENTER;
1648         errsv_save = newSVsv(ERRSV);
1649         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1650                          Nullsv);
1651         if (!SvTRUE(ERRSV))
1652             sv_setsv(ERRSV, errsv_save);
1653         SvREFCNT_dec(errsv_save);
1654         LEAVE;
1655     }
1656     SPAGAIN;
1657     PUSHMARK(SP);
1658     EXTEND(SP,5);
1659     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1660     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1661     PUSHs(listsv);
1662     PUSHs(sv_2mortal(newSViv(minbits)));
1663     PUSHs(sv_2mortal(newSViv(none)));
1664     PUTBACK;
1665     if (IN_PERL_COMPILETIME) {
1666         /* XXX ought to be handled by lex_start */
1667         SAVEI32(PL_in_my);
1668         PL_in_my = 0;
1669         sv_setpv(tokenbufsv, PL_tokenbuf);
1670     }
1671     errsv_save = newSVsv(ERRSV);
1672     if (call_method("SWASHNEW", G_SCALAR))
1673         retval = newSVsv(*PL_stack_sp--);
1674     else
1675         retval = &PL_sv_undef;
1676     if (!SvTRUE(ERRSV))
1677         sv_setsv(ERRSV, errsv_save);
1678     SvREFCNT_dec(errsv_save);
1679     LEAVE;
1680     POPSTACK;
1681     if (IN_PERL_COMPILETIME) {
1682         STRLEN len;
1683         const char* const pv = SvPV_const(tokenbufsv, len);
1684
1685         Copy(pv, PL_tokenbuf, len+1, char);
1686         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1687     }
1688     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1689         if (SvPOK(retval))
1690             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1691                        retval);
1692         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1693     }
1694     return retval;
1695 }
1696
1697
1698 /* This API is wrong for special case conversions since we may need to
1699  * return several Unicode characters for a single Unicode character
1700  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1701  * the lower-level routine, and it is similarly broken for returning
1702  * multiple values.  --jhi */
1703 UV
1704 Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
1705 {
1706     dVAR;
1707     HV* const hv = (HV*)SvRV(sv);
1708     U32 klen;
1709     U32 off;
1710     STRLEN slen;
1711     STRLEN needents;
1712     const U8 *tmps = NULL;
1713     U32 bit;
1714     SV *retval;
1715     U8 tmputf8[2];
1716     UV c = NATIVE_TO_ASCII(*ptr);
1717
1718     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1719         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1720         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1721         ptr = tmputf8;
1722     }
1723     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1724      * then the "swatch" is a vec() for al the chars which start
1725      * with 0xAA..0xYY
1726      * So the key in the hash (klen) is length of encoded char -1
1727      */
1728     klen = UTF8SKIP(ptr) - 1;
1729     off  = ptr[klen];
1730
1731     if (klen == 0)
1732      {
1733       /* If char in invariant then swatch is for all the invariant chars
1734        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1735        */
1736       needents = UTF_CONTINUATION_MARK;
1737       off      = NATIVE_TO_UTF(ptr[klen]);
1738      }
1739     else
1740      {
1741       /* If char is encoded then swatch is for the prefix */
1742       needents = (1 << UTF_ACCUMULATION_SHIFT);
1743       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1744      }
1745
1746     /*
1747      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1748      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1749      * it's nothing to sniff at.)  Pity we usually come through at least
1750      * two function calls to get here...
1751      *
1752      * NB: this code assumes that swatches are never modified, once generated!
1753      */
1754
1755     if (hv   == PL_last_swash_hv &&
1756         klen == PL_last_swash_klen &&
1757         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1758     {
1759         tmps = PL_last_swash_tmps;
1760         slen = PL_last_swash_slen;
1761     }
1762     else {
1763         /* Try our second-level swatch cache, kept in a hash. */
1764         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1765
1766         /* If not cached, generate it via utf8::SWASHGET */
1767         if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1768             dSP;
1769             /* We use utf8n_to_uvuni() as we want an index into
1770                Unicode tables, not a native character number.
1771              */
1772             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1773                                            ckWARN(WARN_UTF8) ?
1774                                            0 : UTF8_ALLOW_ANY);
1775             SV *errsv_save;
1776             ENTER;
1777             SAVETMPS;
1778             save_re_context();
1779             PUSHSTACKi(PERLSI_MAGIC);
1780             PUSHMARK(SP);
1781             EXTEND(SP,3);
1782             PUSHs((SV*)sv);
1783             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1784             PUSHs(sv_2mortal(newSViv((klen) ?
1785                                      (code_point & ~(needents - 1)) : 0)));
1786             PUSHs(sv_2mortal(newSViv(needents)));
1787             PUTBACK;
1788             errsv_save = newSVsv(ERRSV);
1789             if (call_method("SWASHGET", G_SCALAR))
1790                 retval = newSVsv(*PL_stack_sp--);
1791             else
1792                 retval = &PL_sv_undef;
1793             if (!SvTRUE(ERRSV))
1794                 sv_setsv(ERRSV, errsv_save);
1795             SvREFCNT_dec(errsv_save);
1796             POPSTACK;
1797             FREETMPS;
1798             LEAVE;
1799             if (IN_PERL_COMPILETIME)
1800                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1801
1802             svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
1803
1804             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1805                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1806         }
1807
1808         PL_last_swash_hv = hv;
1809         PL_last_swash_klen = klen;
1810         /* FIXME change interpvar.h?  */
1811         PL_last_swash_tmps = (U8 *) tmps;
1812         PL_last_swash_slen = slen;
1813         if (klen)
1814             Copy(ptr, PL_last_swash_key, klen, U8);
1815     }
1816
1817     switch ((int)((slen << 3) / needents)) {
1818     case 1:
1819         bit = 1 << (off & 7);
1820         off >>= 3;
1821         return (tmps[off] & bit) != 0;
1822     case 8:
1823         return tmps[off];
1824     case 16:
1825         off <<= 1;
1826         return (tmps[off] << 8) + tmps[off + 1] ;
1827     case 32:
1828         off <<= 2;
1829         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1830     }
1831     Perl_croak(aTHX_ "panic: swash_fetch");
1832     return 0;
1833 }
1834
1835
1836 /*
1837 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1838
1839 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1840 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
1841 bytes available. The return value is the pointer to the byte after the
1842 end of the new character. In other words,
1843
1844     d = uvchr_to_utf8(d, uv);
1845
1846 is the recommended wide native character-aware way of saying
1847
1848     *(d++) = uv;
1849
1850 =cut
1851 */
1852
1853 /* On ASCII machines this is normally a macro but we want a
1854    real function in case XS code wants it
1855 */
1856 #undef Perl_uvchr_to_utf8
1857 U8 *
1858 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1859 {
1860     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1861 }
1862
1863 U8 *
1864 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1865 {
1866     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1867 }
1868
1869 /*
1870 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1871
1872 Returns the native character value of the first character in the string C<s>
1873 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1874 length, in bytes, of that character.
1875
1876 Allows length and flags to be passed to low level routine.
1877
1878 =cut
1879 */
1880 /* On ASCII machines this is normally a macro but we want
1881    a real function in case XS code wants it
1882 */
1883 #undef Perl_utf8n_to_uvchr
1884 UV
1885 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1886 {
1887     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1888     return UNI_TO_NATIVE(uv);
1889 }
1890
1891 /*
1892 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1893
1894 Build to the scalar dsv a displayable version of the string spv,
1895 length len, the displayable version being at most pvlim bytes long
1896 (if longer, the rest is truncated and "..." will be appended).
1897
1898 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1899 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1900 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1901 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1902 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1903 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1904
1905 The pointer to the PV of the dsv is returned.
1906
1907 =cut */
1908 char *
1909 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1910 {
1911     int truncated = 0;
1912     const char *s, *e;
1913
1914     sv_setpvn(dsv, "", 0);
1915     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1916          UV u;
1917           /* This serves double duty as a flag and a character to print after
1918              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1919           */
1920          char ok = 0;
1921
1922          if (pvlim && SvCUR(dsv) >= pvlim) {
1923               truncated++;
1924               break;
1925          }
1926          u = utf8_to_uvchr((U8*)s, 0);
1927          if (u < 256) {
1928              const unsigned char c = (unsigned char)u & 0xFF;
1929              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1930                  switch (c) {
1931                  case '\n':
1932                      ok = 'n'; break;
1933                  case '\r':
1934                      ok = 'r'; break;
1935                  case '\t':
1936                      ok = 't'; break;
1937                  case '\f':
1938                      ok = 'f'; break;
1939                  case '\a':
1940                      ok = 'a'; break;
1941                  case '\\':
1942                      ok = '\\'; break;
1943                  default: break;
1944                  }
1945                  if (ok) {
1946                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1947                  }
1948              }
1949              /* isPRINT() is the locale-blind version. */
1950              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1951                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1952                  ok = 1;
1953              }
1954          }
1955          if (!ok)
1956              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1957     }
1958     if (truncated)
1959          sv_catpvn(dsv, "...", 3);
1960
1961     return SvPVX(dsv);
1962 }
1963
1964 /*
1965 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1966
1967 Build to the scalar dsv a displayable version of the scalar sv,
1968 the displayable version being at most pvlim bytes long
1969 (if longer, the rest is truncated and "..." will be appended).
1970
1971 The flags argument is as in pv_uni_display().
1972
1973 The pointer to the PV of the dsv is returned.
1974
1975 =cut */
1976 char *
1977 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1978 {
1979      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1980                                 SvCUR(ssv), pvlim, flags);
1981 }
1982
1983 /*
1984 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1985
1986 Return true if the strings s1 and s2 differ case-insensitively, false
1987 if not (if they are equal case-insensitively).  If u1 is true, the
1988 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1989 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1990 are false, the respective string is assumed to be in native 8-bit
1991 encoding.
1992
1993 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1994 in there (they will point at the beginning of the I<next> character).
1995 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1996 pointers beyond which scanning will not continue under any
1997 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1998 s2+l2 will be used as goal end pointers that will also stop the scan,
1999 and which qualify towards defining a successful match: all the scans
2000 that define an explicit length must reach their goal pointers for
2001 a match to succeed).
2002
2003 For case-insensitiveness, the "casefolding" of Unicode is used
2004 instead of upper/lowercasing both the characters, see
2005 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2006
2007 =cut */
2008 I32
2009 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2010 {
2011      register const U8 *p1  = (const U8*)s1;
2012      register const U8 *p2  = (const U8*)s2;
2013      register const U8 *f1 = 0, *f2 = 0;
2014      register U8 *e1 = 0, *q1 = 0;
2015      register U8 *e2 = 0, *q2 = 0;
2016      STRLEN n1 = 0, n2 = 0;
2017      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2018      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2019      U8 natbuf[1+1];
2020      STRLEN foldlen1, foldlen2;
2021      bool match;
2022
2023      if (pe1)
2024           e1 = *(U8**)pe1;
2025      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2026           f1 = (const U8*)s1 + l1;
2027      if (pe2)
2028           e2 = *(U8**)pe2;
2029      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2030           f2 = (const U8*)s2 + l2;
2031
2032      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2033           return 1; /* mismatch; possible infinite loop or false positive */
2034
2035      if (!u1 || !u2)
2036           natbuf[1] = 0; /* Need to terminate the buffer. */
2037
2038      while ((e1 == 0 || p1 < e1) &&
2039             (f1 == 0 || p1 < f1) &&
2040             (e2 == 0 || p2 < e2) &&
2041             (f2 == 0 || p2 < f2)) {
2042           if (n1 == 0) {
2043                if (u1)
2044                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2045                else {
2046                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2047                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2048                }
2049                q1 = foldbuf1;
2050                n1 = foldlen1;
2051           }
2052           if (n2 == 0) {
2053                if (u2)
2054                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2055                else {
2056                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2057                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2058                }
2059                q2 = foldbuf2;
2060                n2 = foldlen2;
2061           }
2062           while (n1 && n2) {
2063                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2064                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2065                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2066                    return 1; /* mismatch */
2067                n1 -= UTF8SKIP(q1);
2068                q1 += UTF8SKIP(q1);
2069                n2 -= UTF8SKIP(q2);
2070                q2 += UTF8SKIP(q2);
2071           }
2072           if (n1 == 0)
2073                p1 += u1 ? UTF8SKIP(p1) : 1;
2074           if (n2 == 0)
2075                p2 += u2 ? UTF8SKIP(p2) : 1;
2076
2077      }
2078
2079      /* A match is defined by all the scans that specified
2080       * an explicit length reaching their final goals. */
2081      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2082
2083      if (match) {
2084           if (pe1)
2085                *pe1 = (char*)p1;
2086           if (pe2)
2087                *pe2 = (char*)p2;
2088      }
2089
2090      return match ? 0 : 1; /* 0 match, 1 mismatch */
2091 }
2092
2093 /*
2094  * Local variables:
2095  * c-indentation-style: bsd
2096  * c-basic-offset: 4
2097  * indent-tabs-mode: t
2098  * End:
2099  *
2100  * ex: set ts=8 sts=4 sw=4 noet:
2101  */