utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 /*
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 This is the "slow" version as opposed to the "fast" version which is
 178 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 179 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 181 you should use the _slow().  In practice this means that the _slow()
 182 will be used very rarely, since the maximum Unicode code point (as of
 183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 185 five bytes or more.
 186
 187 =cut */
 188 STATIC STRLEN
 189 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 190 {
 191     U8 u = *s;
 192     STRLEN slen;
 193     UV uv, ouv;
 194
 195     if (UTF8_IS_INVARIANT(u))
 196         return 1;
 197
 198     if (!UTF8_IS_START(u))
 199         return 0;
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206 #ifdef EBCDIC
 207     u = NATIVE_TO_UTF(u);
 208 #endif
 209     u &= UTF_START_MASK(len);
 210     uv  = u;
 211     ouv = uv;
 212     while (slen--) {
 213         if (!UTF8_IS_CONTINUATION(*s))
 214             return 0;
 215         uv = UTF8_ACCUMULATE(uv, *s);
 216         if (uv < ouv)
 217             return 0;
 218         ouv = uv;
 219         s++;
 220     }
 221
 222     if ((STRLEN)UNISKIP(uv) < len)
 223         return 0;
 224
 225     return len;
 226 }
 227
 228 /*
 229 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 230
 231 Tests if some arbitrary number of bytes begins in a valid UTF-8
 232 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 233 UTF-8 character.  The actual number of bytes in the UTF-8 character
 234 will be returned if it is valid, otherwise 0.
 235
 236 =cut */
 237 STRLEN
 238 Perl_is_utf8_char(pTHX_ const U8 *s)
 239 {
 240     const STRLEN len = UTF8SKIP(s);
 241 #ifdef IS_UTF8_CHAR
 242     if (IS_UTF8_CHAR_FAST(len))
 243         return IS_UTF8_CHAR(s, len) ? len : 0;
 244 #endif /* #ifdef IS_UTF8_CHAR */
 245     return is_utf8_char_slow(s, len);
 246 }
 247
 248 /*
 249 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 250
 251 Returns true if first C<len> bytes of the given string form a valid
 252 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 253 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 254 because a valid ASCII string is a valid UTF-8 string.
 255
 256 See also is_utf8_string_loclen() and is_utf8_string_loc().
 257
 258 =cut
 259 */
 260
 261 bool
 262 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 263 {
 264     const U8* x = s;
 265     const U8* send;
 266
 267     if (!len)
 268         len = strlen((const char *)s);
 269     send = s + len;
 270
 271     while (x < send) {
 272         STRLEN c;
 273          /* Inline the easy bits of is_utf8_char() here for speed... */
 274          if (UTF8_IS_INVARIANT(*x))
 275               c = 1;
 276          else if (!UTF8_IS_START(*x))
 277              goto out;
 278          else {
 279               /* ... and call is_utf8_char() only if really needed. */
 280 #ifdef IS_UTF8_CHAR
 281              c = UTF8SKIP(x);
 282              if (IS_UTF8_CHAR_FAST(c)) {
 283                  if (!IS_UTF8_CHAR(x, c))
 284                      goto out;
 285              } else if (!is_utf8_char_slow(x, c))
 286                  goto out;
 287 #else
 288              c = is_utf8_char(x);
 289 #endif /* #ifdef IS_UTF8_CHAR */
 290               if (!c)
 291                   goto out;
 292          }
 293         x += c;
 294     }
 295
 296  out:
 297     if (x != send)
 298         return FALSE;
 299
 300     return TRUE;
 301 }
 302
 303 /*
 304 Implemented as a macro in utf8.h
 305
 306 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 307
 308 Like is_utf8_string() but stores the location of the failure (in the
 309 case of "utf8ness failure") or the location s+len (in the case of
 310 "utf8ness success") in the C<ep>.
 311
 312 See also is_utf8_string_loclen() and is_utf8_string().
 313
 314 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 315
 316 Like is_utf8_string() but stores the location of the failure (in the
 317 case of "utf8ness failure") or the location s+len (in the case of
 318 "utf8ness success") in the C<ep>, and the number of UTF-8
 319 encoded characters in the C<el>.
 320
 321 See also is_utf8_string_loc() and is_utf8_string().
 322
 323 =cut
 324 */
 325
 326 bool
 327 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 328 {
 329     const U8* x = s;
 330     const U8* send;
 331     STRLEN c;
 332
 333     if (!len)
 334         len = strlen((const char *)s);
 335     send = s + len;
 336     if (el)
 337         *el = 0;
 338
 339     while (x < send) {
 340          /* Inline the easy bits of is_utf8_char() here for speed... */
 341          if (UTF8_IS_INVARIANT(*x))
 342              c = 1;
 343          else if (!UTF8_IS_START(*x))
 344              goto out;
 345          else {
 346              /* ... and call is_utf8_char() only if really needed. */
 347 #ifdef IS_UTF8_CHAR
 348              c = UTF8SKIP(x);
 349              if (IS_UTF8_CHAR_FAST(c)) {
 350                  if (!IS_UTF8_CHAR(x, c))
 351                      c = 0;
 352              } else
 353                  c = is_utf8_char_slow(x, c);
 354 #else
 355              c = is_utf8_char(x);
 356 #endif /* #ifdef IS_UTF8_CHAR */
 357              if (!c)
 358                  goto out;
 359          }
 360          x += c;
 361          if (el)
 362              (*el)++;
 363     }
 364
 365  out:
 366     if (ep)
 367         *ep = x;
 368     if (x != send)
 369         return FALSE;
 370
 371     return TRUE;
 372 }
 373
 374 /*
 375
 376 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 377
 378 Bottom level UTF-8 decode routine.
 379 Returns the unicode code point value of the first character in the string C<s>
 380 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 381 C<retlen> will be set to the length, in bytes, of that character.
 382
 383 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 384 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 385 it is assumed that the caller will raise a warning, and this function
 386 will silently just set C<retlen> to C<-1> and return zero.  If the
 387 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 388 malformations will be given, C<retlen> will be set to the expected
 389 length of the UTF-8 character in bytes, and zero will be returned.
 390
 391 The C<flags> can also contain various flags to allow deviations from
 392 the strict UTF-8 encoding (see F<utf8.h>).
 393
 394 Most code should use utf8_to_uvchr() rather than call this directly.
 395
 396 =cut
 397 */
 398
 399 UV
 400 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 401 {
 402     dVAR;
 403     const U8 *s0 = s;
 404     UV uv = *s, ouv = 0;
 405     STRLEN len = 1;
 406     const bool dowarn = ckWARN_d(WARN_UTF8);
 407     const UV startbyte = *s;
 408     STRLEN expectlen = 0;
 409     U32 warning = 0;
 410
 411 /* This list is a superset of the UTF8_ALLOW_XXX. */
 412
 413 #define UTF8_WARN_EMPTY                          1
 414 #define UTF8_WARN_CONTINUATION                   2
 415 #define UTF8_WARN_NON_CONTINUATION               3
 416 #define UTF8_WARN_FE_FF                          4
 417 #define UTF8_WARN_SHORT                          5
 418 #define UTF8_WARN_OVERFLOW                       6
 419 #define UTF8_WARN_SURROGATE                      7
 420 #define UTF8_WARN_LONG                           8
 421 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 422
 423     if (curlen == 0 &&
 424         !(flags & UTF8_ALLOW_EMPTY)) {
 425         warning = UTF8_WARN_EMPTY;
 426         goto malformed;
 427     }
 428
 429     if (UTF8_IS_INVARIANT(uv)) {
 430         if (retlen)
 431             *retlen = 1;
 432         return (UV) (NATIVE_TO_UTF(*s));
 433     }
 434
 435     if (UTF8_IS_CONTINUATION(uv) &&
 436         !(flags & UTF8_ALLOW_CONTINUATION)) {
 437         warning = UTF8_WARN_CONTINUATION;
 438         goto malformed;
 439     }
 440
 441     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 442         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 443         warning = UTF8_WARN_NON_CONTINUATION;
 444         goto malformed;
 445     }
 446
 447 #ifdef EBCDIC
 448     uv = NATIVE_TO_UTF(uv);
 449 #else
 450     if ((uv == 0xfe || uv == 0xff) &&
 451         !(flags & UTF8_ALLOW_FE_FF)) {
 452         warning = UTF8_WARN_FE_FF;
 453         goto malformed;
 454     }
 455 #endif
 456
 457     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 458     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 459     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 460     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 461 #ifdef EBCDIC
 462     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 463     else                        { len =  7; uv &= 0x01; }
 464 #else
 465     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 466     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 467     else                        { len = 13; uv = 0; } /* whoa! */
 468 #endif
 469
 470     if (retlen)
 471         *retlen = len;
 472
 473     expectlen = len;
 474
 475     if ((curlen < expectlen) &&
 476         !(flags & UTF8_ALLOW_SHORT)) {
 477         warning = UTF8_WARN_SHORT;
 478         goto malformed;
 479     }
 480
 481     len--;
 482     s++;
 483     ouv = uv;
 484
 485     while (len--) {
 486         if (!UTF8_IS_CONTINUATION(*s) &&
 487             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 488             s--;
 489             warning = UTF8_WARN_NON_CONTINUATION;
 490             goto malformed;
 491         }
 492         else
 493             uv = UTF8_ACCUMULATE(uv, *s);
 494         if (!(uv > ouv)) {
 495             /* These cannot be allowed. */
 496             if (uv == ouv) {
 497                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 498                     warning = UTF8_WARN_LONG;
 499                     goto malformed;
 500                 }
 501             }
 502             else { /* uv < ouv */
 503                 /* This cannot be allowed. */
 504                 warning = UTF8_WARN_OVERFLOW;
 505                 goto malformed;
 506             }
 507         }
 508         s++;
 509         ouv = uv;
 510     }
 511
 512     if (UNICODE_IS_SURROGATE(uv) &&
 513         !(flags & UTF8_ALLOW_SURROGATE)) {
 514         warning = UTF8_WARN_SURROGATE;
 515         goto malformed;
 516     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 517                !(flags & UTF8_ALLOW_LONG)) {
 518         warning = UTF8_WARN_LONG;
 519         goto malformed;
 520     } else if (UNICODE_IS_ILLEGAL(uv) &&
 521                !(flags & UTF8_ALLOW_FFFF)) {
 522         warning = UTF8_WARN_FFFF;
 523         goto malformed;
 524     }
 525
 526     return uv;
 527
 528 malformed:
 529
 530     if (flags & UTF8_CHECK_ONLY) {
 531         if (retlen)
 532             *retlen = -1;
 533         return 0;
 534     }
 535
 536     if (dowarn) {
 537         SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
 538
 539         switch (warning) {
 540         case 0: /* Intentionally empty. */ break;
 541         case UTF8_WARN_EMPTY:
 542             sv_catpvs(sv, "(empty string)");
 543             break;
 544         case UTF8_WARN_CONTINUATION:
 545             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 546             break;
 547         case UTF8_WARN_NON_CONTINUATION:
 548             if (s == s0)
 549                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 550                            (UV)s[1], startbyte);
 551             else {
 552                 const int len = (int)(s-s0);
 553                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 554                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 555             }
 556
 557             break;
 558         case UTF8_WARN_FE_FF:
 559             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 560             break;
 561         case UTF8_WARN_SHORT:
 562             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 563                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 564             expectlen = curlen;         /* distance for caller to skip */
 565             break;
 566         case UTF8_WARN_OVERFLOW:
 567             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 568                            ouv, *s, startbyte);
 569             break;
 570         case UTF8_WARN_SURROGATE:
 571             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 572             break;
 573         case UTF8_WARN_LONG:
 574             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 575                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 576             break;
 577         case UTF8_WARN_FFFF:
 578             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 579             break;
 580         default:
 581             sv_catpvs(sv, "(unknown reason)");
 582             break;
 583         }
 584
 585         if (warning) {
 586             const char * const s = SvPVX_const(sv);
 587
 588             if (PL_op)
 589                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 590                             "%s in %s", s,  OP_DESC(PL_op));
 591             else
 592                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 593         }
 594     }
 595
 596     if (retlen)
 597         *retlen = expectlen ? expectlen : len;
 598
 599     return 0;
 600 }
 601
 602 /*
 603 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 604
 605 Returns the native character value of the first character in the string C<s>
 606 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 607 length, in bytes, of that character.
 608
 609 If C<s> does not point to a well-formed UTF-8 character, zero is
 610 returned and retlen is set, if possible, to -1.
 611
 612 =cut
 613 */
 614
 615 UV
 616 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 617 {
 618     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 619                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 620 }
 621
 622 /*
 623 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 624
 625 Returns the Unicode code point of the first character in the string C<s>
 626 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 627 length, in bytes, of that character.
 628
 629 This function should only be used when returned UV is considered
 630 an index into the Unicode semantic tables (e.g. swashes).
 631
 632 If C<s> does not point to a well-formed UTF-8 character, zero is
 633 returned and retlen is set, if possible, to -1.
 634
 635 =cut
 636 */
 637
 638 UV
 639 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 640 {
 641     /* Call the low level routine asking for checks */
 642     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 643                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 644 }
 645
 646 /*
 647 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 648
 649 Return the length of the UTF-8 char encoded string C<s> in characters.
 650 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 651 up past C<e>, croaks.
 652
 653 =cut
 654 */
 655
 656 STRLEN
 657 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 658 {
 659     dVAR;
 660     STRLEN len = 0;
 661
 662     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 663      * the bitops (especially ~) can create illegal UTF-8.
 664      * In other words: in Perl UTF-8 is not just for Unicode. */
 665
 666     if (e < s)
 667         goto warn_and_return;
 668     while (s < e) {
 669         const U8 t = UTF8SKIP(s);
 670         if (e - s < t) {
 671             warn_and_return:
 672             if (ckWARN_d(WARN_UTF8)) {
 673                 if (PL_op)
 674                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 675                             "%s in %s", unees, OP_DESC(PL_op));
 676                 else
 677                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 678             }
 679             return len;
 680         }
 681         s += t;
 682         len++;
 683     }
 684
 685     return len;
 686 }
 687
 688 /*
 689 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 690
 691 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 692 and C<b>.
 693
 694 WARNING: use only if you *know* that the pointers point inside the
 695 same UTF-8 buffer.
 696
 697 =cut
 698 */
 699
 700 IV
 701 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 702 {
 703     dVAR;
 704     IV off = 0;
 705
 706     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 707      * the bitops (especially ~) can create illegal UTF-8.
 708      * In other words: in Perl UTF-8 is not just for Unicode. */
 709
 710     if (a < b) {
 711         while (a < b) {
 712             const U8 c = UTF8SKIP(a);
 713             if (b - a < c)
 714                 goto warn_and_return;
 715             a += c;
 716             off--;
 717         }
 718     }
 719     else {
 720         while (b < a) {
 721             const U8 c = UTF8SKIP(b);
 722
 723             if (a - b < c) {
 724                 warn_and_return:
 725                 if (ckWARN_d(WARN_UTF8)) {
 726                     if (PL_op)
 727                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 728                                     "%s in %s", unees, OP_DESC(PL_op));
 729                     else
 730                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 731                 }
 732                 return off;
 733             }
 734             b += c;
 735             off++;
 736         }
 737     }
 738
 739     return off;
 740 }
 741
 742 /*
 743 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 744
 745 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 746 forward or backward.
 747
 748 WARNING: do not use the following unless you *know* C<off> is within
 749 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 750 on the first byte of character or just after the last byte of a character.
 751
 752 =cut
 753 */
 754
 755 U8 *
 756 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 757 {
 758     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 759      * the bitops (especially ~) can create illegal UTF-8.
 760      * In other words: in Perl UTF-8 is not just for Unicode. */
 761
 762     if (off >= 0) {
 763         while (off--)
 764             s += UTF8SKIP(s);
 765     }
 766     else {
 767         while (off++) {
 768             s--;
 769             while (UTF8_IS_CONTINUATION(*s))
 770                 s--;
 771         }
 772     }
 773     return (U8 *)s;
 774 }
 775
 776 /*
 777 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 778
 779 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 780 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 781 updates len to contain the new length.
 782 Returns zero on failure, setting C<len> to -1.
 783
 784 =cut
 785 */
 786
 787 U8 *
 788 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 789 {
 790     U8 *send;
 791     U8 *d;
 792     U8 *save = s;
 793
 794     /* ensure valid UTF-8 and chars < 256 before updating string */
 795     for (send = s + *len; s < send; ) {
 796         U8 c = *s++;
 797
 798         if (!UTF8_IS_INVARIANT(c) &&
 799             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 800              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 801             *len = -1;
 802             return 0;
 803         }
 804     }
 805
 806     d = s = save;
 807     while (s < send) {
 808         STRLEN ulen;
 809         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 810         s += ulen;
 811     }
 812     *d = '\0';
 813     *len = d - save;
 814     return save;
 815 }
 816
 817 /*
 818 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 819
 820 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 821 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 822 the newly-created string, and updates C<len> to contain the new
 823 length.  Returns the original string if no conversion occurs, C<len>
 824 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 825 0 if C<s> is converted or contains all 7bit characters.
 826
 827 =cut
 828 */
 829
 830 U8 *
 831 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 832 {
 833     U8 *d;
 834     const U8 *start = s;
 835     const U8 *send;
 836     I32 count = 0;
 837
 838     if (!*is_utf8)
 839         return (U8 *)start;
 840
 841     /* ensure valid UTF-8 and chars < 256 before converting string */
 842     for (send = s + *len; s < send;) {
 843         U8 c = *s++;
 844         if (!UTF8_IS_INVARIANT(c)) {
 845             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 846                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 847                 count++;
 848             else
 849                 return (U8 *)start;
 850         }
 851     }
 852
 853     *is_utf8 = 0;
 854
 855     Newx(d, (*len) - count + 1, U8);
 856     s = start; start = d;
 857     while (s < send) {
 858         U8 c = *s++;
 859         if (!UTF8_IS_INVARIANT(c)) {
 860             /* Then it is two-byte encoded */
 861             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 862             c = ASCII_TO_NATIVE(c);
 863         }
 864         *d++ = c;
 865     }
 866     *d = '\0';
 867     *len = d - start;
 868     return (U8 *)start;
 869 }
 870
 871 /*
 872 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 873
 874 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 875 Returns a pointer to the newly-created string, and sets C<len> to
 876 reflect the new length.
 877
 878 If you want to convert to UTF-8 from other encodings than ASCII,
 879 see sv_recode_to_utf8().
 880
 881 =cut
 882 */
 883
 884 U8*
 885 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 886 {
 887     const U8 * const send = s + (*len);
 888     U8 *d;
 889     U8 *dst;
 890
 891     Newx(d, (*len) * 2 + 1, U8);
 892     dst = d;
 893
 894     while (s < send) {
 895         const UV uv = NATIVE_TO_ASCII(*s++);
 896         if (UNI_IS_INVARIANT(uv))
 897             *d++ = (U8)UTF_TO_NATIVE(uv);
 898         else {
 899             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 900             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 901         }
 902     }
 903     *d = '\0';
 904     *len = d-dst;
 905     return dst;
 906 }
 907
 908 /*
 909  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 910  *
 911  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 912  * We optimize for native, for obvious reasons. */
 913
 914 U8*
 915 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 916 {
 917     U8* pend;
 918     U8* dstart = d;
 919
 920     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 921          d[0] = 0;
 922          *newlen = 1;
 923          return d;
 924     }
 925
 926     if (bytelen & 1)
 927         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 928
 929     pend = p + bytelen;
 930
 931     while (p < pend) {
 932         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 933         p += 2;
 934         if (uv < 0x80) {
 935             *d++ = (U8)uv;
 936             continue;
 937         }
 938         if (uv < 0x800) {
 939             *d++ = (U8)(( uv >>  6)         | 0xc0);
 940             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 941             continue;
 942         }
 943         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 944             UV low = (p[0] << 8) + p[1];
 945             p += 2;
 946             if (low < 0xdc00 || low >= 0xdfff)
 947                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 948             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 949         }
 950         if (uv < 0x10000) {
 951             *d++ = (U8)(( uv >> 12)         | 0xe0);
 952             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 953             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 954             continue;
 955         }
 956         else {
 957             *d++ = (U8)(( uv >> 18)         | 0xf0);
 958             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 959             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 960             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 961             continue;
 962         }
 963     }
 964     *newlen = d - dstart;
 965     return d;
 966 }
 967
 968 /* Note: this one is slightly destructive of the source. */
 969
 970 U8*
 971 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 972 {
 973     U8* s = (U8*)p;
 974     U8* send = s + bytelen;
 975     while (s < send) {
 976         U8 tmp = s[0];
 977         s[0] = s[1];
 978         s[1] = tmp;
 979         s += 2;
 980     }
 981     return utf16_to_utf8(p, d, bytelen, newlen);
 982 }
 983
 984 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 985
 986 bool
 987 Perl_is_uni_alnum(pTHX_ UV c)
 988 {
 989     U8 tmpbuf[UTF8_MAXBYTES+1];
 990     uvchr_to_utf8(tmpbuf, c);
 991     return is_utf8_alnum(tmpbuf);
 992 }
 993
 994 bool
 995 Perl_is_uni_alnumc(pTHX_ UV c)
 996 {
 997     U8 tmpbuf[UTF8_MAXBYTES+1];
 998     uvchr_to_utf8(tmpbuf, c);
 999     return is_utf8_alnumc(tmpbuf);
1000 }
1001
1002 bool
1003 Perl_is_uni_idfirst(pTHX_ UV c)
1004 {
1005     U8 tmpbuf[UTF8_MAXBYTES+1];
1006     uvchr_to_utf8(tmpbuf, c);
1007     return is_utf8_idfirst(tmpbuf);
1008 }
1009
1010 bool
1011 Perl_is_uni_alpha(pTHX_ UV c)
1012 {
1013     U8 tmpbuf[UTF8_MAXBYTES+1];
1014     uvchr_to_utf8(tmpbuf, c);
1015     return is_utf8_alpha(tmpbuf);
1016 }
1017
1018 bool
1019 Perl_is_uni_ascii(pTHX_ UV c)
1020 {
1021     U8 tmpbuf[UTF8_MAXBYTES+1];
1022     uvchr_to_utf8(tmpbuf, c);
1023     return is_utf8_ascii(tmpbuf);
1024 }
1025
1026 bool
1027 Perl_is_uni_space(pTHX_ UV c)
1028 {
1029     U8 tmpbuf[UTF8_MAXBYTES+1];
1030     uvchr_to_utf8(tmpbuf, c);
1031     return is_utf8_space(tmpbuf);
1032 }
1033
1034 bool
1035 Perl_is_uni_digit(pTHX_ UV c)
1036 {
1037     U8 tmpbuf[UTF8_MAXBYTES+1];
1038     uvchr_to_utf8(tmpbuf, c);
1039     return is_utf8_digit(tmpbuf);
1040 }
1041
1042 bool
1043 Perl_is_uni_upper(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_upper(tmpbuf);
1048 }
1049
1050 bool
1051 Perl_is_uni_lower(pTHX_ UV c)
1052 {
1053     U8 tmpbuf[UTF8_MAXBYTES+1];
1054     uvchr_to_utf8(tmpbuf, c);
1055     return is_utf8_lower(tmpbuf);
1056 }
1057
1058 bool
1059 Perl_is_uni_cntrl(pTHX_ UV c)
1060 {
1061     U8 tmpbuf[UTF8_MAXBYTES+1];
1062     uvchr_to_utf8(tmpbuf, c);
1063     return is_utf8_cntrl(tmpbuf);
1064 }
1065
1066 bool
1067 Perl_is_uni_graph(pTHX_ UV c)
1068 {
1069     U8 tmpbuf[UTF8_MAXBYTES+1];
1070     uvchr_to_utf8(tmpbuf, c);
1071     return is_utf8_graph(tmpbuf);
1072 }
1073
1074 bool
1075 Perl_is_uni_print(pTHX_ UV c)
1076 {
1077     U8 tmpbuf[UTF8_MAXBYTES+1];
1078     uvchr_to_utf8(tmpbuf, c);
1079     return is_utf8_print(tmpbuf);
1080 }
1081
1082 bool
1083 Perl_is_uni_punct(pTHX_ UV c)
1084 {
1085     U8 tmpbuf[UTF8_MAXBYTES+1];
1086     uvchr_to_utf8(tmpbuf, c);
1087     return is_utf8_punct(tmpbuf);
1088 }
1089
1090 bool
1091 Perl_is_uni_xdigit(pTHX_ UV c)
1092 {
1093     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1094     uvchr_to_utf8(tmpbuf, c);
1095     return is_utf8_xdigit(tmpbuf);
1096 }
1097
1098 UV
1099 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1100 {
1101     uvchr_to_utf8(p, c);
1102     return to_utf8_upper(p, p, lenp);
1103 }
1104
1105 UV
1106 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1107 {
1108     uvchr_to_utf8(p, c);
1109     return to_utf8_title(p, p, lenp);
1110 }
1111
1112 UV
1113 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1114 {
1115     uvchr_to_utf8(p, c);
1116     return to_utf8_lower(p, p, lenp);
1117 }
1118
1119 UV
1120 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1121 {
1122     uvchr_to_utf8(p, c);
1123     return to_utf8_fold(p, p, lenp);
1124 }
1125
1126 /* for now these all assume no locale info available for Unicode > 255 */
1127
1128 bool
1129 Perl_is_uni_alnum_lc(pTHX_ UV c)
1130 {
1131     return is_uni_alnum(c);     /* XXX no locale support yet */
1132 }
1133
1134 bool
1135 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1136 {
1137     return is_uni_alnumc(c);    /* XXX no locale support yet */
1138 }
1139
1140 bool
1141 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1142 {
1143     return is_uni_idfirst(c);   /* XXX no locale support yet */
1144 }
1145
1146 bool
1147 Perl_is_uni_alpha_lc(pTHX_ UV c)
1148 {
1149     return is_uni_alpha(c);     /* XXX no locale support yet */
1150 }
1151
1152 bool
1153 Perl_is_uni_ascii_lc(pTHX_ UV c)
1154 {
1155     return is_uni_ascii(c);     /* XXX no locale support yet */
1156 }
1157
1158 bool
1159 Perl_is_uni_space_lc(pTHX_ UV c)
1160 {
1161     return is_uni_space(c);     /* XXX no locale support yet */
1162 }
1163
1164 bool
1165 Perl_is_uni_digit_lc(pTHX_ UV c)
1166 {
1167     return is_uni_digit(c);     /* XXX no locale support yet */
1168 }
1169
1170 bool
1171 Perl_is_uni_upper_lc(pTHX_ UV c)
1172 {
1173     return is_uni_upper(c);     /* XXX no locale support yet */
1174 }
1175
1176 bool
1177 Perl_is_uni_lower_lc(pTHX_ UV c)
1178 {
1179     return is_uni_lower(c);     /* XXX no locale support yet */
1180 }
1181
1182 bool
1183 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1184 {
1185     return is_uni_cntrl(c);     /* XXX no locale support yet */
1186 }
1187
1188 bool
1189 Perl_is_uni_graph_lc(pTHX_ UV c)
1190 {
1191     return is_uni_graph(c);     /* XXX no locale support yet */
1192 }
1193
1194 bool
1195 Perl_is_uni_print_lc(pTHX_ UV c)
1196 {
1197     return is_uni_print(c);     /* XXX no locale support yet */
1198 }
1199
1200 bool
1201 Perl_is_uni_punct_lc(pTHX_ UV c)
1202 {
1203     return is_uni_punct(c);     /* XXX no locale support yet */
1204 }
1205
1206 bool
1207 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1208 {
1209     return is_uni_xdigit(c);    /* XXX no locale support yet */
1210 }
1211
1212 U32
1213 Perl_to_uni_upper_lc(pTHX_ U32 c)
1214 {
1215     /* XXX returns only the first character -- do not use XXX */
1216     /* XXX no locale support yet */
1217     STRLEN len;
1218     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1219     return (U32)to_uni_upper(c, tmpbuf, &len);
1220 }
1221
1222 U32
1223 Perl_to_uni_title_lc(pTHX_ U32 c)
1224 {
1225     /* XXX returns only the first character XXX -- do not use XXX */
1226     /* XXX no locale support yet */
1227     STRLEN len;
1228     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1229     return (U32)to_uni_title(c, tmpbuf, &len);
1230 }
1231
1232 U32
1233 Perl_to_uni_lower_lc(pTHX_ U32 c)
1234 {
1235     /* XXX returns only the first character -- do not use XXX */
1236     /* XXX no locale support yet */
1237     STRLEN len;
1238     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1239     return (U32)to_uni_lower(c, tmpbuf, &len);
1240 }
1241
1242 static bool
1243 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1244                  const char *const swashname)
1245 {
1246     dVAR;
1247     if (!is_utf8_char(p))
1248         return FALSE;
1249     if (!*swash)
1250         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1251     return swash_fetch(*swash, p, TRUE) != 0;
1252 }
1253
1254 bool
1255 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1256 {
1257     dVAR;
1258     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1259      * descendant of isalnum(3), in other words, it doesn't
1260      * contain the '_'. --jhi */
1261     return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
1262 }
1263
1264 bool
1265 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1266 {
1267     dVAR;
1268     return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
1269 }
1270
1271 bool
1272 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1273 {
1274     dVAR;
1275     if (*p == '_')
1276         return TRUE;
1277     /* is_utf8_idstart would be more logical. */
1278     return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
1279 }
1280
1281 bool
1282 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1283 {
1284     dVAR;
1285     if (*p == '_')
1286         return TRUE;
1287     return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
1288 }
1289
1290 bool
1291 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1292 {
1293     dVAR;
1294     return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
1295 }
1296
1297 bool
1298 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1299 {
1300     dVAR;
1301     return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
1302 }
1303
1304 bool
1305 Perl_is_utf8_space(pTHX_ const U8 *p)
1306 {
1307     dVAR;
1308     return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
1309 }
1310
1311 bool
1312 Perl_is_utf8_digit(pTHX_ const U8 *p)
1313 {
1314     dVAR;
1315     return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
1316 }
1317
1318 bool
1319 Perl_is_utf8_upper(pTHX_ const U8 *p)
1320 {
1321     dVAR;
1322     return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
1323 }
1324
1325 bool
1326 Perl_is_utf8_lower(pTHX_ const U8 *p)
1327 {
1328     dVAR;
1329     return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
1330 }
1331
1332 bool
1333 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1334 {
1335     dVAR;
1336     return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
1337 }
1338
1339 bool
1340 Perl_is_utf8_graph(pTHX_ const U8 *p)
1341 {
1342     dVAR;
1343     return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
1344 }
1345
1346 bool
1347 Perl_is_utf8_print(pTHX_ const U8 *p)
1348 {
1349     dVAR;
1350     return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
1351 }
1352
1353 bool
1354 Perl_is_utf8_punct(pTHX_ const U8 *p)
1355 {
1356     dVAR;
1357     return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
1358 }
1359
1360 bool
1361 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1362 {
1363     dVAR;
1364     return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
1365 }
1366
1367 bool
1368 Perl_is_utf8_mark(pTHX_ const U8 *p)
1369 {
1370     dVAR;
1371     return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
1372 }
1373
1374 /*
1375 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1376
1377 The "p" contains the pointer to the UTF-8 string encoding
1378 the character that is being converted.
1379
1380 The "ustrp" is a pointer to the character buffer to put the
1381 conversion result to.  The "lenp" is a pointer to the length
1382 of the result.
1383
1384 The "swashp" is a pointer to the swash to use.
1385
1386 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1387 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1388 but not always, a multicharacter mapping), is tried first.
1389
1390 The "special" is a string like "utf8::ToSpecLower", which means the
1391 hash %utf8::ToSpecLower.  The access to the hash is through
1392 Perl_to_utf8_case().
1393
1394 The "normal" is a string like "ToLower" which means the swash
1395 %utf8::ToLower.
1396
1397 =cut */
1398
1399 UV
1400 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1401                         SV **swashp, const char *normal, const char *special)
1402 {
1403     dVAR;
1404     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1405     STRLEN len = 0;
1406
1407     const UV uv0 = utf8_to_uvchr(p, NULL);
1408     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1409      * are necessary in EBCDIC, they are redundant no-ops
1410      * in ASCII-ish platforms, and hopefully optimized away. */
1411     const UV uv1 = NATIVE_TO_UNI(uv0);
1412     uvuni_to_utf8(tmpbuf, uv1);
1413
1414     if (!*swashp) /* load on-demand */
1415          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1416
1417     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1418     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1419          /* It might be "special" (sometimes, but not always,
1420           * a multicharacter mapping) */
1421          HV *hv;
1422          SV **svp;
1423
1424          if ((hv  = get_hv(special, FALSE)) &&
1425              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1426              (*svp)) {
1427              const char *s;
1428
1429               s = SvPV_const(*svp, len);
1430               if (len == 1)
1431                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1432               else {
1433 #ifdef EBCDIC
1434                    /* If we have EBCDIC we need to remap the characters
1435                     * since any characters in the low 256 are Unicode
1436                     * code points, not EBCDIC. */
1437                    U8 *t = (U8*)s, *tend = t + len, *d;
1438
1439                    d = tmpbuf;
1440                    if (SvUTF8(*svp)) {
1441                         STRLEN tlen = 0;
1442
1443                         while (t < tend) {
1444                              UV c = utf8_to_uvchr(t, &tlen);
1445                              if (tlen > 0) {
1446                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1447                                   t += tlen;
1448                              }
1449                              else
1450                                   break;
1451                         }
1452                    }
1453                    else {
1454                         while (t < tend) {
1455                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1456                              t++;
1457                         }
1458                    }
1459                    len = d - tmpbuf;
1460                    Copy(tmpbuf, ustrp, len, U8);
1461 #else
1462                    Copy(s, ustrp, len, U8);
1463 #endif
1464               }
1465          }
1466     }
1467
1468     if (!len && *swashp) {
1469          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1470
1471          if (uv2) {
1472               /* It was "normal" (a single character mapping). */
1473               UV uv3 = UNI_TO_NATIVE(uv2);
1474
1475               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1476          }
1477     }
1478
1479     if (!len) /* Neither: just copy. */
1480          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1481
1482     if (lenp)
1483          *lenp = len;
1484
1485     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1486 }
1487
1488 /*
1489 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1490
1491 Convert the UTF-8 encoded character at p to its uppercase version and
1492 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1493 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1494 the uppercase version may be longer than the original character.
1495
1496 The first character of the uppercased version is returned
1497 (but note, as explained above, that there may be more.)
1498
1499 =cut */
1500
1501 UV
1502 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1503 {
1504     dVAR;
1505     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1506                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1507 }
1508
1509 /*
1510 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1511
1512 Convert the UTF-8 encoded character at p to its titlecase version and
1513 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1514 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1515 titlecase version may be longer than the original character.
1516
1517 The first character of the titlecased version is returned
1518 (but note, as explained above, that there may be more.)
1519
1520 =cut */
1521
1522 UV
1523 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1524 {
1525     dVAR;
1526     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1527                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1528 }
1529
1530 /*
1531 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1532
1533 Convert the UTF-8 encoded character at p to its lowercase version and
1534 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1535 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1536 lowercase version may be longer than the original character.
1537
1538 The first character of the lowercased version is returned
1539 (but note, as explained above, that there may be more.)
1540
1541 =cut */
1542
1543 UV
1544 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1545 {
1546     dVAR;
1547     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1548                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1549 }
1550
1551 /*
1552 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1553
1554 Convert the UTF-8 encoded character at p to its foldcase version and
1555 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1556 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1557 foldcase version may be longer than the original character (up to
1558 three characters).
1559
1560 The first character of the foldcased version is returned
1561 (but note, as explained above, that there may be more.)
1562
1563 =cut */
1564
1565 UV
1566 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1567 {
1568     dVAR;
1569     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1570                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1571 }
1572
1573 /* Note:
1574  * A "swash" is a swatch hash.
1575  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1576  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1577  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1578  */
1579 SV*
1580 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1581 {
1582     dVAR;
1583     SV* retval;
1584     SV* const tokenbufsv = sv_newmortal();
1585     dSP;
1586     const size_t pkg_len = strlen(pkg);
1587     const size_t name_len = strlen(name);
1588     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1589     SV* errsv_save;
1590
1591     PUSHSTACKi(PERLSI_MAGIC);
1592     ENTER;
1593     SAVEI32(PL_hints);
1594     PL_hints = 0;
1595     /* It is assumed that callers of this routine are not passing in any
1596        user derived data.  */
1597     SAVEBOOL(PL_tainted);
1598     PL_tainted = 0;
1599     save_re_context();
1600     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1601         ENTER;
1602         errsv_save = newSVsv(ERRSV);
1603         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1604                          NULL);
1605         if (!SvTRUE(ERRSV))
1606             sv_setsv(ERRSV, errsv_save);
1607         SvREFCNT_dec(errsv_save);
1608         LEAVE;
1609     }
1610     SPAGAIN;
1611     PUSHMARK(SP);
1612     EXTEND(SP,5);
1613     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1614     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1615     PUSHs(listsv);
1616     PUSHs(sv_2mortal(newSViv(minbits)));
1617     PUSHs(sv_2mortal(newSViv(none)));
1618     PUTBACK;
1619     if (IN_PERL_COMPILETIME) {
1620         /* XXX ought to be handled by lex_start */
1621         SAVEI32(PL_in_my);
1622         PL_in_my = 0;
1623         sv_setpv(tokenbufsv, PL_tokenbuf);
1624     }
1625     errsv_save = newSVsv(ERRSV);
1626     if (call_method("SWASHNEW", G_SCALAR))
1627         retval = newSVsv(*PL_stack_sp--);
1628     else
1629         retval = &PL_sv_undef;
1630     if (!SvTRUE(ERRSV))
1631         sv_setsv(ERRSV, errsv_save);
1632     SvREFCNT_dec(errsv_save);
1633     LEAVE;
1634     POPSTACK;
1635     if (IN_PERL_COMPILETIME) {
1636         STRLEN len;
1637         const char* const pv = SvPV_const(tokenbufsv, len);
1638
1639         Copy(pv, PL_tokenbuf, len+1, char);
1640         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1641     }
1642     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1643         if (SvPOK(retval))
1644             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1645                        retval);
1646         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1647     }
1648     return retval;
1649 }
1650
1651
1652 /* This API is wrong for special case conversions since we may need to
1653  * return several Unicode characters for a single Unicode character
1654  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1655  * the lower-level routine, and it is similarly broken for returning
1656  * multiple values.  --jhi */
1657 /* Now SWASHGET is recasted into S_swash_get in this file. */
1658
1659 /* Note:
1660  * Returns the value of property/mapping C<swash> for the first character
1661  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1662  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1663  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1664  */
1665 UV
1666 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1667 {
1668     dVAR;
1669     HV* const hv = (HV*)SvRV(swash);
1670     U32 klen;
1671     U32 off;
1672     STRLEN slen;
1673     STRLEN needents;
1674     const U8 *tmps = NULL;
1675     U32 bit;
1676     SV *swatch;
1677     U8 tmputf8[2];
1678     UV c = NATIVE_TO_ASCII(*ptr);
1679
1680     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1681         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1682         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1683         ptr = tmputf8;
1684     }
1685     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1686      * then the "swatch" is a vec() for al the chars which start
1687      * with 0xAA..0xYY
1688      * So the key in the hash (klen) is length of encoded char -1
1689      */
1690     klen = UTF8SKIP(ptr) - 1;
1691     off  = ptr[klen];
1692
1693     if (klen == 0) {
1694       /* If char in invariant then swatch is for all the invariant chars
1695        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1696        */
1697         needents = UTF_CONTINUATION_MARK;
1698         off      = NATIVE_TO_UTF(ptr[klen]);
1699     }
1700     else {
1701       /* If char is encoded then swatch is for the prefix */
1702         needents = (1 << UTF_ACCUMULATION_SHIFT);
1703         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1704     }
1705
1706     /*
1707      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1708      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1709      * it's nothing to sniff at.)  Pity we usually come through at least
1710      * two function calls to get here...
1711      *
1712      * NB: this code assumes that swatches are never modified, once generated!
1713      */
1714
1715     if (hv   == PL_last_swash_hv &&
1716         klen == PL_last_swash_klen &&
1717         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1718     {
1719         tmps = PL_last_swash_tmps;
1720         slen = PL_last_swash_slen;
1721     }
1722     else {
1723         /* Try our second-level swatch cache, kept in a hash. */
1724         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1725
1726         /* If not cached, generate it via swash_get */
1727         if (!svp || !SvPOK(*svp)
1728                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1729             /* We use utf8n_to_uvuni() as we want an index into
1730                Unicode tables, not a native character number.
1731              */
1732             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1733                                            ckWARN(WARN_UTF8) ?
1734                                            0 : UTF8_ALLOW_ANY);
1735             swatch = swash_get(swash,
1736                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1737                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1738                                 needents);
1739
1740             if (IN_PERL_COMPILETIME)
1741                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1742
1743             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1744
1745             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1746                      || (slen << 3) < needents)
1747                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1748         }
1749
1750         PL_last_swash_hv = hv;
1751         PL_last_swash_klen = klen;
1752         /* FIXME change interpvar.h?  */
1753         PL_last_swash_tmps = (U8 *) tmps;
1754         PL_last_swash_slen = slen;
1755         if (klen)
1756             Copy(ptr, PL_last_swash_key, klen, U8);
1757     }
1758
1759     switch ((int)((slen << 3) / needents)) {
1760     case 1:
1761         bit = 1 << (off & 7);
1762         off >>= 3;
1763         return (tmps[off] & bit) != 0;
1764     case 8:
1765         return tmps[off];
1766     case 16:
1767         off <<= 1;
1768         return (tmps[off] << 8) + tmps[off + 1] ;
1769     case 32:
1770         off <<= 2;
1771         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1772     }
1773     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1774     return 0;
1775 }
1776
1777 /* Note:
1778  * Returns a swatch (a bit vector string) for a code point sequence
1779  * that starts from the value C<start> and comprises the number C<span>.
1780  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1781  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1782  */
1783 STATIC SV*
1784 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1785 {
1786     SV *swatch;
1787     U8 *l, *lend, *x, *xend, *s;
1788     STRLEN lcur, xcur, scur;
1789
1790     HV* const hv = (HV*)SvRV(swash);
1791     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1792     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1793     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1794     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1795     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1796     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1797     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1798     const STRLEN bits  = SvUV(*bitssvp);
1799     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1800     const UV     none  = SvUV(*nonesvp);
1801     const UV     end   = start + span;
1802
1803     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1804         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1805                                                  (UV)bits);
1806     }
1807
1808     /* create and initialize $swatch */
1809     swatch = newSVpvs("");
1810     scur   = octets ? (span * octets) : (span + 7) / 8;
1811     SvGROW(swatch, scur + 1);
1812     s = (U8*)SvPVX(swatch);
1813     if (octets && none) {
1814         const U8* const e = s + scur;
1815         while (s < e) {
1816             if (bits == 8)
1817                 *s++ = (U8)(none & 0xff);
1818             else if (bits == 16) {
1819                 *s++ = (U8)((none >>  8) & 0xff);
1820                 *s++ = (U8)( none        & 0xff);
1821             }
1822             else if (bits == 32) {
1823                 *s++ = (U8)((none >> 24) & 0xff);
1824                 *s++ = (U8)((none >> 16) & 0xff);
1825                 *s++ = (U8)((none >>  8) & 0xff);
1826                 *s++ = (U8)( none        & 0xff);
1827             }
1828         }
1829         *s = '\0';
1830     }
1831     else {
1832         (void)memzero((U8*)s, scur + 1);
1833     }
1834     SvCUR_set(swatch, scur);
1835     s = (U8*)SvPVX(swatch);
1836
1837     /* read $swash->{LIST} */
1838     l = (U8*)SvPV(*listsvp, lcur);
1839     lend = l + lcur;
1840     while (l < lend) {
1841         UV min, max, val, key;
1842         STRLEN numlen;
1843         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1844
1845         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1846
1847         numlen = lend - l;
1848         min = grok_hex((char *)l, &numlen, &flags, NULL);
1849         if (numlen)
1850             l += numlen;
1851         else if (nl) {
1852             l = nl + 1; /* 1 is length of "\n" */
1853             continue;
1854         }
1855         else {
1856             l = lend; /* to LIST's end at which \n is not found */
1857             break;
1858         }
1859
1860         if (isBLANK(*l)) {
1861             ++l;
1862             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1863             numlen = lend - l;
1864             max = grok_hex((char *)l, &numlen, &flags, NULL);
1865             if (numlen)
1866                 l += numlen;
1867             else
1868                 max = min;
1869
1870             if (octets) {
1871                 if (isBLANK(*l)) {
1872                     ++l;
1873                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1874                             PERL_SCAN_DISALLOW_PREFIX;
1875                     numlen = lend - l;
1876                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1877                     if (numlen)
1878                         l += numlen;
1879                     else
1880                         val = 0;
1881                 }
1882                 else {
1883                     val = 0;
1884                     if (typeto) {
1885                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1886                                          typestr, l);
1887                     }
1888                 }
1889             }
1890             else
1891                 val = 0; /* bits == 1, then val should be ignored */
1892         }
1893         else {
1894             max = min;
1895             if (octets) {
1896                 val = 0;
1897                 if (typeto) {
1898                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1899                 }
1900             }
1901             else
1902                 val = 0; /* bits == 1, then val should be ignored */
1903         }
1904
1905         if (nl)
1906             l = nl + 1;
1907         else
1908             l = lend;
1909
1910         if (max < start)
1911             continue;
1912
1913         if (octets) {
1914             if (min < start) {
1915                 if (!none || val < none) {
1916                     val += start - min;
1917                 }
1918                 min = start;
1919             }
1920             for (key = min; key <= max; key++) {
1921                 STRLEN offset;
1922                 if (key >= end)
1923                     goto go_out_list;
1924                 /* offset must be non-negative (start <= min <= key < end) */
1925                 offset = octets * (key - start);
1926                 if (bits == 8)
1927                     s[offset] = (U8)(val & 0xff);
1928                 else if (bits == 16) {
1929                     s[offset    ] = (U8)((val >>  8) & 0xff);
1930                     s[offset + 1] = (U8)( val        & 0xff);
1931                 }
1932                 else if (bits == 32) {
1933                     s[offset    ] = (U8)((val >> 24) & 0xff);
1934                     s[offset + 1] = (U8)((val >> 16) & 0xff);
1935                     s[offset + 2] = (U8)((val >>  8) & 0xff);
1936                     s[offset + 3] = (U8)( val        & 0xff);
1937                 }
1938
1939                 if (!none || val < none)
1940                     ++val;
1941             }
1942         }
1943         else { /* bits == 1, then val should be ignored */
1944             if (min < start)
1945                 min = start;
1946             for (key = min; key <= max; key++) {
1947                 const STRLEN offset = (STRLEN)(key - start);
1948                 if (key >= end)
1949                     goto go_out_list;
1950                 s[offset >> 3] |= 1 << (offset & 7);
1951             }
1952         }
1953     } /* while */
1954   go_out_list:
1955
1956     /* read $swash->{EXTRAS} */
1957     x = (U8*)SvPV(*extssvp, xcur);
1958     xend = x + xcur;
1959     while (x < xend) {
1960         STRLEN namelen;
1961         U8 *namestr;
1962         SV** othersvp;
1963         HV* otherhv;
1964         STRLEN otherbits;
1965         SV **otherbitssvp, *other;
1966         U8 *s, *o, *nl;
1967         STRLEN slen, olen;
1968
1969         U8 opc = *x++;
1970         if (opc == '\n')
1971             continue;
1972
1973         nl = (U8*)memchr(x, '\n', xend - x);
1974
1975         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1976             if (nl) {
1977                 x = nl + 1; /* 1 is length of "\n" */
1978                 continue;
1979             }
1980             else {
1981                 x = xend; /* to EXTRAS' end at which \n is not found */
1982                 break;
1983             }
1984         }
1985
1986         namestr = x;
1987         if (nl) {
1988             namelen = nl - namestr;
1989             x = nl + 1;
1990         }
1991         else {
1992             namelen = xend - namestr;
1993             x = xend;
1994         }
1995
1996         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1997         otherhv = (HV*)SvRV(*othersvp);
1998         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1999         otherbits = (STRLEN)SvUV(*otherbitssvp);
2000         if (bits < otherbits)
2001             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2002
2003         /* The "other" swatch must be destroyed after. */
2004         other = swash_get(*othersvp, start, span);
2005         o = (U8*)SvPV(other, olen);
2006
2007         if (!olen)
2008             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2009
2010         s = (U8*)SvPV(swatch, slen);
2011         if (bits == 1 && otherbits == 1) {
2012             if (slen != olen)
2013                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2014
2015             switch (opc) {
2016             case '+':
2017                 while (slen--)
2018                     *s++ |= *o++;
2019                 break;
2020             case '!':
2021                 while (slen--)
2022                     *s++ |= ~*o++;
2023                 break;
2024             case '-':
2025                 while (slen--)
2026                     *s++ &= ~*o++;
2027                 break;
2028             case '&':
2029                 while (slen--)
2030                     *s++ &= *o++;
2031                 break;
2032             default:
2033                 break;
2034             }
2035         }
2036         else {
2037             STRLEN otheroctets = otherbits >> 3;
2038             STRLEN offset = 0;
2039             U8* send = s + slen;
2040
2041             while (s < send) {
2042                 UV otherval = 0;
2043
2044                 if (otherbits == 1) {
2045                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2046                     ++offset;
2047                 }
2048                 else {
2049                     STRLEN vlen = otheroctets;
2050                     otherval = *o++;
2051                     while (--vlen) {
2052                         otherval <<= 8;
2053                         otherval |= *o++;
2054                     }
2055                 }
2056
2057                 if (opc == '+' && otherval)
2058                     /*EMPTY*/;   /* replace with otherval */
2059                 else if (opc == '!' && !otherval)
2060                     otherval = 1;
2061                 else if (opc == '-' && otherval)
2062                     otherval = 0;
2063                 else if (opc == '&' && !otherval)
2064                     otherval = 0;
2065                 else {
2066                     s += octets; /* no replacement */
2067                     continue;
2068                 }
2069
2070                 if (bits == 8)
2071                     *s++ = (U8)( otherval & 0xff);
2072                 else if (bits == 16) {
2073                     *s++ = (U8)((otherval >>  8) & 0xff);
2074                     *s++ = (U8)( otherval        & 0xff);
2075                 }
2076                 else if (bits == 32) {
2077                     *s++ = (U8)((otherval >> 24) & 0xff);
2078                     *s++ = (U8)((otherval >> 16) & 0xff);
2079                     *s++ = (U8)((otherval >>  8) & 0xff);
2080                     *s++ = (U8)( otherval        & 0xff);
2081                 }
2082             }
2083         }
2084         sv_free(other); /* through with it! */
2085     } /* while */
2086     return swatch;
2087 }
2088
2089 /*
2090 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2091
2092 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2093 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2094 bytes available. The return value is the pointer to the byte after the
2095 end of the new character. In other words,
2096
2097     d = uvchr_to_utf8(d, uv);
2098
2099 is the recommended wide native character-aware way of saying
2100
2101     *(d++) = uv;
2102
2103 =cut
2104 */
2105
2106 /* On ASCII machines this is normally a macro but we want a
2107    real function in case XS code wants it
2108 */
2109 U8 *
2110 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2111 {
2112     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2113 }
2114
2115 U8 *
2116 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2117 {
2118     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2119 }
2120
2121 /*
2122 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2123 flags
2124
2125 Returns the native character value of the first character in the string
2126 C<s>
2127 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2128 length, in bytes, of that character.
2129
2130 Allows length and flags to be passed to low level routine.
2131
2132 =cut
2133 */
2134 /* On ASCII machines this is normally a macro but we want
2135    a real function in case XS code wants it
2136 */
2137 UV
2138 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2139 U32 flags)
2140 {
2141     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2142     return UNI_TO_NATIVE(uv);
2143 }
2144
2145 /*
2146 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2147
2148 Build to the scalar dsv a displayable version of the string spv,
2149 length len, the displayable version being at most pvlim bytes long
2150 (if longer, the rest is truncated and "..." will be appended).
2151
2152 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2153 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2154 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2155 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2156 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2157 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2158
2159 The pointer to the PV of the dsv is returned.
2160
2161 =cut */
2162 char *
2163 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2164 {
2165     int truncated = 0;
2166     const char *s, *e;
2167
2168     sv_setpvn(dsv, "", 0);
2169     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2170          UV u;
2171           /* This serves double duty as a flag and a character to print after
2172              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2173           */
2174          char ok = 0;
2175
2176          if (pvlim && SvCUR(dsv) >= pvlim) {
2177               truncated++;
2178               break;
2179          }
2180          u = utf8_to_uvchr((U8*)s, 0);
2181          if (u < 256) {
2182              const unsigned char c = (unsigned char)u & 0xFF;
2183              if (flags & UNI_DISPLAY_BACKSLASH) {
2184                  switch (c) {
2185                  case '\n':
2186                      ok = 'n'; break;
2187                  case '\r':
2188                      ok = 'r'; break;
2189                  case '\t':
2190                      ok = 't'; break;
2191                  case '\f':
2192                      ok = 'f'; break;
2193                  case '\a':
2194                      ok = 'a'; break;
2195                  case '\\':
2196                      ok = '\\'; break;
2197                  default: break;
2198                  }
2199                  if (ok) {
2200                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2201                  }
2202              }
2203              /* isPRINT() is the locale-blind version. */
2204              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2205                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2206                  ok = 1;
2207              }
2208          }
2209          if (!ok)
2210              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2211     }
2212     if (truncated)
2213          sv_catpvs(dsv, "...");
2214
2215     return SvPVX(dsv);
2216 }
2217
2218 /*
2219 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2220
2221 Build to the scalar dsv a displayable version of the scalar sv,
2222 the displayable version being at most pvlim bytes long
2223 (if longer, the rest is truncated and "..." will be appended).
2224
2225 The flags argument is as in pv_uni_display().
2226
2227 The pointer to the PV of the dsv is returned.
2228
2229 =cut */
2230 char *
2231 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2232 {
2233      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2234                                 SvCUR(ssv), pvlim, flags);
2235 }
2236
2237 /*
2238 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2239
2240 Return true if the strings s1 and s2 differ case-insensitively, false
2241 if not (if they are equal case-insensitively).  If u1 is true, the
2242 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2243 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2244 are false, the respective string is assumed to be in native 8-bit
2245 encoding.
2246
2247 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2248 in there (they will point at the beginning of the I<next> character).
2249 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2250 pointers beyond which scanning will not continue under any
2251 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2252 s2+l2 will be used as goal end pointers that will also stop the scan,
2253 and which qualify towards defining a successful match: all the scans
2254 that define an explicit length must reach their goal pointers for
2255 a match to succeed).
2256
2257 For case-insensitiveness, the "casefolding" of Unicode is used
2258 instead of upper/lowercasing both the characters, see
2259 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2260
2261 =cut */
2262 I32
2263 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2264 {
2265      dVAR;
2266      register const U8 *p1  = (const U8*)s1;
2267      register const U8 *p2  = (const U8*)s2;
2268      register const U8 *f1 = NULL;
2269      register const U8 *f2 = NULL;
2270      register U8 *e1 = NULL;
2271      register U8 *q1 = NULL;
2272      register U8 *e2 = NULL;
2273      register U8 *q2 = NULL;
2274      STRLEN n1 = 0, n2 = 0;
2275      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2276      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2277      U8 natbuf[1+1];
2278      STRLEN foldlen1, foldlen2;
2279      bool match;
2280
2281      if (pe1)
2282           e1 = *(U8**)pe1;
2283      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2284           f1 = (const U8*)s1 + l1;
2285      if (pe2)
2286           e2 = *(U8**)pe2;
2287      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2288           f2 = (const U8*)s2 + l2;
2289
2290      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2291           return 1; /* mismatch; possible infinite loop or false positive */
2292
2293      if (!u1 || !u2)
2294           natbuf[1] = 0; /* Need to terminate the buffer. */
2295
2296      while ((e1 == 0 || p1 < e1) &&
2297             (f1 == 0 || p1 < f1) &&
2298             (e2 == 0 || p2 < e2) &&
2299             (f2 == 0 || p2 < f2)) {
2300           if (n1 == 0) {
2301                if (u1)
2302                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2303                else {
2304                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2305                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2306                }
2307                q1 = foldbuf1;
2308                n1 = foldlen1;
2309           }
2310           if (n2 == 0) {
2311                if (u2)
2312                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2313                else {
2314                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2315                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2316                }
2317                q2 = foldbuf2;
2318                n2 = foldlen2;
2319           }
2320           while (n1 && n2) {
2321                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2322                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2323                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2324                    return 1; /* mismatch */
2325                n1 -= UTF8SKIP(q1);
2326                q1 += UTF8SKIP(q1);
2327                n2 -= UTF8SKIP(q2);
2328                q2 += UTF8SKIP(q2);
2329           }
2330           if (n1 == 0)
2331                p1 += u1 ? UTF8SKIP(p1) : 1;
2332           if (n2 == 0)
2333                p2 += u2 ? UTF8SKIP(p2) : 1;
2334
2335      }
2336
2337      /* A match is defined by all the scans that specified
2338       * an explicit length reaching their final goals. */
2339      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2340
2341      if (match) {
2342           if (pe1)
2343                *pe1 = (char*)p1;
2344           if (pe2)
2345                *pe2 = (char*)p2;
2346      }
2347
2348      return match ? 0 : 1; /* 0 match, 1 mismatch */
2349 }
2350
2351 /*
2352  * Local variables:
2353  * c-indentation-style: bsd
2354  * c-basic-offset: 4
2355  * indent-tabs-mode: t
2356  * End:
2357  *
2358  * ex: set ts=8 sts=4 sw=4 noet:
2359  */