utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 /*
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 This is the "slow" version as opposed to the "fast" version which is
 178 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 179 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 181 you should use the _slow().  In practice this means that the _slow()
 182 will be used very rarely, since the maximum Unicode code point (as of
 183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 185 five bytes or more.
 186
 187 =cut */
 188 STATIC STRLEN
 189 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 190 {
 191     U8 u = *s;
 192     STRLEN slen;
 193     UV uv, ouv;
 194
 195     if (UTF8_IS_INVARIANT(u))
 196         return 1;
 197
 198     if (!UTF8_IS_START(u))
 199         return 0;
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206 #ifdef EBCDIC
 207     u = NATIVE_TO_UTF(u);
 208 #endif
 209     u &= UTF_START_MASK(len);
 210     uv  = u;
 211     ouv = uv;
 212     while (slen--) {
 213         if (!UTF8_IS_CONTINUATION(*s))
 214             return 0;
 215         uv = UTF8_ACCUMULATE(uv, *s);
 216         if (uv < ouv)
 217             return 0;
 218         ouv = uv;
 219         s++;
 220     }
 221
 222     if ((STRLEN)UNISKIP(uv) < len)
 223         return 0;
 224
 225     return len;
 226 }
 227
 228 /*
 229 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 230
 231 Tests if some arbitrary number of bytes begins in a valid UTF-8
 232 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 233 UTF-8 character.  The actual number of bytes in the UTF-8 character
 234 will be returned if it is valid, otherwise 0.
 235
 236 =cut */
 237 STRLEN
 238 Perl_is_utf8_char(pTHX_ const U8 *s)
 239 {
 240     const STRLEN len = UTF8SKIP(s);
 241 #ifdef IS_UTF8_CHAR
 242     if (IS_UTF8_CHAR_FAST(len))
 243         return IS_UTF8_CHAR(s, len) ? len : 0;
 244 #endif /* #ifdef IS_UTF8_CHAR */
 245     return is_utf8_char_slow(s, len);
 246 }
 247
 248 /*
 249 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 250
 251 Returns true if first C<len> bytes of the given string form a valid
 252 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 253 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 254 because a valid ASCII string is a valid UTF-8 string.
 255
 256 See also is_utf8_string_loclen() and is_utf8_string_loc().
 257
 258 =cut
 259 */
 260
 261 bool
 262 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 263 {
 264     const U8* x = s;
 265     const U8* send;
 266
 267     if (!len)
 268         len = strlen((const char *)s);
 269     send = s + len;
 270
 271     while (x < send) {
 272         STRLEN c;
 273          /* Inline the easy bits of is_utf8_char() here for speed... */
 274          if (UTF8_IS_INVARIANT(*x))
 275               c = 1;
 276          else if (!UTF8_IS_START(*x))
 277              goto out;
 278          else {
 279               /* ... and call is_utf8_char() only if really needed. */
 280 #ifdef IS_UTF8_CHAR
 281              c = UTF8SKIP(x);
 282              if (IS_UTF8_CHAR_FAST(c)) {
 283                  if (!IS_UTF8_CHAR(x, c))
 284                      goto out;
 285              } else if (!is_utf8_char_slow(x, c))
 286                  goto out;
 287 #else
 288              c = is_utf8_char(x);
 289 #endif /* #ifdef IS_UTF8_CHAR */
 290               if (!c)
 291                   goto out;
 292          }
 293         x += c;
 294     }
 295
 296  out:
 297     if (x != send)
 298         return FALSE;
 299
 300     return TRUE;
 301 }
 302
 303 /*
 304 Implemented as a macro in utf8.h
 305
 306 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 307
 308 Like is_utf8_string() but stores the location of the failure (in the
 309 case of "utf8ness failure") or the location s+len (in the case of
 310 "utf8ness success") in the C<ep>.
 311
 312 See also is_utf8_string_loclen() and is_utf8_string().
 313
 314 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 315
 316 Like is_utf8_string() but stores the location of the failure (in the
 317 case of "utf8ness failure") or the location s+len (in the case of
 318 "utf8ness success") in the C<ep>, and the number of UTF-8
 319 encoded characters in the C<el>.
 320
 321 See also is_utf8_string_loc() and is_utf8_string().
 322
 323 =cut
 324 */
 325
 326 bool
 327 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 328 {
 329     const U8* x = s;
 330     const U8* send;
 331     STRLEN c;
 332
 333     if (!len)
 334         len = strlen((const char *)s);
 335     send = s + len;
 336     if (el)
 337         *el = 0;
 338
 339     while (x < send) {
 340          /* Inline the easy bits of is_utf8_char() here for speed... */
 341          if (UTF8_IS_INVARIANT(*x))
 342              c = 1;
 343          else if (!UTF8_IS_START(*x))
 344              goto out;
 345          else {
 346              /* ... and call is_utf8_char() only if really needed. */
 347 #ifdef IS_UTF8_CHAR
 348              c = UTF8SKIP(x);
 349              if (IS_UTF8_CHAR_FAST(c)) {
 350                  if (!IS_UTF8_CHAR(x, c))
 351                      c = 0;
 352              } else
 353                  c = is_utf8_char_slow(x, c);
 354 #else
 355              c = is_utf8_char(x);
 356 #endif /* #ifdef IS_UTF8_CHAR */
 357              if (!c)
 358                  goto out;
 359          }
 360          x += c;
 361          if (el)
 362              (*el)++;
 363     }
 364
 365  out:
 366     if (ep)
 367         *ep = x;
 368     if (x != send)
 369         return FALSE;
 370
 371     return TRUE;
 372 }
 373
 374 /*
 375
 376 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 377
 378 Bottom level UTF-8 decode routine.
 379 Returns the unicode code point value of the first character in the string C<s>
 380 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 381 C<retlen> will be set to the length, in bytes, of that character.
 382
 383 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 384 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 385 it is assumed that the caller will raise a warning, and this function
 386 will silently just set C<retlen> to C<-1> and return zero.  If the
 387 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 388 malformations will be given, C<retlen> will be set to the expected
 389 length of the UTF-8 character in bytes, and zero will be returned.
 390
 391 The C<flags> can also contain various flags to allow deviations from
 392 the strict UTF-8 encoding (see F<utf8.h>).
 393
 394 Most code should use utf8_to_uvchr() rather than call this directly.
 395
 396 =cut
 397 */
 398
 399 UV
 400 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 401 {
 402     dVAR;
 403     const U8 *s0 = s;
 404     UV uv = *s, ouv = 0;
 405     STRLEN len = 1;
 406     const bool dowarn = ckWARN_d(WARN_UTF8);
 407     const UV startbyte = *s;
 408     STRLEN expectlen = 0;
 409     U32 warning = 0;
 410
 411 /* This list is a superset of the UTF8_ALLOW_XXX. */
 412
 413 #define UTF8_WARN_EMPTY                          1
 414 #define UTF8_WARN_CONTINUATION                   2
 415 #define UTF8_WARN_NON_CONTINUATION               3
 416 #define UTF8_WARN_FE_FF                          4
 417 #define UTF8_WARN_SHORT                          5
 418 #define UTF8_WARN_OVERFLOW                       6
 419 #define UTF8_WARN_SURROGATE                      7
 420 #define UTF8_WARN_LONG                           8
 421 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 422
 423     if (curlen == 0 &&
 424         !(flags & UTF8_ALLOW_EMPTY)) {
 425         warning = UTF8_WARN_EMPTY;
 426         goto malformed;
 427     }
 428
 429     if (UTF8_IS_INVARIANT(uv)) {
 430         if (retlen)
 431             *retlen = 1;
 432         return (UV) (NATIVE_TO_UTF(*s));
 433     }
 434
 435     if (UTF8_IS_CONTINUATION(uv) &&
 436         !(flags & UTF8_ALLOW_CONTINUATION)) {
 437         warning = UTF8_WARN_CONTINUATION;
 438         goto malformed;
 439     }
 440
 441     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 442         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 443         warning = UTF8_WARN_NON_CONTINUATION;
 444         goto malformed;
 445     }
 446
 447 #ifdef EBCDIC
 448     uv = NATIVE_TO_UTF(uv);
 449 #else
 450     if ((uv == 0xfe || uv == 0xff) &&
 451         !(flags & UTF8_ALLOW_FE_FF)) {
 452         warning = UTF8_WARN_FE_FF;
 453         goto malformed;
 454     }
 455 #endif
 456
 457     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 458     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 459     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 460     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 461 #ifdef EBCDIC
 462     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 463     else                        { len =  7; uv &= 0x01; }
 464 #else
 465     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 466     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 467     else                        { len = 13; uv = 0; } /* whoa! */
 468 #endif
 469
 470     if (retlen)
 471         *retlen = len;
 472
 473     expectlen = len;
 474
 475     if ((curlen < expectlen) &&
 476         !(flags & UTF8_ALLOW_SHORT)) {
 477         warning = UTF8_WARN_SHORT;
 478         goto malformed;
 479     }
 480
 481     len--;
 482     s++;
 483     ouv = uv;
 484
 485     while (len--) {
 486         if (!UTF8_IS_CONTINUATION(*s) &&
 487             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 488             s--;
 489             warning = UTF8_WARN_NON_CONTINUATION;
 490             goto malformed;
 491         }
 492         else
 493             uv = UTF8_ACCUMULATE(uv, *s);
 494         if (!(uv > ouv)) {
 495             /* These cannot be allowed. */
 496             if (uv == ouv) {
 497                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 498                     warning = UTF8_WARN_LONG;
 499                     goto malformed;
 500                 }
 501             }
 502             else { /* uv < ouv */
 503                 /* This cannot be allowed. */
 504                 warning = UTF8_WARN_OVERFLOW;
 505                 goto malformed;
 506             }
 507         }
 508         s++;
 509         ouv = uv;
 510     }
 511
 512     if (UNICODE_IS_SURROGATE(uv) &&
 513         !(flags & UTF8_ALLOW_SURROGATE)) {
 514         warning = UTF8_WARN_SURROGATE;
 515         goto malformed;
 516     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 517                !(flags & UTF8_ALLOW_LONG)) {
 518         warning = UTF8_WARN_LONG;
 519         goto malformed;
 520     } else if (UNICODE_IS_ILLEGAL(uv) &&
 521                !(flags & UTF8_ALLOW_FFFF)) {
 522         warning = UTF8_WARN_FFFF;
 523         goto malformed;
 524     }
 525
 526     return uv;
 527
 528 malformed:
 529
 530     if (flags & UTF8_CHECK_ONLY) {
 531         if (retlen)
 532             *retlen = -1;
 533         return 0;
 534     }
 535
 536     if (dowarn) {
 537         SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
 538
 539         switch (warning) {
 540         case 0: /* Intentionally empty. */ break;
 541         case UTF8_WARN_EMPTY:
 542             sv_catpvs(sv, "(empty string)");
 543             break;
 544         case UTF8_WARN_CONTINUATION:
 545             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 546             break;
 547         case UTF8_WARN_NON_CONTINUATION:
 548             if (s == s0)
 549                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 550                            (UV)s[1], startbyte);
 551             else {
 552                 const int len = (int)(s-s0);
 553                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 554                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 555             }
 556
 557             break;
 558         case UTF8_WARN_FE_FF:
 559             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 560             break;
 561         case UTF8_WARN_SHORT:
 562             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 563                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 564             expectlen = curlen;         /* distance for caller to skip */
 565             break;
 566         case UTF8_WARN_OVERFLOW:
 567             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 568                            ouv, *s, startbyte);
 569             break;
 570         case UTF8_WARN_SURROGATE:
 571             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 572             break;
 573         case UTF8_WARN_LONG:
 574             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 575                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 576             break;
 577         case UTF8_WARN_FFFF:
 578             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 579             break;
 580         default:
 581             sv_catpvs(sv, "(unknown reason)");
 582             break;
 583         }
 584
 585         if (warning) {
 586             const char * const s = SvPVX_const(sv);
 587
 588             if (PL_op)
 589                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 590                             "%s in %s", s,  OP_DESC(PL_op));
 591             else
 592                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 593         }
 594     }
 595
 596     if (retlen)
 597         *retlen = expectlen ? expectlen : len;
 598
 599     return 0;
 600 }
 601
 602 /*
 603 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 604
 605 Returns the native character value of the first character in the string C<s>
 606 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 607 length, in bytes, of that character.
 608
 609 If C<s> does not point to a well-formed UTF-8 character, zero is
 610 returned and retlen is set, if possible, to -1.
 611
 612 =cut
 613 */
 614
 615 UV
 616 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 617 {
 618     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 619                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 620 }
 621
 622 /*
 623 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 624
 625 Returns the Unicode code point of the first character in the string C<s>
 626 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 627 length, in bytes, of that character.
 628
 629 This function should only be used when returned UV is considered
 630 an index into the Unicode semantic tables (e.g. swashes).
 631
 632 If C<s> does not point to a well-formed UTF-8 character, zero is
 633 returned and retlen is set, if possible, to -1.
 634
 635 =cut
 636 */
 637
 638 UV
 639 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 640 {
 641     /* Call the low level routine asking for checks */
 642     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 643                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 644 }
 645
 646 /*
 647 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 648
 649 Return the length of the UTF-8 char encoded string C<s> in characters.
 650 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 651 up past C<e>, croaks.
 652
 653 =cut
 654 */
 655
 656 STRLEN
 657 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 658 {
 659     dVAR;
 660     STRLEN len = 0;
 661
 662     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 663      * the bitops (especially ~) can create illegal UTF-8.
 664      * In other words: in Perl UTF-8 is not just for Unicode. */
 665
 666     if (e < s)
 667         goto warn_and_return;
 668     while (s < e) {
 669         const U8 t = UTF8SKIP(s);
 670         if (e - s < t) {
 671             warn_and_return:
 672             if (ckWARN_d(WARN_UTF8)) {
 673                 if (PL_op)
 674                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 675                             "%s in %s", unees, OP_DESC(PL_op));
 676                 else
 677                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 678             }
 679             return len;
 680         }
 681         s += t;
 682         len++;
 683     }
 684
 685     return len;
 686 }
 687
 688 /*
 689 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 690
 691 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 692 and C<b>.
 693
 694 WARNING: use only if you *know* that the pointers point inside the
 695 same UTF-8 buffer.
 696
 697 =cut
 698 */
 699
 700 IV
 701 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 702 {
 703     dVAR;
 704     IV off = 0;
 705
 706     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 707      * the bitops (especially ~) can create illegal UTF-8.
 708      * In other words: in Perl UTF-8 is not just for Unicode. */
 709
 710     if (a < b) {
 711         while (a < b) {
 712             const U8 c = UTF8SKIP(a);
 713             if (b - a < c)
 714                 goto warn_and_return;
 715             a += c;
 716             off--;
 717         }
 718     }
 719     else {
 720         while (b < a) {
 721             const U8 c = UTF8SKIP(b);
 722
 723             if (a - b < c) {
 724                 warn_and_return:
 725                 if (ckWARN_d(WARN_UTF8)) {
 726                     if (PL_op)
 727                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 728                                     "%s in %s", unees, OP_DESC(PL_op));
 729                     else
 730                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 731                 }
 732                 return off;
 733             }
 734             b += c;
 735             off++;
 736         }
 737     }
 738
 739     return off;
 740 }
 741
 742 /*
 743 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 744
 745 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 746 forward or backward.
 747
 748 WARNING: do not use the following unless you *know* C<off> is within
 749 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 750 on the first byte of character or just after the last byte of a character.
 751
 752 =cut
 753 */
 754
 755 U8 *
 756 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 757 {
 758     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 759      * the bitops (especially ~) can create illegal UTF-8.
 760      * In other words: in Perl UTF-8 is not just for Unicode. */
 761
 762     if (off >= 0) {
 763         while (off--)
 764             s += UTF8SKIP(s);
 765     }
 766     else {
 767         while (off++) {
 768             s--;
 769             while (UTF8_IS_CONTINUATION(*s))
 770                 s--;
 771         }
 772     }
 773     return (U8 *)s;
 774 }
 775
 776 /*
 777 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 778
 779 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 780 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 781 updates len to contain the new length.
 782 Returns zero on failure, setting C<len> to -1.
 783
 784 =cut
 785 */
 786
 787 U8 *
 788 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 789 {
 790     U8 *send;
 791     U8 *d;
 792     U8 *save = s;
 793
 794     /* ensure valid UTF-8 and chars < 256 before updating string */
 795     for (send = s + *len; s < send; ) {
 796         U8 c = *s++;
 797
 798         if (!UTF8_IS_INVARIANT(c) &&
 799             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 800              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 801             *len = -1;
 802             return 0;
 803         }
 804     }
 805
 806     d = s = save;
 807     while (s < send) {
 808         STRLEN ulen;
 809         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 810         s += ulen;
 811     }
 812     *d = '\0';
 813     *len = d - save;
 814     return save;
 815 }
 816
 817 /*
 818 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 819
 820 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 821 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 822 the newly-created string, and updates C<len> to contain the new
 823 length.  Returns the original string if no conversion occurs, C<len>
 824 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 825 0 if C<s> is converted or contains all 7bit characters.
 826
 827 =cut
 828 */
 829
 830 U8 *
 831 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 832 {
 833     U8 *d;
 834     const U8 *start = s;
 835     const U8 *send;
 836     I32 count = 0;
 837
 838     if (!*is_utf8)
 839         return (U8 *)start;
 840
 841     /* ensure valid UTF-8 and chars < 256 before converting string */
 842     for (send = s + *len; s < send;) {
 843         U8 c = *s++;
 844         if (!UTF8_IS_INVARIANT(c)) {
 845             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 846                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 847                 count++;
 848             else
 849                 return (U8 *)start;
 850         }
 851     }
 852
 853     *is_utf8 = 0;
 854
 855     Newx(d, (*len) - count + 1, U8);
 856     s = start; start = d;
 857     while (s < send) {
 858         U8 c = *s++;
 859         if (!UTF8_IS_INVARIANT(c)) {
 860             /* Then it is two-byte encoded */
 861             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 862             c = ASCII_TO_NATIVE(c);
 863         }
 864         *d++ = c;
 865     }
 866     *d = '\0';
 867     *len = d - start;
 868     return (U8 *)start;
 869 }
 870
 871 /*
 872 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 873
 874 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 875 Returns a pointer to the newly-created string, and sets C<len> to
 876 reflect the new length.
 877
 878 If you want to convert to UTF-8 from other encodings than ASCII,
 879 see sv_recode_to_utf8().
 880
 881 =cut
 882 */
 883
 884 U8*
 885 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 886 {
 887     const U8 * const send = s + (*len);
 888     U8 *d;
 889     U8 *dst;
 890
 891     Newx(d, (*len) * 2 + 1, U8);
 892     dst = d;
 893
 894     while (s < send) {
 895         const UV uv = NATIVE_TO_ASCII(*s++);
 896         if (UNI_IS_INVARIANT(uv))
 897             *d++ = (U8)UTF_TO_NATIVE(uv);
 898         else {
 899             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 900             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 901         }
 902     }
 903     *d = '\0';
 904     *len = d-dst;
 905     return dst;
 906 }
 907
 908 /*
 909  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 910  *
 911  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 912  * We optimize for native, for obvious reasons. */
 913
 914 U8*
 915 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 916 {
 917     U8* pend;
 918     U8* dstart = d;
 919
 920     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 921          d[0] = 0;
 922          *newlen = 1;
 923          return d;
 924     }
 925
 926     if (bytelen & 1)
 927         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 928
 929     pend = p + bytelen;
 930
 931     while (p < pend) {
 932         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 933         p += 2;
 934         if (uv < 0x80) {
 935             *d++ = (U8)uv;
 936             continue;
 937         }
 938         if (uv < 0x800) {
 939             *d++ = (U8)(( uv >>  6)         | 0xc0);
 940             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 941             continue;
 942         }
 943         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 944             UV low = (p[0] << 8) + p[1];
 945             p += 2;
 946             if (low < 0xdc00 || low >= 0xdfff)
 947                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 948             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 949         }
 950         if (uv < 0x10000) {
 951             *d++ = (U8)(( uv >> 12)         | 0xe0);
 952             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 953             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 954             continue;
 955         }
 956         else {
 957             *d++ = (U8)(( uv >> 18)         | 0xf0);
 958             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 959             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 960             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 961             continue;
 962         }
 963     }
 964     *newlen = d - dstart;
 965     return d;
 966 }
 967
 968 /* Note: this one is slightly destructive of the source. */
 969
 970 U8*
 971 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 972 {
 973     U8* s = (U8*)p;
 974     U8* send = s + bytelen;
 975     while (s < send) {
 976         U8 tmp = s[0];
 977         s[0] = s[1];
 978         s[1] = tmp;
 979         s += 2;
 980     }
 981     return utf16_to_utf8(p, d, bytelen, newlen);
 982 }
 983
 984 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 985
 986 bool
 987 Perl_is_uni_alnum(pTHX_ UV c)
 988 {
 989     U8 tmpbuf[UTF8_MAXBYTES+1];
 990     uvchr_to_utf8(tmpbuf, c);
 991     return is_utf8_alnum(tmpbuf);
 992 }
 993
 994 bool
 995 Perl_is_uni_alnumc(pTHX_ UV c)
 996 {
 997     U8 tmpbuf[UTF8_MAXBYTES+1];
 998     uvchr_to_utf8(tmpbuf, c);
 999     return is_utf8_alnumc(tmpbuf);
1000 }
1001
1002 bool
1003 Perl_is_uni_idfirst(pTHX_ UV c)
1004 {
1005     U8 tmpbuf[UTF8_MAXBYTES+1];
1006     uvchr_to_utf8(tmpbuf, c);
1007     return is_utf8_idfirst(tmpbuf);
1008 }
1009
1010 bool
1011 Perl_is_uni_alpha(pTHX_ UV c)
1012 {
1013     U8 tmpbuf[UTF8_MAXBYTES+1];
1014     uvchr_to_utf8(tmpbuf, c);
1015     return is_utf8_alpha(tmpbuf);
1016 }
1017
1018 bool
1019 Perl_is_uni_ascii(pTHX_ UV c)
1020 {
1021     U8 tmpbuf[UTF8_MAXBYTES+1];
1022     uvchr_to_utf8(tmpbuf, c);
1023     return is_utf8_ascii(tmpbuf);
1024 }
1025
1026 bool
1027 Perl_is_uni_space(pTHX_ UV c)
1028 {
1029     U8 tmpbuf[UTF8_MAXBYTES+1];
1030     uvchr_to_utf8(tmpbuf, c);
1031     return is_utf8_space(tmpbuf);
1032 }
1033
1034 bool
1035 Perl_is_uni_digit(pTHX_ UV c)
1036 {
1037     U8 tmpbuf[UTF8_MAXBYTES+1];
1038     uvchr_to_utf8(tmpbuf, c);
1039     return is_utf8_digit(tmpbuf);
1040 }
1041
1042 bool
1043 Perl_is_uni_upper(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_upper(tmpbuf);
1048 }
1049
1050 bool
1051 Perl_is_uni_lower(pTHX_ UV c)
1052 {
1053     U8 tmpbuf[UTF8_MAXBYTES+1];
1054     uvchr_to_utf8(tmpbuf, c);
1055     return is_utf8_lower(tmpbuf);
1056 }
1057
1058 bool
1059 Perl_is_uni_cntrl(pTHX_ UV c)
1060 {
1061     U8 tmpbuf[UTF8_MAXBYTES+1];
1062     uvchr_to_utf8(tmpbuf, c);
1063     return is_utf8_cntrl(tmpbuf);
1064 }
1065
1066 bool
1067 Perl_is_uni_graph(pTHX_ UV c)
1068 {
1069     U8 tmpbuf[UTF8_MAXBYTES+1];
1070     uvchr_to_utf8(tmpbuf, c);
1071     return is_utf8_graph(tmpbuf);
1072 }
1073
1074 bool
1075 Perl_is_uni_print(pTHX_ UV c)
1076 {
1077     U8 tmpbuf[UTF8_MAXBYTES+1];
1078     uvchr_to_utf8(tmpbuf, c);
1079     return is_utf8_print(tmpbuf);
1080 }
1081
1082 bool
1083 Perl_is_uni_punct(pTHX_ UV c)
1084 {
1085     U8 tmpbuf[UTF8_MAXBYTES+1];
1086     uvchr_to_utf8(tmpbuf, c);
1087     return is_utf8_punct(tmpbuf);
1088 }
1089
1090 bool
1091 Perl_is_uni_xdigit(pTHX_ UV c)
1092 {
1093     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1094     uvchr_to_utf8(tmpbuf, c);
1095     return is_utf8_xdigit(tmpbuf);
1096 }
1097
1098 UV
1099 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1100 {
1101     uvchr_to_utf8(p, c);
1102     return to_utf8_upper(p, p, lenp);
1103 }
1104
1105 UV
1106 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1107 {
1108     uvchr_to_utf8(p, c);
1109     return to_utf8_title(p, p, lenp);
1110 }
1111
1112 UV
1113 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1114 {
1115     uvchr_to_utf8(p, c);
1116     return to_utf8_lower(p, p, lenp);
1117 }
1118
1119 UV
1120 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1121 {
1122     uvchr_to_utf8(p, c);
1123     return to_utf8_fold(p, p, lenp);
1124 }
1125
1126 /* for now these all assume no locale info available for Unicode > 255 */
1127
1128 bool
1129 Perl_is_uni_alnum_lc(pTHX_ UV c)
1130 {
1131     return is_uni_alnum(c);     /* XXX no locale support yet */
1132 }
1133
1134 bool
1135 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1136 {
1137     return is_uni_alnumc(c);    /* XXX no locale support yet */
1138 }
1139
1140 bool
1141 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1142 {
1143     return is_uni_idfirst(c);   /* XXX no locale support yet */
1144 }
1145
1146 bool
1147 Perl_is_uni_alpha_lc(pTHX_ UV c)
1148 {
1149     return is_uni_alpha(c);     /* XXX no locale support yet */
1150 }
1151
1152 bool
1153 Perl_is_uni_ascii_lc(pTHX_ UV c)
1154 {
1155     return is_uni_ascii(c);     /* XXX no locale support yet */
1156 }
1157
1158 bool
1159 Perl_is_uni_space_lc(pTHX_ UV c)
1160 {
1161     return is_uni_space(c);     /* XXX no locale support yet */
1162 }
1163
1164 bool
1165 Perl_is_uni_digit_lc(pTHX_ UV c)
1166 {
1167     return is_uni_digit(c);     /* XXX no locale support yet */
1168 }
1169
1170 bool
1171 Perl_is_uni_upper_lc(pTHX_ UV c)
1172 {
1173     return is_uni_upper(c);     /* XXX no locale support yet */
1174 }
1175
1176 bool
1177 Perl_is_uni_lower_lc(pTHX_ UV c)
1178 {
1179     return is_uni_lower(c);     /* XXX no locale support yet */
1180 }
1181
1182 bool
1183 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1184 {
1185     return is_uni_cntrl(c);     /* XXX no locale support yet */
1186 }
1187
1188 bool
1189 Perl_is_uni_graph_lc(pTHX_ UV c)
1190 {
1191     return is_uni_graph(c);     /* XXX no locale support yet */
1192 }
1193
1194 bool
1195 Perl_is_uni_print_lc(pTHX_ UV c)
1196 {
1197     return is_uni_print(c);     /* XXX no locale support yet */
1198 }
1199
1200 bool
1201 Perl_is_uni_punct_lc(pTHX_ UV c)
1202 {
1203     return is_uni_punct(c);     /* XXX no locale support yet */
1204 }
1205
1206 bool
1207 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1208 {
1209     return is_uni_xdigit(c);    /* XXX no locale support yet */
1210 }
1211
1212 U32
1213 Perl_to_uni_upper_lc(pTHX_ U32 c)
1214 {
1215     /* XXX returns only the first character -- do not use XXX */
1216     /* XXX no locale support yet */
1217     STRLEN len;
1218     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1219     return (U32)to_uni_upper(c, tmpbuf, &len);
1220 }
1221
1222 U32
1223 Perl_to_uni_title_lc(pTHX_ U32 c)
1224 {
1225     /* XXX returns only the first character XXX -- do not use XXX */
1226     /* XXX no locale support yet */
1227     STRLEN len;
1228     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1229     return (U32)to_uni_title(c, tmpbuf, &len);
1230 }
1231
1232 U32
1233 Perl_to_uni_lower_lc(pTHX_ U32 c)
1234 {
1235     /* XXX returns only the first character -- do not use XXX */
1236     /* XXX no locale support yet */
1237     STRLEN len;
1238     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1239     return (U32)to_uni_lower(c, tmpbuf, &len);
1240 }
1241
1242 static bool
1243 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1244                  const char *const swashname)
1245 {
1246     dVAR;
1247     if (!is_utf8_char(p))
1248         return FALSE;
1249     if (!*swash)
1250         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1251     return swash_fetch(*swash, p, TRUE) != 0;
1252 }
1253
1254 bool
1255 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1256 {
1257     dVAR;
1258     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1259      * descendant of isalnum(3), in other words, it doesn't
1260      * contain the '_'. --jhi */
1261     return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
1262 }
1263
1264 bool
1265 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1266 {
1267     dVAR;
1268     return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
1269 }
1270
1271 bool
1272 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1273 {
1274     dVAR;
1275     if (*p == '_')
1276         return TRUE;
1277     /* is_utf8_idstart would be more logical. */
1278     return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
1279 }
1280
1281 bool
1282 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1283 {
1284     dVAR;
1285     if (*p == '_')
1286         return TRUE;
1287     return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
1288 }
1289
1290 bool
1291 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1292 {
1293     dVAR;
1294     return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
1295 }
1296
1297 bool
1298 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1299 {
1300     dVAR;
1301     return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
1302 }
1303
1304 bool
1305 Perl_is_utf8_space(pTHX_ const U8 *p)
1306 {
1307     dVAR;
1308     return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
1309 }
1310
1311 bool
1312 Perl_is_utf8_digit(pTHX_ const U8 *p)
1313 {
1314     dVAR;
1315     return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
1316 }
1317
1318 bool
1319 Perl_is_utf8_upper(pTHX_ const U8 *p)
1320 {
1321     dVAR;
1322     return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
1323 }
1324
1325 bool
1326 Perl_is_utf8_lower(pTHX_ const U8 *p)
1327 {
1328     dVAR;
1329     return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
1330 }
1331
1332 bool
1333 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1334 {
1335     dVAR;
1336     return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
1337 }
1338
1339 bool
1340 Perl_is_utf8_graph(pTHX_ const U8 *p)
1341 {
1342     dVAR;
1343     return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
1344 }
1345
1346 bool
1347 Perl_is_utf8_print(pTHX_ const U8 *p)
1348 {
1349     dVAR;
1350     return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
1351 }
1352
1353 bool
1354 Perl_is_utf8_punct(pTHX_ const U8 *p)
1355 {
1356     dVAR;
1357     return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
1358 }
1359
1360 bool
1361 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1362 {
1363     dVAR;
1364     return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
1365 }
1366
1367 bool
1368 Perl_is_utf8_mark(pTHX_ const U8 *p)
1369 {
1370     dVAR;
1371     return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
1372 }
1373
1374 /*
1375 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1376
1377 The "p" contains the pointer to the UTF-8 string encoding
1378 the character that is being converted.
1379
1380 The "ustrp" is a pointer to the character buffer to put the
1381 conversion result to.  The "lenp" is a pointer to the length
1382 of the result.
1383
1384 The "swashp" is a pointer to the swash to use.
1385
1386 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1387 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1388 but not always, a multicharacter mapping), is tried first.
1389
1390 The "special" is a string like "utf8::ToSpecLower", which means the
1391 hash %utf8::ToSpecLower.  The access to the hash is through
1392 Perl_to_utf8_case().
1393
1394 The "normal" is a string like "ToLower" which means the swash
1395 %utf8::ToLower.
1396
1397 =cut */
1398
1399 UV
1400 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1401                         SV **swashp, const char *normal, const char *special)
1402 {
1403     dVAR;
1404     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1405     STRLEN len = 0;
1406
1407     const UV uv0 = utf8_to_uvchr(p, NULL);
1408     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1409      * are necessary in EBCDIC, they are redundant no-ops
1410      * in ASCII-ish platforms, and hopefully optimized away. */
1411     const UV uv1 = NATIVE_TO_UNI(uv0);
1412     uvuni_to_utf8(tmpbuf, uv1);
1413
1414     if (!*swashp) /* load on-demand */
1415          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1416
1417     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1418     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1419          /* It might be "special" (sometimes, but not always,
1420           * a multicharacter mapping) */
1421          HV *hv;
1422          SV **svp;
1423
1424          if ((hv  = get_hv(special, FALSE)) &&
1425              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1426              (*svp)) {
1427              const char *s;
1428
1429               s = SvPV_const(*svp, len);
1430               if (len == 1)
1431                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1432               else {
1433 #ifdef EBCDIC
1434                    /* If we have EBCDIC we need to remap the characters
1435                     * since any characters in the low 256 are Unicode
1436                     * code points, not EBCDIC. */
1437                    U8 *t = (U8*)s, *tend = t + len, *d;
1438
1439                    d = tmpbuf;
1440                    if (SvUTF8(*svp)) {
1441                         STRLEN tlen = 0;
1442
1443                         while (t < tend) {
1444                              UV c = utf8_to_uvchr(t, &tlen);
1445                              if (tlen > 0) {
1446                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1447                                   t += tlen;
1448                              }
1449                              else
1450                                   break;
1451                         }
1452                    }
1453                    else {
1454                         while (t < tend) {
1455                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1456                              t++;
1457                         }
1458                    }
1459                    len = d - tmpbuf;
1460                    Copy(tmpbuf, ustrp, len, U8);
1461 #else
1462                    Copy(s, ustrp, len, U8);
1463 #endif
1464               }
1465          }
1466     }
1467
1468     if (!len && *swashp) {
1469          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1470
1471          if (uv2) {
1472               /* It was "normal" (a single character mapping). */
1473               UV uv3 = UNI_TO_NATIVE(uv2);
1474
1475               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1476          }
1477     }
1478
1479     if (!len) /* Neither: just copy. */
1480          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1481
1482     if (lenp)
1483          *lenp = len;
1484
1485     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1486 }
1487
1488 /*
1489 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1490
1491 Convert the UTF-8 encoded character at p to its uppercase version and
1492 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1493 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1494 the uppercase version may be longer than the original character.
1495
1496 The first character of the uppercased version is returned
1497 (but note, as explained above, that there may be more.)
1498
1499 =cut */
1500
1501 UV
1502 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1503 {
1504     dVAR;
1505     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1506                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1507 }
1508
1509 /*
1510 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1511
1512 Convert the UTF-8 encoded character at p to its titlecase version and
1513 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1514 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1515 titlecase version may be longer than the original character.
1516
1517 The first character of the titlecased version is returned
1518 (but note, as explained above, that there may be more.)
1519
1520 =cut */
1521
1522 UV
1523 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1524 {
1525     dVAR;
1526     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1527                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1528 }
1529
1530 /*
1531 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1532
1533 Convert the UTF-8 encoded character at p to its lowercase version and
1534 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1535 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1536 lowercase version may be longer than the original character.
1537
1538 The first character of the lowercased version is returned
1539 (but note, as explained above, that there may be more.)
1540
1541 =cut */
1542
1543 UV
1544 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1545 {
1546     dVAR;
1547     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1548                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1549 }
1550
1551 /*
1552 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1553
1554 Convert the UTF-8 encoded character at p to its foldcase version and
1555 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1556 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1557 foldcase version may be longer than the original character (up to
1558 three characters).
1559
1560 The first character of the foldcased version is returned
1561 (but note, as explained above, that there may be more.)
1562
1563 =cut */
1564
1565 UV
1566 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1567 {
1568     dVAR;
1569     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1570                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1571 }
1572
1573 /* Note:
1574  * A "swash" is a swatch hash.
1575  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1576  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1577  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1578  */
1579 SV*
1580 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1581 {
1582     dVAR;
1583     SV* retval;
1584     SV* const tokenbufsv = sv_newmortal();
1585     dSP;
1586     const size_t pkg_len = strlen(pkg);
1587     const size_t name_len = strlen(name);
1588     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1589     SV* errsv_save;
1590
1591     PUSHSTACKi(PERLSI_MAGIC);
1592     ENTER;
1593     SAVEI32(PL_hints);
1594     PL_hints = 0;
1595     save_re_context();
1596     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1597         ENTER;
1598         errsv_save = newSVsv(ERRSV);
1599         /* It is assumed that callers of this routine are not passing in any
1600            user derived data.  */
1601         /* Need to do this after save_re_context() as it will set PL_tainted to
1602            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1603            Even line to create errsv_save can turn on PL_tainted.  */
1604         SAVEBOOL(PL_tainted);
1605         PL_tainted = 0;
1606         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1607                          NULL);
1608         if (!SvTRUE(ERRSV))
1609             sv_setsv(ERRSV, errsv_save);
1610         SvREFCNT_dec(errsv_save);
1611         LEAVE;
1612     }
1613     SPAGAIN;
1614     PUSHMARK(SP);
1615     EXTEND(SP,5);
1616     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1617     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1618     PUSHs(listsv);
1619     PUSHs(sv_2mortal(newSViv(minbits)));
1620     PUSHs(sv_2mortal(newSViv(none)));
1621     PUTBACK;
1622     if (IN_PERL_COMPILETIME) {
1623         /* XXX ought to be handled by lex_start */
1624         SAVEI32(PL_in_my);
1625         PL_in_my = 0;
1626         sv_setpv(tokenbufsv, PL_tokenbuf);
1627     }
1628     errsv_save = newSVsv(ERRSV);
1629     if (call_method("SWASHNEW", G_SCALAR))
1630         retval = newSVsv(*PL_stack_sp--);
1631     else
1632         retval = &PL_sv_undef;
1633     if (!SvTRUE(ERRSV))
1634         sv_setsv(ERRSV, errsv_save);
1635     SvREFCNT_dec(errsv_save);
1636     LEAVE;
1637     POPSTACK;
1638     if (IN_PERL_COMPILETIME) {
1639         STRLEN len;
1640         const char* const pv = SvPV_const(tokenbufsv, len);
1641
1642         Copy(pv, PL_tokenbuf, len+1, char);
1643         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1644     }
1645     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1646         if (SvPOK(retval))
1647             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1648                        retval);
1649         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1650     }
1651     return retval;
1652 }
1653
1654
1655 /* This API is wrong for special case conversions since we may need to
1656  * return several Unicode characters for a single Unicode character
1657  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1658  * the lower-level routine, and it is similarly broken for returning
1659  * multiple values.  --jhi */
1660 /* Now SWASHGET is recasted into S_swash_get in this file. */
1661
1662 /* Note:
1663  * Returns the value of property/mapping C<swash> for the first character
1664  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1665  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1666  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1667  */
1668 UV
1669 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1670 {
1671     dVAR;
1672     HV* const hv = (HV*)SvRV(swash);
1673     U32 klen;
1674     U32 off;
1675     STRLEN slen;
1676     STRLEN needents;
1677     const U8 *tmps = NULL;
1678     U32 bit;
1679     SV *swatch;
1680     U8 tmputf8[2];
1681     UV c = NATIVE_TO_ASCII(*ptr);
1682
1683     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1684         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1685         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1686         ptr = tmputf8;
1687     }
1688     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1689      * then the "swatch" is a vec() for al the chars which start
1690      * with 0xAA..0xYY
1691      * So the key in the hash (klen) is length of encoded char -1
1692      */
1693     klen = UTF8SKIP(ptr) - 1;
1694     off  = ptr[klen];
1695
1696     if (klen == 0) {
1697       /* If char in invariant then swatch is for all the invariant chars
1698        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1699        */
1700         needents = UTF_CONTINUATION_MARK;
1701         off      = NATIVE_TO_UTF(ptr[klen]);
1702     }
1703     else {
1704       /* If char is encoded then swatch is for the prefix */
1705         needents = (1 << UTF_ACCUMULATION_SHIFT);
1706         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1707     }
1708
1709     /*
1710      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1711      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1712      * it's nothing to sniff at.)  Pity we usually come through at least
1713      * two function calls to get here...
1714      *
1715      * NB: this code assumes that swatches are never modified, once generated!
1716      */
1717
1718     if (hv   == PL_last_swash_hv &&
1719         klen == PL_last_swash_klen &&
1720         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1721     {
1722         tmps = PL_last_swash_tmps;
1723         slen = PL_last_swash_slen;
1724     }
1725     else {
1726         /* Try our second-level swatch cache, kept in a hash. */
1727         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1728
1729         /* If not cached, generate it via swash_get */
1730         if (!svp || !SvPOK(*svp)
1731                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1732             /* We use utf8n_to_uvuni() as we want an index into
1733                Unicode tables, not a native character number.
1734              */
1735             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1736                                            ckWARN(WARN_UTF8) ?
1737                                            0 : UTF8_ALLOW_ANY);
1738             swatch = swash_get(swash,
1739                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1740                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1741                                 needents);
1742
1743             if (IN_PERL_COMPILETIME)
1744                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1745
1746             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1747
1748             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1749                      || (slen << 3) < needents)
1750                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1751         }
1752
1753         PL_last_swash_hv = hv;
1754         PL_last_swash_klen = klen;
1755         /* FIXME change interpvar.h?  */
1756         PL_last_swash_tmps = (U8 *) tmps;
1757         PL_last_swash_slen = slen;
1758         if (klen)
1759             Copy(ptr, PL_last_swash_key, klen, U8);
1760     }
1761
1762     switch ((int)((slen << 3) / needents)) {
1763     case 1:
1764         bit = 1 << (off & 7);
1765         off >>= 3;
1766         return (tmps[off] & bit) != 0;
1767     case 8:
1768         return tmps[off];
1769     case 16:
1770         off <<= 1;
1771         return (tmps[off] << 8) + tmps[off + 1] ;
1772     case 32:
1773         off <<= 2;
1774         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1775     }
1776     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1777     return 0;
1778 }
1779
1780 /* Note:
1781  * Returns a swatch (a bit vector string) for a code point sequence
1782  * that starts from the value C<start> and comprises the number C<span>.
1783  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1784  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1785  */
1786 STATIC SV*
1787 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1788 {
1789     SV *swatch;
1790     U8 *l, *lend, *x, *xend, *s;
1791     STRLEN lcur, xcur, scur;
1792
1793     HV* const hv = (HV*)SvRV(swash);
1794     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1795     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1796     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1797     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1798     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1799     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1800     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1801     const STRLEN bits  = SvUV(*bitssvp);
1802     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1803     const UV     none  = SvUV(*nonesvp);
1804     const UV     end   = start + span;
1805
1806     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1807         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1808                                                  (UV)bits);
1809     }
1810
1811     /* create and initialize $swatch */
1812     swatch = newSVpvs("");
1813     scur   = octets ? (span * octets) : (span + 7) / 8;
1814     SvGROW(swatch, scur + 1);
1815     s = (U8*)SvPVX(swatch);
1816     if (octets && none) {
1817         const U8* const e = s + scur;
1818         while (s < e) {
1819             if (bits == 8)
1820                 *s++ = (U8)(none & 0xff);
1821             else if (bits == 16) {
1822                 *s++ = (U8)((none >>  8) & 0xff);
1823                 *s++ = (U8)( none        & 0xff);
1824             }
1825             else if (bits == 32) {
1826                 *s++ = (U8)((none >> 24) & 0xff);
1827                 *s++ = (U8)((none >> 16) & 0xff);
1828                 *s++ = (U8)((none >>  8) & 0xff);
1829                 *s++ = (U8)( none        & 0xff);
1830             }
1831         }
1832         *s = '\0';
1833     }
1834     else {
1835         (void)memzero((U8*)s, scur + 1);
1836     }
1837     SvCUR_set(swatch, scur);
1838     s = (U8*)SvPVX(swatch);
1839
1840     /* read $swash->{LIST} */
1841     l = (U8*)SvPV(*listsvp, lcur);
1842     lend = l + lcur;
1843     while (l < lend) {
1844         UV min, max, val, key;
1845         STRLEN numlen;
1846         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1847
1848         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1849
1850         numlen = lend - l;
1851         min = grok_hex((char *)l, &numlen, &flags, NULL);
1852         if (numlen)
1853             l += numlen;
1854         else if (nl) {
1855             l = nl + 1; /* 1 is length of "\n" */
1856             continue;
1857         }
1858         else {
1859             l = lend; /* to LIST's end at which \n is not found */
1860             break;
1861         }
1862
1863         if (isBLANK(*l)) {
1864             ++l;
1865             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1866             numlen = lend - l;
1867             max = grok_hex((char *)l, &numlen, &flags, NULL);
1868             if (numlen)
1869                 l += numlen;
1870             else
1871                 max = min;
1872
1873             if (octets) {
1874                 if (isBLANK(*l)) {
1875                     ++l;
1876                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1877                             PERL_SCAN_DISALLOW_PREFIX;
1878                     numlen = lend - l;
1879                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1880                     if (numlen)
1881                         l += numlen;
1882                     else
1883                         val = 0;
1884                 }
1885                 else {
1886                     val = 0;
1887                     if (typeto) {
1888                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1889                                          typestr, l);
1890                     }
1891                 }
1892             }
1893             else
1894                 val = 0; /* bits == 1, then val should be ignored */
1895         }
1896         else {
1897             max = min;
1898             if (octets) {
1899                 val = 0;
1900                 if (typeto) {
1901                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1902                 }
1903             }
1904             else
1905                 val = 0; /* bits == 1, then val should be ignored */
1906         }
1907
1908         if (nl)
1909             l = nl + 1;
1910         else
1911             l = lend;
1912
1913         if (max < start)
1914             continue;
1915
1916         if (octets) {
1917             if (min < start) {
1918                 if (!none || val < none) {
1919                     val += start - min;
1920                 }
1921                 min = start;
1922             }
1923             for (key = min; key <= max; key++) {
1924                 STRLEN offset;
1925                 if (key >= end)
1926                     goto go_out_list;
1927                 /* offset must be non-negative (start <= min <= key < end) */
1928                 offset = octets * (key - start);
1929                 if (bits == 8)
1930                     s[offset] = (U8)(val & 0xff);
1931                 else if (bits == 16) {
1932                     s[offset    ] = (U8)((val >>  8) & 0xff);
1933                     s[offset + 1] = (U8)( val        & 0xff);
1934                 }
1935                 else if (bits == 32) {
1936                     s[offset    ] = (U8)((val >> 24) & 0xff);
1937                     s[offset + 1] = (U8)((val >> 16) & 0xff);
1938                     s[offset + 2] = (U8)((val >>  8) & 0xff);
1939                     s[offset + 3] = (U8)( val        & 0xff);
1940                 }
1941
1942                 if (!none || val < none)
1943                     ++val;
1944             }
1945         }
1946         else { /* bits == 1, then val should be ignored */
1947             if (min < start)
1948                 min = start;
1949             for (key = min; key <= max; key++) {
1950                 const STRLEN offset = (STRLEN)(key - start);
1951                 if (key >= end)
1952                     goto go_out_list;
1953                 s[offset >> 3] |= 1 << (offset & 7);
1954             }
1955         }
1956     } /* while */
1957   go_out_list:
1958
1959     /* read $swash->{EXTRAS} */
1960     x = (U8*)SvPV(*extssvp, xcur);
1961     xend = x + xcur;
1962     while (x < xend) {
1963         STRLEN namelen;
1964         U8 *namestr;
1965         SV** othersvp;
1966         HV* otherhv;
1967         STRLEN otherbits;
1968         SV **otherbitssvp, *other;
1969         U8 *s, *o, *nl;
1970         STRLEN slen, olen;
1971
1972         U8 opc = *x++;
1973         if (opc == '\n')
1974             continue;
1975
1976         nl = (U8*)memchr(x, '\n', xend - x);
1977
1978         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1979             if (nl) {
1980                 x = nl + 1; /* 1 is length of "\n" */
1981                 continue;
1982             }
1983             else {
1984                 x = xend; /* to EXTRAS' end at which \n is not found */
1985                 break;
1986             }
1987         }
1988
1989         namestr = x;
1990         if (nl) {
1991             namelen = nl - namestr;
1992             x = nl + 1;
1993         }
1994         else {
1995             namelen = xend - namestr;
1996             x = xend;
1997         }
1998
1999         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2000         otherhv = (HV*)SvRV(*othersvp);
2001         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2002         otherbits = (STRLEN)SvUV(*otherbitssvp);
2003         if (bits < otherbits)
2004             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2005
2006         /* The "other" swatch must be destroyed after. */
2007         other = swash_get(*othersvp, start, span);
2008         o = (U8*)SvPV(other, olen);
2009
2010         if (!olen)
2011             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2012
2013         s = (U8*)SvPV(swatch, slen);
2014         if (bits == 1 && otherbits == 1) {
2015             if (slen != olen)
2016                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2017
2018             switch (opc) {
2019             case '+':
2020                 while (slen--)
2021                     *s++ |= *o++;
2022                 break;
2023             case '!':
2024                 while (slen--)
2025                     *s++ |= ~*o++;
2026                 break;
2027             case '-':
2028                 while (slen--)
2029                     *s++ &= ~*o++;
2030                 break;
2031             case '&':
2032                 while (slen--)
2033                     *s++ &= *o++;
2034                 break;
2035             default:
2036                 break;
2037             }
2038         }
2039         else {
2040             STRLEN otheroctets = otherbits >> 3;
2041             STRLEN offset = 0;
2042             U8* send = s + slen;
2043
2044             while (s < send) {
2045                 UV otherval = 0;
2046
2047                 if (otherbits == 1) {
2048                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2049                     ++offset;
2050                 }
2051                 else {
2052                     STRLEN vlen = otheroctets;
2053                     otherval = *o++;
2054                     while (--vlen) {
2055                         otherval <<= 8;
2056                         otherval |= *o++;
2057                     }
2058                 }
2059
2060                 if (opc == '+' && otherval)
2061                     /*EMPTY*/;   /* replace with otherval */
2062                 else if (opc == '!' && !otherval)
2063                     otherval = 1;
2064                 else if (opc == '-' && otherval)
2065                     otherval = 0;
2066                 else if (opc == '&' && !otherval)
2067                     otherval = 0;
2068                 else {
2069                     s += octets; /* no replacement */
2070                     continue;
2071                 }
2072
2073                 if (bits == 8)
2074                     *s++ = (U8)( otherval & 0xff);
2075                 else if (bits == 16) {
2076                     *s++ = (U8)((otherval >>  8) & 0xff);
2077                     *s++ = (U8)( otherval        & 0xff);
2078                 }
2079                 else if (bits == 32) {
2080                     *s++ = (U8)((otherval >> 24) & 0xff);
2081                     *s++ = (U8)((otherval >> 16) & 0xff);
2082                     *s++ = (U8)((otherval >>  8) & 0xff);
2083                     *s++ = (U8)( otherval        & 0xff);
2084                 }
2085             }
2086         }
2087         sv_free(other); /* through with it! */
2088     } /* while */
2089     return swatch;
2090 }
2091
2092 /*
2093 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2094
2095 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2096 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2097 bytes available. The return value is the pointer to the byte after the
2098 end of the new character. In other words,
2099
2100     d = uvchr_to_utf8(d, uv);
2101
2102 is the recommended wide native character-aware way of saying
2103
2104     *(d++) = uv;
2105
2106 =cut
2107 */
2108
2109 /* On ASCII machines this is normally a macro but we want a
2110    real function in case XS code wants it
2111 */
2112 U8 *
2113 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2114 {
2115     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2116 }
2117
2118 U8 *
2119 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2120 {
2121     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2122 }
2123
2124 /*
2125 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2126 flags
2127
2128 Returns the native character value of the first character in the string
2129 C<s>
2130 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2131 length, in bytes, of that character.
2132
2133 Allows length and flags to be passed to low level routine.
2134
2135 =cut
2136 */
2137 /* On ASCII machines this is normally a macro but we want
2138    a real function in case XS code wants it
2139 */
2140 UV
2141 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2142 U32 flags)
2143 {
2144     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2145     return UNI_TO_NATIVE(uv);
2146 }
2147
2148 /*
2149 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2150
2151 Build to the scalar dsv a displayable version of the string spv,
2152 length len, the displayable version being at most pvlim bytes long
2153 (if longer, the rest is truncated and "..." will be appended).
2154
2155 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2156 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2157 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2158 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2159 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2160 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2161
2162 The pointer to the PV of the dsv is returned.
2163
2164 =cut */
2165 char *
2166 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2167 {
2168     int truncated = 0;
2169     const char *s, *e;
2170
2171     sv_setpvn(dsv, "", 0);
2172     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2173          UV u;
2174           /* This serves double duty as a flag and a character to print after
2175              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2176           */
2177          char ok = 0;
2178
2179          if (pvlim && SvCUR(dsv) >= pvlim) {
2180               truncated++;
2181               break;
2182          }
2183          u = utf8_to_uvchr((U8*)s, 0);
2184          if (u < 256) {
2185              const unsigned char c = (unsigned char)u & 0xFF;
2186              if (flags & UNI_DISPLAY_BACKSLASH) {
2187                  switch (c) {
2188                  case '\n':
2189                      ok = 'n'; break;
2190                  case '\r':
2191                      ok = 'r'; break;
2192                  case '\t':
2193                      ok = 't'; break;
2194                  case '\f':
2195                      ok = 'f'; break;
2196                  case '\a':
2197                      ok = 'a'; break;
2198                  case '\\':
2199                      ok = '\\'; break;
2200                  default: break;
2201                  }
2202                  if (ok) {
2203                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2204                  }
2205              }
2206              /* isPRINT() is the locale-blind version. */
2207              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2208                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2209                  ok = 1;
2210              }
2211          }
2212          if (!ok)
2213              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2214     }
2215     if (truncated)
2216          sv_catpvs(dsv, "...");
2217
2218     return SvPVX(dsv);
2219 }
2220
2221 /*
2222 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2223
2224 Build to the scalar dsv a displayable version of the scalar sv,
2225 the displayable version being at most pvlim bytes long
2226 (if longer, the rest is truncated and "..." will be appended).
2227
2228 The flags argument is as in pv_uni_display().
2229
2230 The pointer to the PV of the dsv is returned.
2231
2232 =cut */
2233 char *
2234 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2235 {
2236      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2237                                 SvCUR(ssv), pvlim, flags);
2238 }
2239
2240 /*
2241 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2242
2243 Return true if the strings s1 and s2 differ case-insensitively, false
2244 if not (if they are equal case-insensitively).  If u1 is true, the
2245 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2246 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2247 are false, the respective string is assumed to be in native 8-bit
2248 encoding.
2249
2250 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2251 in there (they will point at the beginning of the I<next> character).
2252 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2253 pointers beyond which scanning will not continue under any
2254 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2255 s2+l2 will be used as goal end pointers that will also stop the scan,
2256 and which qualify towards defining a successful match: all the scans
2257 that define an explicit length must reach their goal pointers for
2258 a match to succeed).
2259
2260 For case-insensitiveness, the "casefolding" of Unicode is used
2261 instead of upper/lowercasing both the characters, see
2262 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2263
2264 =cut */
2265 I32
2266 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2267 {
2268      dVAR;
2269      register const U8 *p1  = (const U8*)s1;
2270      register const U8 *p2  = (const U8*)s2;
2271      register const U8 *f1 = NULL;
2272      register const U8 *f2 = NULL;
2273      register U8 *e1 = NULL;
2274      register U8 *q1 = NULL;
2275      register U8 *e2 = NULL;
2276      register U8 *q2 = NULL;
2277      STRLEN n1 = 0, n2 = 0;
2278      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2279      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2280      U8 natbuf[1+1];
2281      STRLEN foldlen1, foldlen2;
2282      bool match;
2283
2284      if (pe1)
2285           e1 = *(U8**)pe1;
2286      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2287           f1 = (const U8*)s1 + l1;
2288      if (pe2)
2289           e2 = *(U8**)pe2;
2290      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2291           f2 = (const U8*)s2 + l2;
2292
2293      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2294           return 1; /* mismatch; possible infinite loop or false positive */
2295
2296      if (!u1 || !u2)
2297           natbuf[1] = 0; /* Need to terminate the buffer. */
2298
2299      while ((e1 == 0 || p1 < e1) &&
2300             (f1 == 0 || p1 < f1) &&
2301             (e2 == 0 || p2 < e2) &&
2302             (f2 == 0 || p2 < f2)) {
2303           if (n1 == 0) {
2304                if (u1)
2305                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2306                else {
2307                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2308                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2309                }
2310                q1 = foldbuf1;
2311                n1 = foldlen1;
2312           }
2313           if (n2 == 0) {
2314                if (u2)
2315                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2316                else {
2317                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2318                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2319                }
2320                q2 = foldbuf2;
2321                n2 = foldlen2;
2322           }
2323           while (n1 && n2) {
2324                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2325                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2326                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2327                    return 1; /* mismatch */
2328                n1 -= UTF8SKIP(q1);
2329                q1 += UTF8SKIP(q1);
2330                n2 -= UTF8SKIP(q2);
2331                q2 += UTF8SKIP(q2);
2332           }
2333           if (n1 == 0)
2334                p1 += u1 ? UTF8SKIP(p1) : 1;
2335           if (n2 == 0)
2336                p2 += u2 ? UTF8SKIP(p2) : 1;
2337
2338      }
2339
2340      /* A match is defined by all the scans that specified
2341       * an explicit length reaching their final goals. */
2342      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2343
2344      if (match) {
2345           if (pe1)
2346                *pe1 = (char*)p1;
2347           if (pe2)
2348                *pe2 = (char*)p2;
2349      }
2350
2351      return match ? 0 : 1; /* 0 match, 1 mismatch */
2352 }
2353
2354 /*
2355  * Local variables:
2356  * c-indentation-style: bsd
2357  * c-basic-offset: 4
2358  * indent-tabs-mode: t
2359  * End:
2360  *
2361  * ex: set ts=8 sts=4 sw=4 noet:
2362  */