utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (UNICODE_IS_SURROGATE(uv))
  50         Perl_croak(aTHX_ "UTF-16 surrogate 0x%04"UVxf, uv);
  51     else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
  52              (uv == 0xFFFE || uv == 0xFFFF))
  53         Perl_croak(aTHX_ "Unicode character 0x%04"UVxf" is illegal", uv);
  54     if (UNI_IS_INVARIANT(uv)) {
  55         *d++ = UTF_TO_NATIVE(uv);
  56         return d;
  57     }
  58 #if defined(EBCDIC)
  59     else {
  60         STRLEN len  = UNISKIP(uv);
  61         U8 *p = d+len-1;
  62         while (p > d) {
  63             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  64             uv >>= UTF_ACCUMULATION_SHIFT;
  65         }
  66         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  67         return d+len;
  68     }
  69 #else /* Non loop style */
  70     if (uv < 0x800) {
  71         *d++ = (( uv >>  6)         | 0xc0);
  72         *d++ = (( uv        & 0x3f) | 0x80);
  73         return d;
  74     }
  75     if (uv < 0x10000) {
  76         *d++ = (( uv >> 12)         | 0xe0);
  77         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  78         *d++ = (( uv        & 0x3f) | 0x80);
  79         return d;
  80     }
  81     if (uv < 0x200000) {
  82         *d++ = (( uv >> 18)         | 0xf0);
  83         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  84         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  85         *d++ = (( uv        & 0x3f) | 0x80);
  86         return d;
  87     }
  88     if (uv < 0x4000000) {
  89         *d++ = (( uv >> 24)         | 0xf8);
  90         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  91         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  92         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  93         *d++ = (( uv        & 0x3f) | 0x80);
  94         return d;
  95     }
  96     if (uv < 0x80000000) {
  97         *d++ = (( uv >> 30)         | 0xfc);
  98         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  99         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 100         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 101         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 102         *d++ = (( uv        & 0x3f) | 0x80);
 103         return d;
 104     }
 105 #ifdef HAS_QUAD
 106     if (uv < UTF8_QUAD_MAX)
 107 #endif
 108     {
 109         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 110         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 111         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 112         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 113         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 114         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 115         *d++ = (( uv        & 0x3f) | 0x80);
 116         return d;
 117     }
 118 #ifdef HAS_QUAD
 119     {
 120         *d++ =                        0xff;     /* Can't match U+FFFE! */
 121         *d++ =                        0x80;     /* 6 Reserved bits */
 122         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 123         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 124         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 125         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 126         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 127         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 128         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 129         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 130         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 131         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 132         *d++ = (( uv        & 0x3f) | 0x80);
 133         return d;
 134     }
 135 #endif
 136 #endif /* Loop style */
 137 }
 138
 139
 140
 141 /*
 142 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 143
 144 Tests if some arbitrary number of bytes begins in a valid UTF-8
 145 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 146 The actual number of bytes in the UTF-8 character will be returned if
 147 it is valid, otherwise 0.
 148
 149 =cut
 150 */
 151 STRLEN
 152 Perl_is_utf8_char(pTHX_ U8 *s)
 153 {
 154     U8 u = *s;
 155     STRLEN slen, len;
 156     UV uv, ouv;
 157
 158     if (UTF8_IS_INVARIANT(u))
 159         return 1;
 160
 161     if (!UTF8_IS_START(u))
 162         return 0;
 163
 164     len = UTF8SKIP(s);
 165
 166     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 167         return 0;
 168
 169     slen = len - 1;
 170     s++;
 171     u &= UTF_START_MASK(len);
 172     uv  = u;
 173     ouv = uv;
 174     while (slen--) {
 175         if (!UTF8_IS_CONTINUATION(*s))
 176             return 0;
 177         uv = UTF8_ACCUMULATE(uv, *s);
 178         if (uv < ouv)
 179             return 0;
 180         ouv = uv;
 181         s++;
 182     }
 183
 184     if (UNISKIP(uv) < len)
 185         return 0;
 186
 187     return len;
 188 }
 189
 190 /*
 191 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 192
 193 Returns true if first C<len> bytes of the given string form a valid UTF8
 194 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 195 'a string that contains UTF8' because a valid ASCII string is a valid
 196 UTF8 string.
 197
 198 =cut
 199 */
 200
 201 bool
 202 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 203 {
 204     U8* x = s;
 205     U8* send;
 206     STRLEN c;
 207
 208     if (!len)
 209         len = strlen((char *)s);
 210     send = s + len;
 211
 212     while (x < send) {
 213         c = is_utf8_char(x);
 214         if (!c)
 215             return FALSE;
 216         x += c;
 217     }
 218     if (x != send)
 219         return FALSE;
 220
 221     return TRUE;
 222 }
 223
 224 /*
 225 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 226
 227 Bottom level UTF-8 decode routine.
 228 Returns the unicode code point value of the first character in the string C<s>
 229 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 230 C<retlen> will be set to the length, in bytes, of that character.
 231
 232 If C<s> does not point to a well-formed UTF8 character, the behaviour
 233 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 234 it is assumed that the caller will raise a warning, and this function
 235 will silently just set C<retlen> to C<-1> and return zero.  If the
 236 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 237 malformations will be given, C<retlen> will be set to the expected
 238 length of the UTF-8 character in bytes, and zero will be returned.
 239
 240 The C<flags> can also contain various flags to allow deviations from
 241 the strict UTF-8 encoding (see F<utf8.h>).
 242
 243 Most code should use utf8_to_uvchr() rather than call this directly.
 244
 245 =cut
 246 */
 247
 248 UV
 249 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 250 {
 251     UV uv = *s, ouv = 0;
 252     STRLEN len = 1;
 253     bool dowarn = ckWARN_d(WARN_UTF8);
 254     STRLEN expectlen = 0;
 255     U32 warning = 0;
 256
 257 /* This list is a superset of the UTF8_ALLOW_XXX. */
 258
 259 #define UTF8_WARN_EMPTY                          1
 260 #define UTF8_WARN_CONTINUATION                   2
 261 #define UTF8_WARN_NON_CONTINUATION               3
 262 #define UTF8_WARN_FE_FF                          4
 263 #define UTF8_WARN_SHORT                          5
 264 #define UTF8_WARN_OVERFLOW                       6
 265 #define UTF8_WARN_SURROGATE                      7
 266 #define UTF8_WARN_BOM                            8
 267 #define UTF8_WARN_LONG                           9
 268 #define UTF8_WARN_FFFF                          10
 269
 270     if (curlen == 0 &&
 271         !(flags & UTF8_ALLOW_EMPTY)) {
 272         warning = UTF8_WARN_EMPTY;
 273         goto malformed;
 274     }
 275
 276     if (UTF8_IS_INVARIANT(uv)) {
 277         if (retlen)
 278             *retlen = 1;
 279         return (UV) (NATIVE_TO_UTF(*s));
 280     }
 281
 282     if (UTF8_IS_CONTINUATION(uv) &&
 283         !(flags & UTF8_ALLOW_CONTINUATION)) {
 284         warning = UTF8_WARN_CONTINUATION;
 285         goto malformed;
 286     }
 287
 288     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 289         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 290         warning = UTF8_WARN_NON_CONTINUATION;
 291         goto malformed;
 292     }
 293
 294 #ifdef EBCDIC
 295     uv = NATIVE_TO_UTF(uv);
 296 #else
 297     if ((uv == 0xfe || uv == 0xff) &&
 298         !(flags & UTF8_ALLOW_FE_FF)) {
 299         warning = UTF8_WARN_FE_FF;
 300         goto malformed;
 301     }
 302 #endif
 303
 304     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 305     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 306     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 307     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 308 #ifdef EBCDIC
 309     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 310     else                        { len =  7; uv &= 0x01; }
 311 #else
 312     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 313     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 314     else                        { len = 13; uv = 0; } /* whoa! */
 315 #endif
 316
 317     if (retlen)
 318         *retlen = len;
 319
 320     expectlen = len;
 321
 322     if ((curlen < expectlen) &&
 323         !(flags & UTF8_ALLOW_SHORT)) {
 324         warning = UTF8_WARN_SHORT;
 325         goto malformed;
 326     }
 327
 328     len--;
 329     s++;
 330     ouv = uv;
 331
 332     while (len--) {
 333         if (!UTF8_IS_CONTINUATION(*s) &&
 334             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 335             s--;
 336             warning = UTF8_WARN_NON_CONTINUATION;
 337             goto malformed;
 338         }
 339         else
 340             uv = UTF8_ACCUMULATE(uv, *s);
 341         if (!(uv > ouv)) {
 342             /* These cannot be allowed. */
 343             if (uv == ouv) {
 344                 if (!(flags & UTF8_ALLOW_LONG)) {
 345                     warning = UTF8_WARN_LONG;
 346                     goto malformed;
 347                 }
 348             }
 349             else { /* uv < ouv */
 350                 /* This cannot be allowed. */
 351                 warning = UTF8_WARN_OVERFLOW;
 352                 goto malformed;
 353             }
 354         }
 355         s++;
 356         ouv = uv;
 357     }
 358
 359     if (UNICODE_IS_SURROGATE(uv) &&
 360         !(flags & UTF8_ALLOW_SURROGATE)) {
 361         warning = UTF8_WARN_SURROGATE;
 362         goto malformed;
 363     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 364                !(flags & UTF8_ALLOW_BOM)) {
 365         warning = UTF8_WARN_BOM;
 366         goto malformed;
 367     } else if ((expectlen > UNISKIP(uv)) &&
 368                !(flags & UTF8_ALLOW_LONG)) {
 369         warning = UTF8_WARN_LONG;
 370         goto malformed;
 371     } else if (UNICODE_IS_ILLEGAL(uv) &&
 372                !(flags & UTF8_ALLOW_FFFF)) {
 373         warning = UTF8_WARN_FFFF;
 374         goto malformed;
 375     }
 376
 377     return uv;
 378
 379 malformed:
 380
 381     if (flags & UTF8_CHECK_ONLY) {
 382         if (retlen)
 383             *retlen = -1;
 384         return 0;
 385     }
 386
 387     if (dowarn) {
 388         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 389
 390         switch (warning) {
 391         case 0: /* Intentionally empty. */ break;
 392         case UTF8_WARN_EMPTY:
 393             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 394             break;
 395         case UTF8_WARN_CONTINUATION:
 396             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 397             break;
 398         case UTF8_WARN_NON_CONTINUATION:
 399             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 400                            (UV)s[1], uv);
 401             break;
 402         case UTF8_WARN_FE_FF:
 403             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 404             break;
 405         case UTF8_WARN_SHORT:
 406             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 407                            curlen, curlen == 1 ? "" : "s", expectlen);
 408             expectlen = curlen;         /* distance for caller to skip */
 409             break;
 410         case UTF8_WARN_OVERFLOW:
 411             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 412                            ouv, *s);
 413             break;
 414         case UTF8_WARN_SURROGATE:
 415             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 416             break;
 417         case UTF8_WARN_BOM:
 418             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 419             break;
 420         case UTF8_WARN_LONG:
 421             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 422                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 423             break;
 424         case UTF8_WARN_FFFF:
 425             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 426             break;
 427         default:
 428             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 429             break;
 430         }
 431
 432         if (warning) {
 433             char *s = SvPVX(sv);
 434
 435             if (PL_op)
 436                 Perl_warner(aTHX_ WARN_UTF8,
 437                             "%s in %s", s,  OP_DESC(PL_op));
 438             else
 439                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 440         }
 441     }
 442
 443     if (retlen)
 444         *retlen = expectlen ? expectlen : len;
 445
 446     return 0;
 447 }
 448
 449 /*
 450 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 451
 452 Returns the native character value of the first character in the string C<s>
 453 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 454 length, in bytes, of that character.
 455
 456 If C<s> does not point to a well-formed UTF8 character, zero is
 457 returned and retlen is set, if possible, to -1.
 458
 459 =cut
 460 */
 461
 462 UV
 463 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 464 {
 465     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 466 }
 467
 468 /*
 469 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 470
 471 Returns the Unicode code point of the first character in the string C<s>
 472 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 473 length, in bytes, of that character.
 474
 475 This function should only be used when returned UV is considered
 476 an index into the Unicode semantic tables (e.g. swashes).
 477
 478 If C<s> does not point to a well-formed UTF8 character, zero is
 479 returned and retlen is set, if possible, to -1.
 480
 481 =cut
 482 */
 483
 484 UV
 485 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 486 {
 487     /* Call the low level routine asking for checks */
 488     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 489 }
 490
 491 /*
 492 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 493
 494 Return the length of the UTF-8 char encoded string C<s> in characters.
 495 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 496 up past C<e>, croaks.
 497
 498 =cut
 499 */
 500
 501 STRLEN
 502 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 503 {
 504     STRLEN len = 0;
 505
 506     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 507      * the bitops (especially ~) can create illegal UTF-8.
 508      * In other words: in Perl UTF-8 is not just for Unicode. */
 509
 510     if (e < s)
 511         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 512     while (s < e) {
 513         U8 t = UTF8SKIP(s);
 514
 515         if (e - s < t)
 516             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 517         s += t;
 518         len++;
 519     }
 520
 521     return len;
 522 }
 523
 524 /*
 525 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 526
 527 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 528 and C<b>.
 529
 530 WARNING: use only if you *know* that the pointers point inside the
 531 same UTF-8 buffer.
 532
 533 =cut
 534 */
 535
 536 IV
 537 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 538 {
 539     IV off = 0;
 540
 541     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 542      * the bitops (especially ~) can create illegal UTF-8.
 543      * In other words: in Perl UTF-8 is not just for Unicode. */
 544
 545     if (a < b) {
 546         while (a < b) {
 547             U8 c = UTF8SKIP(a);
 548
 549             if (b - a < c)
 550                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 551             a += c;
 552             off--;
 553         }
 554     }
 555     else {
 556         while (b < a) {
 557             U8 c = UTF8SKIP(b);
 558
 559             if (a - b < c)
 560                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 561             b += c;
 562             off++;
 563         }
 564     }
 565
 566     return off;
 567 }
 568
 569 /*
 570 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 571
 572 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 573 forward or backward.
 574
 575 WARNING: do not use the following unless you *know* C<off> is within
 576 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 577 on the first byte of character or just after the last byte of a character.
 578
 579 =cut
 580 */
 581
 582 U8 *
 583 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 584 {
 585     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 586      * the bitops (especially ~) can create illegal UTF-8.
 587      * In other words: in Perl UTF-8 is not just for Unicode. */
 588
 589     if (off >= 0) {
 590         while (off--)
 591             s += UTF8SKIP(s);
 592     }
 593     else {
 594         while (off++) {
 595             s--;
 596             while (UTF8_IS_CONTINUATION(*s))
 597                 s--;
 598         }
 599     }
 600     return s;
 601 }
 602
 603 /*
 604 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 605
 606 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 607 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 608 updates len to contain the new length.
 609 Returns zero on failure, setting C<len> to -1.
 610
 611 =cut
 612 */
 613
 614 U8 *
 615 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 616 {
 617     U8 *send;
 618     U8 *d;
 619     U8 *save = s;
 620
 621     /* ensure valid UTF8 and chars < 256 before updating string */
 622     for (send = s + *len; s < send; ) {
 623         U8 c = *s++;
 624
 625         if (!UTF8_IS_INVARIANT(c) &&
 626             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 627              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 628             *len = -1;
 629             return 0;
 630         }
 631     }
 632
 633     d = s = save;
 634     while (s < send) {
 635         STRLEN ulen;
 636         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 637         s += ulen;
 638     }
 639     *d = '\0';
 640     *len = d - save;
 641     return save;
 642 }
 643
 644 /*
 645 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 646
 647 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 648 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 649 the newly-created string, and updates C<len> to contain the new
 650 length.  Returns the original string if no conversion occurs, C<len>
 651 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 652 0 if C<s> is converted or contains all 7bit characters.
 653
 654 =cut
 655 */
 656
 657 U8 *
 658 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 659 {
 660     U8 *d;
 661     U8 *start = s;
 662     U8 *send;
 663     I32 count = 0;
 664
 665     if (!*is_utf8)
 666         return start;
 667
 668     /* ensure valid UTF8 and chars < 256 before converting string */
 669     for (send = s + *len; s < send;) {
 670         U8 c = *s++;
 671         if (!UTF8_IS_INVARIANT(c)) {
 672             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 673                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 674                 count++;
 675             else
 676                 return start;
 677         }
 678     }
 679
 680     *is_utf8 = 0;
 681
 682     Newz(801, d, (*len) - count + 1, U8);
 683     s = start; start = d;
 684     while (s < send) {
 685         U8 c = *s++;
 686         if (!UTF8_IS_INVARIANT(c)) {
 687             /* Then it is two-byte encoded */
 688             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 689             c = ASCII_TO_NATIVE(c);
 690         }
 691         *d++ = c;
 692     }
 693     *d = '\0';
 694     *len = d - start;
 695     return start;
 696 }
 697
 698 /*
 699 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 700
 701 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 702 Returns a pointer to the newly-created string, and sets C<len> to
 703 reflect the new length.
 704
 705 =cut
 706 */
 707
 708 U8*
 709 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 710 {
 711     U8 *send;
 712     U8 *d;
 713     U8 *dst;
 714     send = s + (*len);
 715
 716     Newz(801, d, (*len) * 2 + 1, U8);
 717     dst = d;
 718
 719     while (s < send) {
 720         UV uv = NATIVE_TO_ASCII(*s++);
 721         if (UNI_IS_INVARIANT(uv))
 722             *d++ = UTF_TO_NATIVE(uv);
 723         else {
 724             *d++ = UTF8_EIGHT_BIT_HI(uv);
 725             *d++ = UTF8_EIGHT_BIT_LO(uv);
 726         }
 727     }
 728     *d = '\0';
 729     *len = d-dst;
 730     return dst;
 731 }
 732
 733 /*
 734  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 735  *
 736  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 737  * We optimize for native, for obvious reasons. */
 738
 739 U8*
 740 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 741 {
 742     U8* pend;
 743     U8* dstart = d;
 744
 745     if (bytelen & 1)
 746         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 747
 748     pend = p + bytelen;
 749
 750     while (p < pend) {
 751         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 752         p += 2;
 753         if (uv < 0x80) {
 754             *d++ = uv;
 755             continue;
 756         }
 757         if (uv < 0x800) {
 758             *d++ = (( uv >>  6)         | 0xc0);
 759             *d++ = (( uv        & 0x3f) | 0x80);
 760             continue;
 761         }
 762         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 763             UV low = *p++;
 764             if (low < 0xdc00 || low >= 0xdfff)
 765                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 766             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 767         }
 768         if (uv < 0x10000) {
 769             *d++ = (( uv >> 12)         | 0xe0);
 770             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 771             *d++ = (( uv        & 0x3f) | 0x80);
 772             continue;
 773         }
 774         else {
 775             *d++ = (( uv >> 18)         | 0xf0);
 776             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 777             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 778             *d++ = (( uv        & 0x3f) | 0x80);
 779             continue;
 780         }
 781     }
 782     *newlen = d - dstart;
 783     return d;
 784 }
 785
 786 /* Note: this one is slightly destructive of the source. */
 787
 788 U8*
 789 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 790 {
 791     U8* s = (U8*)p;
 792     U8* send = s + bytelen;
 793     while (s < send) {
 794         U8 tmp = s[0];
 795         s[0] = s[1];
 796         s[1] = tmp;
 797         s += 2;
 798     }
 799     return utf16_to_utf8(p, d, bytelen, newlen);
 800 }
 801
 802 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 803
 804 bool
 805 Perl_is_uni_alnum(pTHX_ UV c)
 806 {
 807     U8 tmpbuf[UTF8_MAXLEN+1];
 808     uvchr_to_utf8(tmpbuf, (UV)c);
 809     return is_utf8_alnum(tmpbuf);
 810 }
 811
 812 bool
 813 Perl_is_uni_alnumc(pTHX_ UV c)
 814 {
 815     U8 tmpbuf[UTF8_MAXLEN+1];
 816     uvchr_to_utf8(tmpbuf, (UV)c);
 817     return is_utf8_alnumc(tmpbuf);
 818 }
 819
 820 bool
 821 Perl_is_uni_idfirst(pTHX_ UV c)
 822 {
 823     U8 tmpbuf[UTF8_MAXLEN+1];
 824     uvchr_to_utf8(tmpbuf, (UV)c);
 825     return is_utf8_idfirst(tmpbuf);
 826 }
 827
 828 bool
 829 Perl_is_uni_alpha(pTHX_ UV c)
 830 {
 831     U8 tmpbuf[UTF8_MAXLEN+1];
 832     uvchr_to_utf8(tmpbuf, (UV)c);
 833     return is_utf8_alpha(tmpbuf);
 834 }
 835
 836 bool
 837 Perl_is_uni_ascii(pTHX_ UV c)
 838 {
 839     U8 tmpbuf[UTF8_MAXLEN+1];
 840     uvchr_to_utf8(tmpbuf, (UV)c);
 841     return is_utf8_ascii(tmpbuf);
 842 }
 843
 844 bool
 845 Perl_is_uni_space(pTHX_ UV c)
 846 {
 847     U8 tmpbuf[UTF8_MAXLEN+1];
 848     uvchr_to_utf8(tmpbuf, (UV)c);
 849     return is_utf8_space(tmpbuf);
 850 }
 851
 852 bool
 853 Perl_is_uni_digit(pTHX_ UV c)
 854 {
 855     U8 tmpbuf[UTF8_MAXLEN+1];
 856     uvchr_to_utf8(tmpbuf, (UV)c);
 857     return is_utf8_digit(tmpbuf);
 858 }
 859
 860 bool
 861 Perl_is_uni_upper(pTHX_ UV c)
 862 {
 863     U8 tmpbuf[UTF8_MAXLEN+1];
 864     uvchr_to_utf8(tmpbuf, (UV)c);
 865     return is_utf8_upper(tmpbuf);
 866 }
 867
 868 bool
 869 Perl_is_uni_lower(pTHX_ UV c)
 870 {
 871     U8 tmpbuf[UTF8_MAXLEN+1];
 872     uvchr_to_utf8(tmpbuf, (UV)c);
 873     return is_utf8_lower(tmpbuf);
 874 }
 875
 876 bool
 877 Perl_is_uni_cntrl(pTHX_ UV c)
 878 {
 879     U8 tmpbuf[UTF8_MAXLEN+1];
 880     uvchr_to_utf8(tmpbuf, (UV)c);
 881     return is_utf8_cntrl(tmpbuf);
 882 }
 883
 884 bool
 885 Perl_is_uni_graph(pTHX_ UV c)
 886 {
 887     U8 tmpbuf[UTF8_MAXLEN+1];
 888     uvchr_to_utf8(tmpbuf, (UV)c);
 889     return is_utf8_graph(tmpbuf);
 890 }
 891
 892 bool
 893 Perl_is_uni_print(pTHX_ UV c)
 894 {
 895     U8 tmpbuf[UTF8_MAXLEN+1];
 896     uvchr_to_utf8(tmpbuf, (UV)c);
 897     return is_utf8_print(tmpbuf);
 898 }
 899
 900 bool
 901 Perl_is_uni_punct(pTHX_ UV c)
 902 {
 903     U8 tmpbuf[UTF8_MAXLEN+1];
 904     uvchr_to_utf8(tmpbuf, (UV)c);
 905     return is_utf8_punct(tmpbuf);
 906 }
 907
 908 bool
 909 Perl_is_uni_xdigit(pTHX_ UV c)
 910 {
 911     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 912     uvchr_to_utf8(tmpbuf, (UV)c);
 913     return is_utf8_xdigit(tmpbuf);
 914 }
 915
 916 UV
 917 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 918 {
 919     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 920     uvchr_to_utf8(tmpbuf, (UV)c);
 921     return to_utf8_upper(tmpbuf, p, lenp);
 922 }
 923
 924 UV
 925 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 926 {
 927     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 928     uvchr_to_utf8(tmpbuf, (UV)c);
 929     return to_utf8_title(tmpbuf, p, lenp);
 930 }
 931
 932 UV
 933 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 934 {
 935     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 936     uvchr_to_utf8(tmpbuf, (UV)c);
 937     return to_utf8_lower(tmpbuf, p, lenp);
 938 }
 939
 940 UV
 941 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
 942 {
 943     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
 944     uvchr_to_utf8(tmpbuf, (UV)c);
 945     return to_utf8_fold(tmpbuf, p, lenp);
 946 }
 947
 948 /* for now these all assume no locale info available for Unicode > 255 */
 949
 950 bool
 951 Perl_is_uni_alnum_lc(pTHX_ UV c)
 952 {
 953     return is_uni_alnum(c);     /* XXX no locale support yet */
 954 }
 955
 956 bool
 957 Perl_is_uni_alnumc_lc(pTHX_ UV c)
 958 {
 959     return is_uni_alnumc(c);    /* XXX no locale support yet */
 960 }
 961
 962 bool
 963 Perl_is_uni_idfirst_lc(pTHX_ UV c)
 964 {
 965     return is_uni_idfirst(c);   /* XXX no locale support yet */
 966 }
 967
 968 bool
 969 Perl_is_uni_alpha_lc(pTHX_ UV c)
 970 {
 971     return is_uni_alpha(c);     /* XXX no locale support yet */
 972 }
 973
 974 bool
 975 Perl_is_uni_ascii_lc(pTHX_ UV c)
 976 {
 977     return is_uni_ascii(c);     /* XXX no locale support yet */
 978 }
 979
 980 bool
 981 Perl_is_uni_space_lc(pTHX_ UV c)
 982 {
 983     return is_uni_space(c);     /* XXX no locale support yet */
 984 }
 985
 986 bool
 987 Perl_is_uni_digit_lc(pTHX_ UV c)
 988 {
 989     return is_uni_digit(c);     /* XXX no locale support yet */
 990 }
 991
 992 bool
 993 Perl_is_uni_upper_lc(pTHX_ UV c)
 994 {
 995     return is_uni_upper(c);     /* XXX no locale support yet */
 996 }
 997
 998 bool
 999 Perl_is_uni_lower_lc(pTHX_ UV c)
1000 {
1001     return is_uni_lower(c);     /* XXX no locale support yet */
1002 }
1003
1004 bool
1005 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1006 {
1007     return is_uni_cntrl(c);     /* XXX no locale support yet */
1008 }
1009
1010 bool
1011 Perl_is_uni_graph_lc(pTHX_ UV c)
1012 {
1013     return is_uni_graph(c);     /* XXX no locale support yet */
1014 }
1015
1016 bool
1017 Perl_is_uni_print_lc(pTHX_ UV c)
1018 {
1019     return is_uni_print(c);     /* XXX no locale support yet */
1020 }
1021
1022 bool
1023 Perl_is_uni_punct_lc(pTHX_ UV c)
1024 {
1025     return is_uni_punct(c);     /* XXX no locale support yet */
1026 }
1027
1028 bool
1029 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1030 {
1031     return is_uni_xdigit(c);    /* XXX no locale support yet */
1032 }
1033
1034 bool
1035 Perl_is_utf8_alnum(pTHX_ U8 *p)
1036 {
1037     if (!is_utf8_char(p))
1038         return FALSE;
1039     if (!PL_utf8_alnum)
1040         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1041          * descendant of isalnum(3), in other words, it doesn't
1042          * contain the '_'. --jhi */
1043         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1044     return swash_fetch(PL_utf8_alnum, p, TRUE);
1045 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1046 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1047     if (!PL_utf8_alnum)
1048         PL_utf8_alnum = swash_init("utf8", "",
1049             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1050     return swash_fetch(PL_utf8_alnum, p, TRUE);
1051 #endif
1052 }
1053
1054 bool
1055 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1056 {
1057     if (!is_utf8_char(p))
1058         return FALSE;
1059     if (!PL_utf8_alnum)
1060         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1061     return swash_fetch(PL_utf8_alnum, p, TRUE);
1062 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1063 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1064     if (!PL_utf8_alnum)
1065         PL_utf8_alnum = swash_init("utf8", "",
1066             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1067     return swash_fetch(PL_utf8_alnum, p, TRUE);
1068 #endif
1069 }
1070
1071 bool
1072 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1073 {
1074     return *p == '_' || is_utf8_alpha(p);
1075 }
1076
1077 bool
1078 Perl_is_utf8_alpha(pTHX_ U8 *p)
1079 {
1080     if (!is_utf8_char(p))
1081         return FALSE;
1082     if (!PL_utf8_alpha)
1083         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1084     return swash_fetch(PL_utf8_alpha, p, TRUE);
1085 }
1086
1087 bool
1088 Perl_is_utf8_ascii(pTHX_ U8 *p)
1089 {
1090     if (!is_utf8_char(p))
1091         return FALSE;
1092     if (!PL_utf8_ascii)
1093         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1094     return swash_fetch(PL_utf8_ascii, p, TRUE);
1095 }
1096
1097 bool
1098 Perl_is_utf8_space(pTHX_ U8 *p)
1099 {
1100     if (!is_utf8_char(p))
1101         return FALSE;
1102     if (!PL_utf8_space)
1103         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1104     return swash_fetch(PL_utf8_space, p, TRUE);
1105 }
1106
1107 bool
1108 Perl_is_utf8_digit(pTHX_ U8 *p)
1109 {
1110     if (!is_utf8_char(p))
1111         return FALSE;
1112     if (!PL_utf8_digit)
1113         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1114     return swash_fetch(PL_utf8_digit, p, TRUE);
1115 }
1116
1117 bool
1118 Perl_is_utf8_upper(pTHX_ U8 *p)
1119 {
1120     if (!is_utf8_char(p))
1121         return FALSE;
1122     if (!PL_utf8_upper)
1123         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1124     return swash_fetch(PL_utf8_upper, p, TRUE);
1125 }
1126
1127 bool
1128 Perl_is_utf8_lower(pTHX_ U8 *p)
1129 {
1130     if (!is_utf8_char(p))
1131         return FALSE;
1132     if (!PL_utf8_lower)
1133         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1134     return swash_fetch(PL_utf8_lower, p, TRUE);
1135 }
1136
1137 bool
1138 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1139 {
1140     if (!is_utf8_char(p))
1141         return FALSE;
1142     if (!PL_utf8_cntrl)
1143         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1144     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1145 }
1146
1147 bool
1148 Perl_is_utf8_graph(pTHX_ U8 *p)
1149 {
1150     if (!is_utf8_char(p))
1151         return FALSE;
1152     if (!PL_utf8_graph)
1153         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1154     return swash_fetch(PL_utf8_graph, p, TRUE);
1155 }
1156
1157 bool
1158 Perl_is_utf8_print(pTHX_ U8 *p)
1159 {
1160     if (!is_utf8_char(p))
1161         return FALSE;
1162     if (!PL_utf8_print)
1163         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1164     return swash_fetch(PL_utf8_print, p, TRUE);
1165 }
1166
1167 bool
1168 Perl_is_utf8_punct(pTHX_ U8 *p)
1169 {
1170     if (!is_utf8_char(p))
1171         return FALSE;
1172     if (!PL_utf8_punct)
1173         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1174     return swash_fetch(PL_utf8_punct, p, TRUE);
1175 }
1176
1177 bool
1178 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1179 {
1180     if (!is_utf8_char(p))
1181         return FALSE;
1182     if (!PL_utf8_xdigit)
1183         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1184     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1185 }
1186
1187 bool
1188 Perl_is_utf8_mark(pTHX_ U8 *p)
1189 {
1190     if (!is_utf8_char(p))
1191         return FALSE;
1192     if (!PL_utf8_mark)
1193         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1194     return swash_fetch(PL_utf8_mark, p, TRUE);
1195 }
1196
1197 /*
1198 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1199
1200 The "p" contains the pointer to the UTF-8 string encoding
1201 the character that is being converted.
1202
1203 The "ustrp" is a pointer to the character buffer to put the
1204 conversion result to.  The "lenp" is a pointer to the length
1205 of the result.
1206
1207 The "swash" is a pointer to the swash to use.
1208
1209 The "normal" is a string like "ToLower" which means the swash
1210 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1211 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1212
1213 The "special" is a string like "utf8::ToSpecLower", which means
1214 the hash %utf8::ToSpecLower, which is stored in the same file,
1215 lib/unicore/To/Lower.pl, and also loaded by SWASHGET.  The access
1216 to the hash is by Perl_to_utf8_case().
1217
1218 =cut
1219  */
1220
1221 UV
1222 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
1223 {
1224     UV uv;
1225
1226     if (!*swashp)
1227         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1228     uv = swash_fetch(*swashp, p, TRUE);
1229     if (uv)
1230          uv = UNI_TO_NATIVE(uv);
1231     else {
1232          HV *hv;
1233          SV *keysv;
1234          HE *he;
1235
1236          uv = utf8_to_uvchr(p, 0);
1237
1238          if ((hv    = get_hv(special, FALSE)) &&
1239              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1240              (he    = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1241               SV *val = HeVAL(he);
1242               char *s = SvPV(val, *lenp);
1243               U8 c = *(U8*)s;
1244               if (*lenp > 1 || UNI_IS_INVARIANT(c))
1245                    Copy(s, ustrp, *lenp, U8);
1246               else {
1247                    /* something in the 0x80..0xFF range */
1248                    ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1249                    ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1250                    *lenp = 2;
1251               }
1252               return 0;
1253          }
1254     }
1255     if (lenp)
1256        *lenp = UNISKIP(uv);
1257     uvuni_to_utf8(ustrp, uv);
1258     return uv;
1259 }
1260
1261 UV
1262 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1263 {
1264     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1265                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1266 }
1267
1268 UV
1269 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1270 {
1271     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1272                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1273 }
1274
1275 UV
1276 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1277 {
1278     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1279                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1280 }
1281
1282 UV
1283 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1284 {
1285     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1286                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1287 }
1288
1289 /* a "swash" is a swatch hash */
1290
1291 SV*
1292 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1293 {
1294     SV* retval;
1295     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1296     dSP;
1297     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1298     SV* errsv_save;
1299
1300     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1301         ENTER;
1302         errsv_save = newSVsv(ERRSV);
1303         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1304         if (!SvTRUE(ERRSV))
1305             sv_setsv(ERRSV, errsv_save);
1306         SvREFCNT_dec(errsv_save);
1307         LEAVE;
1308     }
1309     SPAGAIN;
1310     PUSHSTACKi(PERLSI_MAGIC);
1311     PUSHMARK(SP);
1312     EXTEND(SP,5);
1313     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1314     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1315     PUSHs(listsv);
1316     PUSHs(sv_2mortal(newSViv(minbits)));
1317     PUSHs(sv_2mortal(newSViv(none)));
1318     PUTBACK;
1319     ENTER;
1320     SAVEI32(PL_hints);
1321     PL_hints = 0;
1322     save_re_context();
1323     if (PL_curcop == &PL_compiling)
1324         /* XXX ought to be handled by lex_start */
1325         sv_setpv(tokenbufsv, PL_tokenbuf);
1326     errsv_save = newSVsv(ERRSV);
1327     if (call_method("SWASHNEW", G_SCALAR))
1328         retval = newSVsv(*PL_stack_sp--);
1329     else
1330         retval = &PL_sv_undef;
1331     if (!SvTRUE(ERRSV))
1332         sv_setsv(ERRSV, errsv_save);
1333     SvREFCNT_dec(errsv_save);
1334     LEAVE;
1335     POPSTACK;
1336     if (PL_curcop == &PL_compiling) {
1337         STRLEN len;
1338         char* pv = SvPV(tokenbufsv, len);
1339
1340         Copy(pv, PL_tokenbuf, len+1, char);
1341         PL_curcop->op_private = PL_hints;
1342     }
1343     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1344         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1345     return retval;
1346 }
1347
1348
1349 /* This API is wrong for special case conversions since we may need to
1350  * return several Unicode characters for a single Unicode character
1351  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1352  * the lower-level routine, and it is similarly broken for returning
1353  * multiple values.  --jhi */
1354 UV
1355 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1356 {
1357     HV* hv = (HV*)SvRV(sv);
1358     U32 klen;
1359     U32 off;
1360     STRLEN slen;
1361     STRLEN needents;
1362     U8 *tmps = NULL;
1363     U32 bit;
1364     SV *retval;
1365     U8 tmputf8[2];
1366     UV c = NATIVE_TO_ASCII(*ptr);
1367
1368     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1369         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1370         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1371         ptr = tmputf8;
1372     }
1373     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1374      * then the "swatch" is a vec() for al the chars which start
1375      * with 0xAA..0xYY
1376      * So the key in the hash (klen) is length of encoded char -1
1377      */
1378     klen = UTF8SKIP(ptr) - 1;
1379     off  = ptr[klen];
1380
1381     if (klen == 0)
1382      {
1383       /* If char in invariant then swatch is for all the invariant chars
1384        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1385        */
1386       needents = UTF_CONTINUATION_MARK;
1387       off      = NATIVE_TO_UTF(ptr[klen]);
1388      }
1389     else
1390      {
1391       /* If char is encoded then swatch is for the prefix */
1392       needents = (1 << UTF_ACCUMULATION_SHIFT);
1393       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1394      }
1395
1396     /*
1397      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1398      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1399      * it's nothing to sniff at.)  Pity we usually come through at least
1400      * two function calls to get here...
1401      *
1402      * NB: this code assumes that swatches are never modified, once generated!
1403      */
1404
1405     if (hv   == PL_last_swash_hv &&
1406         klen == PL_last_swash_klen &&
1407         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1408     {
1409         tmps = PL_last_swash_tmps;
1410         slen = PL_last_swash_slen;
1411     }
1412     else {
1413         /* Try our second-level swatch cache, kept in a hash. */
1414         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1415
1416         /* If not cached, generate it via utf8::SWASHGET */
1417         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1418             dSP;
1419             /* We use utf8n_to_uvuni() as we want an index into
1420                Unicode tables, not a native character number.
1421              */
1422             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1423             SV *errsv_save;
1424             ENTER;
1425             SAVETMPS;
1426             save_re_context();
1427             PUSHSTACKi(PERLSI_MAGIC);
1428             PUSHMARK(SP);
1429             EXTEND(SP,3);
1430             PUSHs((SV*)sv);
1431             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1432             PUSHs(sv_2mortal(newSViv((klen) ?
1433                                      (code_point & ~(needents - 1)) : 0)));
1434             PUSHs(sv_2mortal(newSViv(needents)));
1435             PUTBACK;
1436             errsv_save = newSVsv(ERRSV);
1437             if (call_method("SWASHGET", G_SCALAR))
1438                 retval = newSVsv(*PL_stack_sp--);
1439             else
1440                 retval = &PL_sv_undef;
1441             if (!SvTRUE(ERRSV))
1442                 sv_setsv(ERRSV, errsv_save);
1443             SvREFCNT_dec(errsv_save);
1444             POPSTACK;
1445             FREETMPS;
1446             LEAVE;
1447             if (PL_curcop == &PL_compiling)
1448                 PL_curcop->op_private = PL_hints;
1449
1450             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1451
1452             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1453                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1454         }
1455
1456         PL_last_swash_hv = hv;
1457         PL_last_swash_klen = klen;
1458         PL_last_swash_tmps = tmps;
1459         PL_last_swash_slen = slen;
1460         if (klen)
1461             Copy(ptr, PL_last_swash_key, klen, U8);
1462     }
1463
1464     switch ((int)((slen << 3) / needents)) {
1465     case 1:
1466         bit = 1 << (off & 7);
1467         off >>= 3;
1468         return (tmps[off] & bit) != 0;
1469     case 8:
1470         return tmps[off];
1471     case 16:
1472         off <<= 1;
1473         return (tmps[off] << 8) + tmps[off + 1] ;
1474     case 32:
1475         off <<= 2;
1476         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1477     }
1478     Perl_croak(aTHX_ "panic: swash_fetch");
1479     return 0;
1480 }
1481
1482
1483 /*
1484 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1485
1486 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1487 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1488 bytes available. The return value is the pointer to the byte after the
1489 end of the new character. In other words,
1490
1491     d = uvchr_to_utf8(d, uv);
1492
1493 is the recommended wide native character-aware way of saying
1494
1495     *(d++) = uv;
1496
1497 =cut
1498 */
1499
1500 /* On ASCII machines this is normally a macro but we want a
1501    real function in case XS code wants it
1502 */
1503 #undef Perl_uvchr_to_utf8
1504 U8 *
1505 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1506 {
1507     return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1508 }
1509
1510
1511 /*
1512 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1513
1514 Returns the native character value of the first character in the string C<s>
1515 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1516 length, in bytes, of that character.
1517
1518 Allows length and flags to be passed to low level routine.
1519
1520 =cut
1521 */
1522 /* On ASCII machines this is normally a macro but we want a
1523    real function in case XS code wants it
1524 */
1525 #undef Perl_utf8n_to_uvchr
1526 UV
1527 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1528 {
1529     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1530     return UNI_TO_NATIVE(uv);
1531 }
1532
1533 /*
1534 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1535
1536 Build to the scalar dsv a displayable version of the string spv,
1537 length len, the displayable version being at most pvlim bytes long
1538 (if longer, the rest is truncated and "..." will be appended).
1539 The flags argument is currently unused but available for future extensions.
1540 The pointer to the PV of the dsv is returned.
1541
1542 =cut */
1543 char *
1544 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1545 {
1546     int truncated = 0;
1547     char *s, *e;
1548
1549     sv_setpvn(dsv, "", 0);
1550     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1551          UV u;
1552          if (pvlim && SvCUR(dsv) >= pvlim) {
1553               truncated++;
1554               break;
1555          }
1556          u = utf8_to_uvchr((U8*)s, 0);
1557          Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1558     }
1559     if (truncated)
1560          sv_catpvn(dsv, "...", 3);
1561
1562     return SvPVX(dsv);
1563 }
1564
1565 /*
1566 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1567
1568 Build to the scalar dsv a displayable version of the scalar sv,
1569 he displayable version being at most pvlim bytes long
1570 (if longer, the rest is truncated and "..." will be appended).
1571 The flags argument is currently unused but available for future extensions.
1572 The pointer to the PV of the dsv is returned.
1573
1574 =cut */
1575 char *
1576 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1577 {
1578      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1579                                 pvlim, flags);
1580 }
1581
1582 /*
1583 =for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1584
1585 Return true if the strings s1 and s2 differ case-insensitively, false
1586 if not (if they are equal case-insensitively).  If u1 is true, the
1587 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1588 the string s2 is assumed to be in UTF-8-encoded Unicode.
1589
1590 For case-insensitiveness, the "casefolding" of Unicode is used
1591 instead of upper/lowercasing both the characters, see
1592 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1593
1594 =cut */
1595 I32
1596 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1597 {
1598      register U8 *a  = (U8*)s1;
1599      register U8 *b  = (U8*)s2;
1600      register U8 *ae = b + len1;
1601      register U8 *be = b + len2;
1602      STRLEN la, lb;
1603      UV ca, cb;
1604      STRLEN ulen1, ulen2;
1605      U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1606      U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
1607
1608      while (a < ae && b < be) {
1609           if (u1) {
1610                if (a + UTF8SKIP(a) > ae)
1611                     break;
1612                ca = utf8_to_uvchr((U8*)a, &la);
1613           } else {
1614                ca = *a;
1615                la = 1;
1616           }
1617           if (u2) {
1618                if (b + UTF8SKIP(b) > be)
1619                     break;
1620                cb = utf8_to_uvchr((U8*)b, &lb);
1621           } else {
1622                cb = *b;
1623                lb = 1;
1624           }
1625           if (ca != cb) {
1626                if (u1)
1627                     to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1628                else
1629                     ulen1 = 1;
1630                if (u2)
1631                     to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1632                else
1633                     ulen2 = 1;
1634                if (ulen1 != ulen2
1635                    || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1636                    || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1637                     return 1; /* mismatch */
1638           }
1639           a += la;
1640           b += lb;
1641      }
1642      return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */
1643 }
1644