utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (UNI_IS_INVARIANT(uv)) {
  50         *d++ = UTF_TO_NATIVE(uv);
  51         return d;
  52     }
  53     if (UNICODE_IS_SURROGATE(uv))
  54         Perl_croak(aTHX_ "UTF-16 surrogate 0x%04"UVxf, uv);
  55 #if defined(EBCDIC)
  56     else {
  57         STRLEN len  = UNISKIP(uv);
  58         U8 *p = d+len-1;
  59         while (p > d) {
  60             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  61             uv >>= UTF_ACCUMULATION_SHIFT;
  62         }
  63         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  64         return d+len;
  65     }
  66 #else /* Non loop style */
  67     if (uv < 0x800) {
  68         *d++ = (( uv >>  6)         | 0xc0);
  69         *d++ = (( uv        & 0x3f) | 0x80);
  70         return d;
  71     }
  72     if (uv < 0x10000) {
  73         *d++ = (( uv >> 12)         | 0xe0);
  74         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  75         *d++ = (( uv        & 0x3f) | 0x80);
  76         return d;
  77     }
  78     if (uv < 0x200000) {
  79         *d++ = (( uv >> 18)         | 0xf0);
  80         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  81         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  82         *d++ = (( uv        & 0x3f) | 0x80);
  83         return d;
  84     }
  85     if (uv < 0x4000000) {
  86         *d++ = (( uv >> 24)         | 0xf8);
  87         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  88         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  89         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  90         *d++ = (( uv        & 0x3f) | 0x80);
  91         return d;
  92     }
  93     if (uv < 0x80000000) {
  94         *d++ = (( uv >> 30)         | 0xfc);
  95         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  96         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  97         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  98         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  99         *d++ = (( uv        & 0x3f) | 0x80);
 100         return d;
 101     }
 102 #ifdef HAS_QUAD
 103     if (uv < UTF8_QUAD_MAX)
 104 #endif
 105     {
 106         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 107         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 108         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 109         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 110         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 111         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 112         *d++ = (( uv        & 0x3f) | 0x80);
 113         return d;
 114     }
 115 #ifdef HAS_QUAD
 116     {
 117         *d++ =                        0xff;     /* Can't match U+FFFE! */
 118         *d++ =                        0x80;     /* 6 Reserved bits */
 119         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 120         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 121         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 122         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 123         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 124         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 125         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 126         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 127         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 128         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 129         *d++ = (( uv        & 0x3f) | 0x80);
 130         return d;
 131     }
 132 #endif
 133 #endif /* Loop style */
 134 }
 135
 136
 137
 138 /*
 139 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 140
 141 Tests if some arbitrary number of bytes begins in a valid UTF-8
 142 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 143 The actual number of bytes in the UTF-8 character will be returned if
 144 it is valid, otherwise 0.
 145
 146 =cut
 147 */
 148 STRLEN
 149 Perl_is_utf8_char(pTHX_ U8 *s)
 150 {
 151     U8 u = *s;
 152     STRLEN slen, len;
 153     UV uv, ouv;
 154
 155     if (UTF8_IS_INVARIANT(u))
 156         return 1;
 157
 158     if (!UTF8_IS_START(u))
 159         return 0;
 160
 161     len = UTF8SKIP(s);
 162
 163     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 164         return 0;
 165
 166     slen = len - 1;
 167     s++;
 168     u &= UTF_START_MASK(len);
 169     uv  = u;
 170     ouv = uv;
 171     while (slen--) {
 172         if (!UTF8_IS_CONTINUATION(*s))
 173             return 0;
 174         uv = UTF8_ACCUMULATE(uv, *s);
 175         if (uv < ouv)
 176             return 0;
 177         ouv = uv;
 178         s++;
 179     }
 180
 181     if (UNISKIP(uv) < len)
 182         return 0;
 183
 184     return len;
 185 }
 186
 187 /*
 188 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 189
 190 Returns true if first C<len> bytes of the given string form a valid UTF8
 191 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 192 'a string that contains UTF8' because a valid ASCII string is a valid
 193 UTF8 string.
 194
 195 =cut
 196 */
 197
 198 bool
 199 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 200 {
 201     U8* x = s;
 202     U8* send;
 203     STRLEN c;
 204
 205     if (!len)
 206         len = strlen((char *)s);
 207     send = s + len;
 208
 209     while (x < send) {
 210         c = is_utf8_char(x);
 211         if (!c)
 212             return FALSE;
 213         x += c;
 214     }
 215     if (x != send)
 216         return FALSE;
 217
 218     return TRUE;
 219 }
 220
 221 /*
 222 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 223
 224 Bottom level UTF-8 decode routine.
 225 Returns the unicode code point value of the first character in the string C<s>
 226 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 227 C<retlen> will be set to the length, in bytes, of that character.
 228
 229 If C<s> does not point to a well-formed UTF8 character, the behaviour
 230 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 231 it is assumed that the caller will raise a warning, and this function
 232 will silently just set C<retlen> to C<-1> and return zero.  If the
 233 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 234 malformations will be given, C<retlen> will be set to the expected
 235 length of the UTF-8 character in bytes, and zero will be returned.
 236
 237 The C<flags> can also contain various flags to allow deviations from
 238 the strict UTF-8 encoding (see F<utf8.h>).
 239
 240 Most code should use utf8_to_uvchr() rather than call this directly.
 241
 242 =cut
 243 */
 244
 245 UV
 246 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 247 {
 248     UV uv = *s, ouv = 0;
 249     STRLEN len = 1;
 250     bool dowarn = ckWARN_d(WARN_UTF8);
 251     STRLEN expectlen = 0;
 252     U32 warning = 0;
 253
 254 /* This list is a superset of the UTF8_ALLOW_XXX. */
 255
 256 #define UTF8_WARN_EMPTY                          1
 257 #define UTF8_WARN_CONTINUATION                   2
 258 #define UTF8_WARN_NON_CONTINUATION               3
 259 #define UTF8_WARN_FE_FF                          4
 260 #define UTF8_WARN_SHORT                          5
 261 #define UTF8_WARN_OVERFLOW                       6
 262 #define UTF8_WARN_SURROGATE                      7
 263 #define UTF8_WARN_BOM                            8
 264 #define UTF8_WARN_LONG                           9
 265 #define UTF8_WARN_FFFF                          10
 266
 267     if (curlen == 0 &&
 268         !(flags & UTF8_ALLOW_EMPTY)) {
 269         warning = UTF8_WARN_EMPTY;
 270         goto malformed;
 271     }
 272
 273     if (UTF8_IS_INVARIANT(uv)) {
 274         if (retlen)
 275             *retlen = 1;
 276         return (UV) (NATIVE_TO_UTF(*s));
 277     }
 278
 279     if (UTF8_IS_CONTINUATION(uv) &&
 280         !(flags & UTF8_ALLOW_CONTINUATION)) {
 281         warning = UTF8_WARN_CONTINUATION;
 282         goto malformed;
 283     }
 284
 285     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 286         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 287         warning = UTF8_WARN_NON_CONTINUATION;
 288         goto malformed;
 289     }
 290
 291 #ifdef EBCDIC
 292     uv = NATIVE_TO_UTF(uv);
 293 #else
 294     if ((uv == 0xfe || uv == 0xff) &&
 295         !(flags & UTF8_ALLOW_FE_FF)) {
 296         warning = UTF8_WARN_FE_FF;
 297         goto malformed;
 298     }
 299 #endif
 300
 301     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 302     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 303     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 304     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 305 #ifdef EBCDIC
 306     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 307     else                        { len =  7; uv &= 0x01; }
 308 #else
 309     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 310     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 311     else                        { len = 13; uv = 0; } /* whoa! */
 312 #endif
 313
 314     if (retlen)
 315         *retlen = len;
 316
 317     expectlen = len;
 318
 319     if ((curlen < expectlen) &&
 320         !(flags & UTF8_ALLOW_SHORT)) {
 321         warning = UTF8_WARN_SHORT;
 322         goto malformed;
 323     }
 324
 325     len--;
 326     s++;
 327     ouv = uv;
 328
 329     while (len--) {
 330         if (!UTF8_IS_CONTINUATION(*s) &&
 331             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 332             s--;
 333             warning = UTF8_WARN_NON_CONTINUATION;
 334             goto malformed;
 335         }
 336         else
 337             uv = UTF8_ACCUMULATE(uv, *s);
 338         if (!(uv > ouv)) {
 339             /* These cannot be allowed. */
 340             if (uv == ouv) {
 341                 if (!(flags & UTF8_ALLOW_LONG)) {
 342                     warning = UTF8_WARN_LONG;
 343                     goto malformed;
 344                 }
 345             }
 346             else { /* uv < ouv */
 347                 /* This cannot be allowed. */
 348                 warning = UTF8_WARN_OVERFLOW;
 349                 goto malformed;
 350             }
 351         }
 352         s++;
 353         ouv = uv;
 354     }
 355
 356     if (UNICODE_IS_SURROGATE(uv) &&
 357         !(flags & UTF8_ALLOW_SURROGATE)) {
 358         warning = UTF8_WARN_SURROGATE;
 359         goto malformed;
 360     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 361                !(flags & UTF8_ALLOW_BOM)) {
 362         warning = UTF8_WARN_BOM;
 363         goto malformed;
 364     } else if ((expectlen > UNISKIP(uv)) &&
 365                !(flags & UTF8_ALLOW_LONG)) {
 366         warning = UTF8_WARN_LONG;
 367         goto malformed;
 368     } else if (UNICODE_IS_ILLEGAL(uv) &&
 369                !(flags & UTF8_ALLOW_FFFF)) {
 370         warning = UTF8_WARN_FFFF;
 371         goto malformed;
 372     }
 373
 374     return uv;
 375
 376 malformed:
 377
 378     if (flags & UTF8_CHECK_ONLY) {
 379         if (retlen)
 380             *retlen = -1;
 381         return 0;
 382     }
 383
 384     if (dowarn) {
 385         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 386
 387         switch (warning) {
 388         case 0: /* Intentionally empty. */ break;
 389         case UTF8_WARN_EMPTY:
 390             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 391             break;
 392         case UTF8_WARN_CONTINUATION:
 393             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 394             break;
 395         case UTF8_WARN_NON_CONTINUATION:
 396             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 397                            (UV)s[1], uv);
 398             break;
 399         case UTF8_WARN_FE_FF:
 400             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 401             break;
 402         case UTF8_WARN_SHORT:
 403             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 404                            curlen, curlen == 1 ? "" : "s", expectlen);
 405             expectlen = curlen;         /* distance for caller to skip */
 406             break;
 407         case UTF8_WARN_OVERFLOW:
 408             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 409                            ouv, *s);
 410             break;
 411         case UTF8_WARN_SURROGATE:
 412             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 413             break;
 414         case UTF8_WARN_BOM:
 415             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 416             break;
 417         case UTF8_WARN_LONG:
 418             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 419                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 420             break;
 421         case UTF8_WARN_FFFF:
 422             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 423             break;
 424         default:
 425             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 426             break;
 427         }
 428
 429         if (warning) {
 430             char *s = SvPVX(sv);
 431
 432             if (PL_op)
 433                 Perl_warner(aTHX_ WARN_UTF8,
 434                             "%s in %s", s,  OP_DESC(PL_op));
 435             else
 436                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 437         }
 438     }
 439
 440     if (retlen)
 441         *retlen = expectlen ? expectlen : len;
 442
 443     return 0;
 444 }
 445
 446 /*
 447 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 448
 449 Returns the native character value of the first character in the string C<s>
 450 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 451 length, in bytes, of that character.
 452
 453 If C<s> does not point to a well-formed UTF8 character, zero is
 454 returned and retlen is set, if possible, to -1.
 455
 456 =cut
 457 */
 458
 459 UV
 460 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 461 {
 462     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 463 }
 464
 465 /*
 466 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 467
 468 Returns the Unicode code point of the first character in the string C<s>
 469 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 470 length, in bytes, of that character.
 471
 472 This function should only be used when returned UV is considered
 473 an index into the Unicode semantic tables (e.g. swashes).
 474
 475 If C<s> does not point to a well-formed UTF8 character, zero is
 476 returned and retlen is set, if possible, to -1.
 477
 478 =cut
 479 */
 480
 481 UV
 482 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 483 {
 484     /* Call the low level routine asking for checks */
 485     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 486 }
 487
 488 /*
 489 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 490
 491 Return the length of the UTF-8 char encoded string C<s> in characters.
 492 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 493 up past C<e>, croaks.
 494
 495 =cut
 496 */
 497
 498 STRLEN
 499 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 500 {
 501     STRLEN len = 0;
 502
 503     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 504      * the bitops (especially ~) can create illegal UTF-8.
 505      * In other words: in Perl UTF-8 is not just for Unicode. */
 506
 507     if (e < s)
 508         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 509     while (s < e) {
 510         U8 t = UTF8SKIP(s);
 511
 512         if (e - s < t)
 513             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 514         s += t;
 515         len++;
 516     }
 517
 518     return len;
 519 }
 520
 521 /*
 522 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 523
 524 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 525 and C<b>.
 526
 527 WARNING: use only if you *know* that the pointers point inside the
 528 same UTF-8 buffer.
 529
 530 =cut
 531 */
 532
 533 IV
 534 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 535 {
 536     IV off = 0;
 537
 538     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 539      * the bitops (especially ~) can create illegal UTF-8.
 540      * In other words: in Perl UTF-8 is not just for Unicode. */
 541
 542     if (a < b) {
 543         while (a < b) {
 544             U8 c = UTF8SKIP(a);
 545
 546             if (b - a < c)
 547                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 548             a += c;
 549             off--;
 550         }
 551     }
 552     else {
 553         while (b < a) {
 554             U8 c = UTF8SKIP(b);
 555
 556             if (a - b < c)
 557                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 558             b += c;
 559             off++;
 560         }
 561     }
 562
 563     return off;
 564 }
 565
 566 /*
 567 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 568
 569 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 570 forward or backward.
 571
 572 WARNING: do not use the following unless you *know* C<off> is within
 573 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 574 on the first byte of character or just after the last byte of a character.
 575
 576 =cut
 577 */
 578
 579 U8 *
 580 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 581 {
 582     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 583      * the bitops (especially ~) can create illegal UTF-8.
 584      * In other words: in Perl UTF-8 is not just for Unicode. */
 585
 586     if (off >= 0) {
 587         while (off--)
 588             s += UTF8SKIP(s);
 589     }
 590     else {
 591         while (off++) {
 592             s--;
 593             while (UTF8_IS_CONTINUATION(*s))
 594                 s--;
 595         }
 596     }
 597     return s;
 598 }
 599
 600 /*
 601 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 602
 603 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 604 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 605 updates len to contain the new length.
 606 Returns zero on failure, setting C<len> to -1.
 607
 608 =cut
 609 */
 610
 611 U8 *
 612 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 613 {
 614     U8 *send;
 615     U8 *d;
 616     U8 *save = s;
 617
 618     /* ensure valid UTF8 and chars < 256 before updating string */
 619     for (send = s + *len; s < send; ) {
 620         U8 c = *s++;
 621
 622         if (!UTF8_IS_INVARIANT(c) &&
 623             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 624              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 625             *len = -1;
 626             return 0;
 627         }
 628     }
 629
 630     d = s = save;
 631     while (s < send) {
 632         STRLEN ulen;
 633         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 634         s += ulen;
 635     }
 636     *d = '\0';
 637     *len = d - save;
 638     return save;
 639 }
 640
 641 /*
 642 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 643
 644 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 645 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 646 the newly-created string, and updates C<len> to contain the new
 647 length.  Returns the original string if no conversion occurs, C<len>
 648 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 649 0 if C<s> is converted or contains all 7bit characters.
 650
 651 =cut
 652 */
 653
 654 U8 *
 655 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 656 {
 657     U8 *d;
 658     U8 *start = s;
 659     U8 *send;
 660     I32 count = 0;
 661
 662     if (!*is_utf8)
 663         return start;
 664
 665     /* ensure valid UTF8 and chars < 256 before converting string */
 666     for (send = s + *len; s < send;) {
 667         U8 c = *s++;
 668         if (!UTF8_IS_INVARIANT(c)) {
 669             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 670                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 671                 count++;
 672             else
 673                 return start;
 674         }
 675     }
 676
 677     *is_utf8 = 0;
 678
 679     Newz(801, d, (*len) - count + 1, U8);
 680     s = start; start = d;
 681     while (s < send) {
 682         U8 c = *s++;
 683         if (!UTF8_IS_INVARIANT(c)) {
 684             /* Then it is two-byte encoded */
 685             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 686             c = ASCII_TO_NATIVE(c);
 687         }
 688         *d++ = c;
 689     }
 690     *d = '\0';
 691     *len = d - start;
 692     return start;
 693 }
 694
 695 /*
 696 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 697
 698 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 699 Returns a pointer to the newly-created string, and sets C<len> to
 700 reflect the new length.
 701
 702 =cut
 703 */
 704
 705 U8*
 706 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 707 {
 708     U8 *send;
 709     U8 *d;
 710     U8 *dst;
 711     send = s + (*len);
 712
 713     Newz(801, d, (*len) * 2 + 1, U8);
 714     dst = d;
 715
 716     while (s < send) {
 717         UV uv = NATIVE_TO_ASCII(*s++);
 718         if (UNI_IS_INVARIANT(uv))
 719             *d++ = UTF_TO_NATIVE(uv);
 720         else {
 721             *d++ = UTF8_EIGHT_BIT_HI(uv);
 722             *d++ = UTF8_EIGHT_BIT_LO(uv);
 723         }
 724     }
 725     *d = '\0';
 726     *len = d-dst;
 727     return dst;
 728 }
 729
 730 /*
 731  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 732  *
 733  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 734  * We optimize for native, for obvious reasons. */
 735
 736 U8*
 737 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 738 {
 739     U8* pend;
 740     U8* dstart = d;
 741
 742     if (bytelen & 1)
 743         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 744
 745     pend = p + bytelen;
 746
 747     while (p < pend) {
 748         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 749         p += 2;
 750         if (uv < 0x80) {
 751             *d++ = uv;
 752             continue;
 753         }
 754         if (uv < 0x800) {
 755             *d++ = (( uv >>  6)         | 0xc0);
 756             *d++ = (( uv        & 0x3f) | 0x80);
 757             continue;
 758         }
 759         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 760             UV low = *p++;
 761             if (low < 0xdc00 || low >= 0xdfff)
 762                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 763             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 764         }
 765         if (uv < 0x10000) {
 766             *d++ = (( uv >> 12)         | 0xe0);
 767             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 768             *d++ = (( uv        & 0x3f) | 0x80);
 769             continue;
 770         }
 771         else {
 772             *d++ = (( uv >> 18)         | 0xf0);
 773             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 774             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 775             *d++ = (( uv        & 0x3f) | 0x80);
 776             continue;
 777         }
 778     }
 779     *newlen = d - dstart;
 780     return d;
 781 }
 782
 783 /* Note: this one is slightly destructive of the source. */
 784
 785 U8*
 786 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 787 {
 788     U8* s = (U8*)p;
 789     U8* send = s + bytelen;
 790     while (s < send) {
 791         U8 tmp = s[0];
 792         s[0] = s[1];
 793         s[1] = tmp;
 794         s += 2;
 795     }
 796     return utf16_to_utf8(p, d, bytelen, newlen);
 797 }
 798
 799 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 800
 801 bool
 802 Perl_is_uni_alnum(pTHX_ UV c)
 803 {
 804     U8 tmpbuf[UTF8_MAXLEN+1];
 805     uvchr_to_utf8(tmpbuf, (UV)c);
 806     return is_utf8_alnum(tmpbuf);
 807 }
 808
 809 bool
 810 Perl_is_uni_alnumc(pTHX_ UV c)
 811 {
 812     U8 tmpbuf[UTF8_MAXLEN+1];
 813     uvchr_to_utf8(tmpbuf, (UV)c);
 814     return is_utf8_alnumc(tmpbuf);
 815 }
 816
 817 bool
 818 Perl_is_uni_idfirst(pTHX_ UV c)
 819 {
 820     U8 tmpbuf[UTF8_MAXLEN+1];
 821     uvchr_to_utf8(tmpbuf, (UV)c);
 822     return is_utf8_idfirst(tmpbuf);
 823 }
 824
 825 bool
 826 Perl_is_uni_alpha(pTHX_ UV c)
 827 {
 828     U8 tmpbuf[UTF8_MAXLEN+1];
 829     uvchr_to_utf8(tmpbuf, (UV)c);
 830     return is_utf8_alpha(tmpbuf);
 831 }
 832
 833 bool
 834 Perl_is_uni_ascii(pTHX_ UV c)
 835 {
 836     U8 tmpbuf[UTF8_MAXLEN+1];
 837     uvchr_to_utf8(tmpbuf, (UV)c);
 838     return is_utf8_ascii(tmpbuf);
 839 }
 840
 841 bool
 842 Perl_is_uni_space(pTHX_ UV c)
 843 {
 844     U8 tmpbuf[UTF8_MAXLEN+1];
 845     uvchr_to_utf8(tmpbuf, (UV)c);
 846     return is_utf8_space(tmpbuf);
 847 }
 848
 849 bool
 850 Perl_is_uni_digit(pTHX_ UV c)
 851 {
 852     U8 tmpbuf[UTF8_MAXLEN+1];
 853     uvchr_to_utf8(tmpbuf, (UV)c);
 854     return is_utf8_digit(tmpbuf);
 855 }
 856
 857 bool
 858 Perl_is_uni_upper(pTHX_ UV c)
 859 {
 860     U8 tmpbuf[UTF8_MAXLEN+1];
 861     uvchr_to_utf8(tmpbuf, (UV)c);
 862     return is_utf8_upper(tmpbuf);
 863 }
 864
 865 bool
 866 Perl_is_uni_lower(pTHX_ UV c)
 867 {
 868     U8 tmpbuf[UTF8_MAXLEN+1];
 869     uvchr_to_utf8(tmpbuf, (UV)c);
 870     return is_utf8_lower(tmpbuf);
 871 }
 872
 873 bool
 874 Perl_is_uni_cntrl(pTHX_ UV c)
 875 {
 876     U8 tmpbuf[UTF8_MAXLEN+1];
 877     uvchr_to_utf8(tmpbuf, (UV)c);
 878     return is_utf8_cntrl(tmpbuf);
 879 }
 880
 881 bool
 882 Perl_is_uni_graph(pTHX_ UV c)
 883 {
 884     U8 tmpbuf[UTF8_MAXLEN+1];
 885     uvchr_to_utf8(tmpbuf, (UV)c);
 886     return is_utf8_graph(tmpbuf);
 887 }
 888
 889 bool
 890 Perl_is_uni_print(pTHX_ UV c)
 891 {
 892     U8 tmpbuf[UTF8_MAXLEN+1];
 893     uvchr_to_utf8(tmpbuf, (UV)c);
 894     return is_utf8_print(tmpbuf);
 895 }
 896
 897 bool
 898 Perl_is_uni_punct(pTHX_ UV c)
 899 {
 900     U8 tmpbuf[UTF8_MAXLEN+1];
 901     uvchr_to_utf8(tmpbuf, (UV)c);
 902     return is_utf8_punct(tmpbuf);
 903 }
 904
 905 bool
 906 Perl_is_uni_xdigit(pTHX_ UV c)
 907 {
 908     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 909     uvchr_to_utf8(tmpbuf, (UV)c);
 910     return is_utf8_xdigit(tmpbuf);
 911 }
 912
 913 UV
 914 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 915 {
 916     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 917     uvchr_to_utf8(tmpbuf, (UV)c);
 918     return to_utf8_upper(tmpbuf, p, lenp);
 919 }
 920
 921 UV
 922 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 923 {
 924     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 925     uvchr_to_utf8(tmpbuf, (UV)c);
 926     return to_utf8_title(tmpbuf, p, lenp);
 927 }
 928
 929 UV
 930 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 931 {
 932     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 933     uvchr_to_utf8(tmpbuf, (UV)c);
 934     return to_utf8_lower(tmpbuf, p, lenp);
 935 }
 936
 937 UV
 938 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
 939 {
 940     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
 941     uvchr_to_utf8(tmpbuf, (UV)c);
 942     return to_utf8_fold(tmpbuf, p, lenp);
 943 }
 944
 945 /* for now these all assume no locale info available for Unicode > 255 */
 946
 947 bool
 948 Perl_is_uni_alnum_lc(pTHX_ UV c)
 949 {
 950     return is_uni_alnum(c);     /* XXX no locale support yet */
 951 }
 952
 953 bool
 954 Perl_is_uni_alnumc_lc(pTHX_ UV c)
 955 {
 956     return is_uni_alnumc(c);    /* XXX no locale support yet */
 957 }
 958
 959 bool
 960 Perl_is_uni_idfirst_lc(pTHX_ UV c)
 961 {
 962     return is_uni_idfirst(c);   /* XXX no locale support yet */
 963 }
 964
 965 bool
 966 Perl_is_uni_alpha_lc(pTHX_ UV c)
 967 {
 968     return is_uni_alpha(c);     /* XXX no locale support yet */
 969 }
 970
 971 bool
 972 Perl_is_uni_ascii_lc(pTHX_ UV c)
 973 {
 974     return is_uni_ascii(c);     /* XXX no locale support yet */
 975 }
 976
 977 bool
 978 Perl_is_uni_space_lc(pTHX_ UV c)
 979 {
 980     return is_uni_space(c);     /* XXX no locale support yet */
 981 }
 982
 983 bool
 984 Perl_is_uni_digit_lc(pTHX_ UV c)
 985 {
 986     return is_uni_digit(c);     /* XXX no locale support yet */
 987 }
 988
 989 bool
 990 Perl_is_uni_upper_lc(pTHX_ UV c)
 991 {
 992     return is_uni_upper(c);     /* XXX no locale support yet */
 993 }
 994
 995 bool
 996 Perl_is_uni_lower_lc(pTHX_ UV c)
 997 {
 998     return is_uni_lower(c);     /* XXX no locale support yet */
 999 }
1000
1001 bool
1002 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1003 {
1004     return is_uni_cntrl(c);     /* XXX no locale support yet */
1005 }
1006
1007 bool
1008 Perl_is_uni_graph_lc(pTHX_ UV c)
1009 {
1010     return is_uni_graph(c);     /* XXX no locale support yet */
1011 }
1012
1013 bool
1014 Perl_is_uni_print_lc(pTHX_ UV c)
1015 {
1016     return is_uni_print(c);     /* XXX no locale support yet */
1017 }
1018
1019 bool
1020 Perl_is_uni_punct_lc(pTHX_ UV c)
1021 {
1022     return is_uni_punct(c);     /* XXX no locale support yet */
1023 }
1024
1025 bool
1026 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1027 {
1028     return is_uni_xdigit(c);    /* XXX no locale support yet */
1029 }
1030
1031 bool
1032 Perl_is_utf8_alnum(pTHX_ U8 *p)
1033 {
1034     if (!is_utf8_char(p))
1035         return FALSE;
1036     if (!PL_utf8_alnum)
1037         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1038          * descendant of isalnum(3), in other words, it doesn't
1039          * contain the '_'. --jhi */
1040         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1041     return swash_fetch(PL_utf8_alnum, p, TRUE);
1042 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1043 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1044     if (!PL_utf8_alnum)
1045         PL_utf8_alnum = swash_init("utf8", "",
1046             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1047     return swash_fetch(PL_utf8_alnum, p, TRUE);
1048 #endif
1049 }
1050
1051 bool
1052 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1053 {
1054     if (!is_utf8_char(p))
1055         return FALSE;
1056     if (!PL_utf8_alnum)
1057         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1058     return swash_fetch(PL_utf8_alnum, p, TRUE);
1059 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1060 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1061     if (!PL_utf8_alnum)
1062         PL_utf8_alnum = swash_init("utf8", "",
1063             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1064     return swash_fetch(PL_utf8_alnum, p, TRUE);
1065 #endif
1066 }
1067
1068 bool
1069 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1070 {
1071     return *p == '_' || is_utf8_alpha(p);
1072 }
1073
1074 bool
1075 Perl_is_utf8_alpha(pTHX_ U8 *p)
1076 {
1077     if (!is_utf8_char(p))
1078         return FALSE;
1079     if (!PL_utf8_alpha)
1080         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1081     return swash_fetch(PL_utf8_alpha, p, TRUE);
1082 }
1083
1084 bool
1085 Perl_is_utf8_ascii(pTHX_ U8 *p)
1086 {
1087     if (!is_utf8_char(p))
1088         return FALSE;
1089     if (!PL_utf8_ascii)
1090         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1091     return swash_fetch(PL_utf8_ascii, p, TRUE);
1092 }
1093
1094 bool
1095 Perl_is_utf8_space(pTHX_ U8 *p)
1096 {
1097     if (!is_utf8_char(p))
1098         return FALSE;
1099     if (!PL_utf8_space)
1100         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1101     return swash_fetch(PL_utf8_space, p, TRUE);
1102 }
1103
1104 bool
1105 Perl_is_utf8_digit(pTHX_ U8 *p)
1106 {
1107     if (!is_utf8_char(p))
1108         return FALSE;
1109     if (!PL_utf8_digit)
1110         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1111     return swash_fetch(PL_utf8_digit, p, TRUE);
1112 }
1113
1114 bool
1115 Perl_is_utf8_upper(pTHX_ U8 *p)
1116 {
1117     if (!is_utf8_char(p))
1118         return FALSE;
1119     if (!PL_utf8_upper)
1120         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1121     return swash_fetch(PL_utf8_upper, p, TRUE);
1122 }
1123
1124 bool
1125 Perl_is_utf8_lower(pTHX_ U8 *p)
1126 {
1127     if (!is_utf8_char(p))
1128         return FALSE;
1129     if (!PL_utf8_lower)
1130         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1131     return swash_fetch(PL_utf8_lower, p, TRUE);
1132 }
1133
1134 bool
1135 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1136 {
1137     if (!is_utf8_char(p))
1138         return FALSE;
1139     if (!PL_utf8_cntrl)
1140         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1141     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1142 }
1143
1144 bool
1145 Perl_is_utf8_graph(pTHX_ U8 *p)
1146 {
1147     if (!is_utf8_char(p))
1148         return FALSE;
1149     if (!PL_utf8_graph)
1150         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1151     return swash_fetch(PL_utf8_graph, p, TRUE);
1152 }
1153
1154 bool
1155 Perl_is_utf8_print(pTHX_ U8 *p)
1156 {
1157     if (!is_utf8_char(p))
1158         return FALSE;
1159     if (!PL_utf8_print)
1160         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1161     return swash_fetch(PL_utf8_print, p, TRUE);
1162 }
1163
1164 bool
1165 Perl_is_utf8_punct(pTHX_ U8 *p)
1166 {
1167     if (!is_utf8_char(p))
1168         return FALSE;
1169     if (!PL_utf8_punct)
1170         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1171     return swash_fetch(PL_utf8_punct, p, TRUE);
1172 }
1173
1174 bool
1175 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1176 {
1177     if (!is_utf8_char(p))
1178         return FALSE;
1179     if (!PL_utf8_xdigit)
1180         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1181     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1182 }
1183
1184 bool
1185 Perl_is_utf8_mark(pTHX_ U8 *p)
1186 {
1187     if (!is_utf8_char(p))
1188         return FALSE;
1189     if (!PL_utf8_mark)
1190         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1191     return swash_fetch(PL_utf8_mark, p, TRUE);
1192 }
1193
1194 /*
1195 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1196
1197 The "p" contains the pointer to the UTF-8 string encoding
1198 the character that is being converted.
1199
1200 The "ustrp" is a pointer to the character buffer to put the
1201 conversion result to.  The "lenp" is a pointer to the length
1202 of the result.
1203
1204 The "swash" is a pointer to the swash to use.
1205
1206 The "normal" is a string like "ToLower" which means the swash
1207 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1208 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1209
1210 The "special" is a string like "utf8::ToSpecLower", which means
1211 the hash %utf8::ToSpecLower, which is stored in the same file,
1212 lib/unicore/To/Lower.pl, and also loaded by SWASHGET.  The access
1213 to the hash is by Perl_to_utf8_case().
1214
1215 =cut
1216  */
1217
1218 UV
1219 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
1220 {
1221     UV uv;
1222
1223     if (!*swashp)
1224         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1225     uv = swash_fetch(*swashp, p, TRUE);
1226     if (uv)
1227          uv = UNI_TO_NATIVE(uv);
1228     else {
1229          HV *hv;
1230          SV *keysv;
1231          HE *he;
1232
1233          uv = utf8_to_uvchr(p, 0);
1234
1235          if ((hv    = get_hv(special, FALSE)) &&
1236              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1237              (he    = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1238               SV *val = HeVAL(he);
1239               char *s = SvPV(val, *lenp);
1240               U8 c = *(U8*)s;
1241               if (*lenp > 1 || UNI_IS_INVARIANT(c))
1242                    Copy(s, ustrp, *lenp, U8);
1243               else {
1244                    /* something in the 0x80..0xFF range */
1245                    ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1246                    ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1247                    *lenp = 2;
1248               }
1249               return 0;
1250          }
1251     }
1252     if (lenp)
1253        *lenp = UNISKIP(uv);
1254     uvuni_to_utf8(ustrp, uv);
1255     return uv;
1256 }
1257
1258 UV
1259 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1260 {
1261     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1262                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1263 }
1264
1265 UV
1266 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1267 {
1268     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1269                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1270 }
1271
1272 UV
1273 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1274 {
1275     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1276                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1277 }
1278
1279 UV
1280 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1281 {
1282     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1283                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1284 }
1285
1286 /* a "swash" is a swatch hash */
1287
1288 SV*
1289 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1290 {
1291     SV* retval;
1292     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1293     dSP;
1294     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1295     SV* errsv_save;
1296
1297     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1298         ENTER;
1299         errsv_save = newSVsv(ERRSV);
1300         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1301         if (!SvTRUE(ERRSV))
1302             sv_setsv(ERRSV, errsv_save);
1303         SvREFCNT_dec(errsv_save);
1304         LEAVE;
1305     }
1306     SPAGAIN;
1307     PUSHSTACKi(PERLSI_MAGIC);
1308     PUSHMARK(SP);
1309     EXTEND(SP,5);
1310     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1311     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1312     PUSHs(listsv);
1313     PUSHs(sv_2mortal(newSViv(minbits)));
1314     PUSHs(sv_2mortal(newSViv(none)));
1315     PUTBACK;
1316     ENTER;
1317     SAVEI32(PL_hints);
1318     PL_hints = 0;
1319     save_re_context();
1320     if (PL_curcop == &PL_compiling)
1321         /* XXX ought to be handled by lex_start */
1322         sv_setpv(tokenbufsv, PL_tokenbuf);
1323     errsv_save = newSVsv(ERRSV);
1324     if (call_method("SWASHNEW", G_SCALAR))
1325         retval = newSVsv(*PL_stack_sp--);
1326     else
1327         retval = &PL_sv_undef;
1328     if (!SvTRUE(ERRSV))
1329         sv_setsv(ERRSV, errsv_save);
1330     SvREFCNT_dec(errsv_save);
1331     LEAVE;
1332     POPSTACK;
1333     if (PL_curcop == &PL_compiling) {
1334         STRLEN len;
1335         char* pv = SvPV(tokenbufsv, len);
1336
1337         Copy(pv, PL_tokenbuf, len+1, char);
1338         PL_curcop->op_private = PL_hints;
1339     }
1340     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1341         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1342     return retval;
1343 }
1344
1345
1346 /* This API is wrong for special case conversions since we may need to
1347  * return several Unicode characters for a single Unicode character
1348  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1349  * the lower-level routine, and it is similarly broken for returning
1350  * multiple values.  --jhi */
1351 UV
1352 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1353 {
1354     HV* hv = (HV*)SvRV(sv);
1355     U32 klen;
1356     U32 off;
1357     STRLEN slen;
1358     STRLEN needents;
1359     U8 *tmps = NULL;
1360     U32 bit;
1361     SV *retval;
1362     U8 tmputf8[2];
1363     UV c = NATIVE_TO_ASCII(*ptr);
1364
1365     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1366         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1367         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1368         ptr = tmputf8;
1369     }
1370     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1371      * then the "swatch" is a vec() for al the chars which start
1372      * with 0xAA..0xYY
1373      * So the key in the hash (klen) is length of encoded char -1
1374      */
1375     klen = UTF8SKIP(ptr) - 1;
1376     off  = ptr[klen];
1377
1378     if (klen == 0)
1379      {
1380       /* If char in invariant then swatch is for all the invariant chars
1381        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1382        */
1383       needents = UTF_CONTINUATION_MARK;
1384       off      = NATIVE_TO_UTF(ptr[klen]);
1385      }
1386     else
1387      {
1388       /* If char is encoded then swatch is for the prefix */
1389       needents = (1 << UTF_ACCUMULATION_SHIFT);
1390       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1391      }
1392
1393     /*
1394      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1395      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1396      * it's nothing to sniff at.)  Pity we usually come through at least
1397      * two function calls to get here...
1398      *
1399      * NB: this code assumes that swatches are never modified, once generated!
1400      */
1401
1402     if (hv   == PL_last_swash_hv &&
1403         klen == PL_last_swash_klen &&
1404         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1405     {
1406         tmps = PL_last_swash_tmps;
1407         slen = PL_last_swash_slen;
1408     }
1409     else {
1410         /* Try our second-level swatch cache, kept in a hash. */
1411         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1412
1413         /* If not cached, generate it via utf8::SWASHGET */
1414         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1415             dSP;
1416             /* We use utf8n_to_uvuni() as we want an index into
1417                Unicode tables, not a native character number.
1418              */
1419             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1420             SV *errsv_save;
1421             ENTER;
1422             SAVETMPS;
1423             save_re_context();
1424             PUSHSTACKi(PERLSI_MAGIC);
1425             PUSHMARK(SP);
1426             EXTEND(SP,3);
1427             PUSHs((SV*)sv);
1428             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1429             PUSHs(sv_2mortal(newSViv((klen) ?
1430                                      (code_point & ~(needents - 1)) : 0)));
1431             PUSHs(sv_2mortal(newSViv(needents)));
1432             PUTBACK;
1433             errsv_save = newSVsv(ERRSV);
1434             if (call_method("SWASHGET", G_SCALAR))
1435                 retval = newSVsv(*PL_stack_sp--);
1436             else
1437                 retval = &PL_sv_undef;
1438             if (!SvTRUE(ERRSV))
1439                 sv_setsv(ERRSV, errsv_save);
1440             SvREFCNT_dec(errsv_save);
1441             POPSTACK;
1442             FREETMPS;
1443             LEAVE;
1444             if (PL_curcop == &PL_compiling)
1445                 PL_curcop->op_private = PL_hints;
1446
1447             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1448
1449             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1450                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1451         }
1452
1453         PL_last_swash_hv = hv;
1454         PL_last_swash_klen = klen;
1455         PL_last_swash_tmps = tmps;
1456         PL_last_swash_slen = slen;
1457         if (klen)
1458             Copy(ptr, PL_last_swash_key, klen, U8);
1459     }
1460
1461     switch ((int)((slen << 3) / needents)) {
1462     case 1:
1463         bit = 1 << (off & 7);
1464         off >>= 3;
1465         return (tmps[off] & bit) != 0;
1466     case 8:
1467         return tmps[off];
1468     case 16:
1469         off <<= 1;
1470         return (tmps[off] << 8) + tmps[off + 1] ;
1471     case 32:
1472         off <<= 2;
1473         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1474     }
1475     Perl_croak(aTHX_ "panic: swash_fetch");
1476     return 0;
1477 }
1478
1479
1480 /*
1481 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1482
1483 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1484 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1485 bytes available. The return value is the pointer to the byte after the
1486 end of the new character. In other words,
1487
1488     d = uvchr_to_utf8(d, uv);
1489
1490 is the recommended wide native character-aware way of saying
1491
1492     *(d++) = uv;
1493
1494 =cut
1495 */
1496
1497 /* On ASCII machines this is normally a macro but we want a
1498    real function in case XS code wants it
1499 */
1500 #undef Perl_uvchr_to_utf8
1501 U8 *
1502 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1503 {
1504     return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1505 }
1506
1507
1508 /*
1509 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1510
1511 Returns the native character value of the first character in the string C<s>
1512 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1513 length, in bytes, of that character.
1514
1515 Allows length and flags to be passed to low level routine.
1516
1517 =cut
1518 */
1519 /* On ASCII machines this is normally a macro but we want a
1520    real function in case XS code wants it
1521 */
1522 #undef Perl_utf8n_to_uvchr
1523 UV
1524 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1525 {
1526     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1527     return UNI_TO_NATIVE(uv);
1528 }
1529
1530 /*
1531 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1532
1533 Build to the scalar dsv a displayable version of the string spv,
1534 length len, the displayable version being at most pvlim bytes long
1535 (if longer, the rest is truncated and "..." will be appended).
1536 The flags argument is currently unused but available for future extensions.
1537 The pointer to the PV of the dsv is returned.
1538
1539 =cut */
1540 char *
1541 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1542 {
1543     int truncated = 0;
1544     char *s, *e;
1545
1546     sv_setpvn(dsv, "", 0);
1547     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1548          UV u;
1549          if (pvlim && SvCUR(dsv) >= pvlim) {
1550               truncated++;
1551               break;
1552          }
1553          u = utf8_to_uvchr((U8*)s, 0);
1554          Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1555     }
1556     if (truncated)
1557          sv_catpvn(dsv, "...", 3);
1558
1559     return SvPVX(dsv);
1560 }
1561
1562 /*
1563 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1564
1565 Build to the scalar dsv a displayable version of the scalar sv,
1566 he displayable version being at most pvlim bytes long
1567 (if longer, the rest is truncated and "..." will be appended).
1568 The flags argument is currently unused but available for future extensions.
1569 The pointer to the PV of the dsv is returned.
1570
1571 =cut */
1572 char *
1573 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1574 {
1575      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1576                                 pvlim, flags);
1577 }
1578
1579 /*
1580 =for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1581
1582 Return true if the strings s1 and s2 differ case-insensitively, false
1583 if not (if they are equal case-insensitively).  If u1 is true, the
1584 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1585 the string s2 is assumed to be in UTF-8-encoded Unicode.
1586
1587 For case-insensitiveness, the "casefolding" of Unicode is used
1588 instead of upper/lowercasing both the characters, see
1589 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1590
1591 =cut */
1592 I32
1593 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1594 {
1595      register U8 *a  = (U8*)s1;
1596      register U8 *b  = (U8*)s2;
1597      register U8 *ae = b + len1;
1598      register U8 *be = b + len2;
1599      STRLEN la, lb;
1600      UV ca, cb;
1601      STRLEN ulen1, ulen2;
1602      U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1603      U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
1604
1605      while (a < ae && b < be) {
1606           if (u1) {
1607                if (a + UTF8SKIP(a) > ae)
1608                     break;
1609                ca = utf8_to_uvchr((U8*)a, &la);
1610           } else {
1611                ca = *a;
1612                la = 1;
1613           }
1614           if (u2) {
1615                if (b + UTF8SKIP(b) > be)
1616                     break;
1617                cb = utf8_to_uvchr((U8*)b, &lb);
1618           } else {
1619                cb = *b;
1620                lb = 1;
1621           }
1622           if (ca != cb) {
1623                if (u1)
1624                     to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1625                else
1626                     ulen1 = 1;
1627                if (u2)
1628                     to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1629                else
1630                     ulen2 = 1;
1631                if (ulen1 != ulen2
1632                    || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1633                    || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1634                     return 1; /* mismatch */
1635           }
1636           a += la;
1637           b += lb;
1638      }
1639      return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */
1640 }
1641