utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
   33 of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (UNI_IS_INVARIANT(uv)) {
  50         *d++ = UTF_TO_NATIVE(uv);
  51         return d;
  52     }
  53 #if defined(EBCDIC) || 1 /* always for testing */
  54     else {
  55         STRLEN len  = UNISKIP(uv);
  56         U8 *p = d+len-1;
  57         while (p > d) {
  58             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  59             uv >>= UTF_ACCUMULATION_SHIFT;
  60         }
  61         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  62         return d+len;
  63     }
  64 #else /* Non loop style */
  65     if (uv < 0x800) {
  66         *d++ = (( uv >>  6)         | 0xc0);
  67         *d++ = (( uv        & 0x3f) | 0x80);
  68         return d;
  69     }
  70     if (uv < 0x10000) {
  71         *d++ = (( uv >> 12)         | 0xe0);
  72         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  73         *d++ = (( uv        & 0x3f) | 0x80);
  74         return d;
  75     }
  76     if (uv < 0x200000) {
  77         *d++ = (( uv >> 18)         | 0xf0);
  78         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  79         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  80         *d++ = (( uv        & 0x3f) | 0x80);
  81         return d;
  82     }
  83     if (uv < 0x4000000) {
  84         *d++ = (( uv >> 24)         | 0xf8);
  85         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  86         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  87         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  88         *d++ = (( uv        & 0x3f) | 0x80);
  89         return d;
  90     }
  91     if (uv < 0x80000000) {
  92         *d++ = (( uv >> 30)         | 0xfc);
  93         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  95         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  96         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  97         *d++ = (( uv        & 0x3f) | 0x80);
  98         return d;
  99     }
 100 #ifdef HAS_QUAD
 101     if (uv < UTF8_QUAD_MAX)
 102 #endif
 103     {
 104         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 105         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 106         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 107         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 108         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 109         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 110         *d++ = (( uv        & 0x3f) | 0x80);
 111         return d;
 112     }
 113 #ifdef HAS_QUAD
 114     {
 115         *d++ =                        0xff;     /* Can't match U+FFFE! */
 116         *d++ =                        0x80;     /* 6 Reserved bits */
 117         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 118         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 119         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 120         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 121         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 122         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 123         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 124         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 125         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 126         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 127         *d++ = (( uv        & 0x3f) | 0x80);
 128         return d;
 129     }
 130 #endif
 131 #endif /* Loop style */
 132 }
 133
 134
 135
 136 /*
 137 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 138
 139 Tests if some arbitrary number of bytes begins in a valid UTF-8
 140 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 141 The actual number of bytes in the UTF-8 character will be returned if
 142 it is valid, otherwise 0.
 143
 144 =cut
 145 */
 146 STRLEN
 147 Perl_is_utf8_char(pTHX_ U8 *s)
 148 {
 149     U8 u = *s;
 150     STRLEN slen, len;
 151     UV uv, ouv;
 152
 153     if (UTF8_IS_INVARIANT(u))
 154         return 1;
 155
 156     if (!UTF8_IS_START(u))
 157         return 0;
 158
 159     len = UTF8SKIP(s);
 160
 161     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 162         return 0;
 163
 164     slen = len - 1;
 165     s++;
 166     u &= UTF_START_MASK(len);
 167     uv  = u;
 168     ouv = uv;
 169     while (slen--) {
 170         if (!UTF8_IS_CONTINUATION(*s))
 171             return 0;
 172         uv = UTF8_ACCUMULATE(uv, *s);
 173         if (uv < ouv)
 174             return 0;
 175         ouv = uv;
 176         s++;
 177     }
 178
 179     if (UNISKIP(uv) < len)
 180         return 0;
 181
 182     return len;
 183 }
 184
 185 /*
 186 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 187
 188 Returns true if first C<len> bytes of the given string form a valid UTF8
 189 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 190 'a string that contains UTF8' because a valid ASCII string is a valid
 191 UTF8 string.
 192
 193 =cut
 194 */
 195
 196 bool
 197 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 198 {
 199     U8* x = s;
 200     U8* send;
 201     STRLEN c;
 202
 203     if (!len)
 204         len = strlen((char *)s);
 205     send = s + len;
 206
 207     while (x < send) {
 208         c = is_utf8_char(x);
 209         if (!c)
 210             return FALSE;
 211         x += c;
 212     }
 213     if (x != send)
 214         return FALSE;
 215
 216     return TRUE;
 217 }
 218
 219 /*
 220 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 221
 222 Bottom level UTF-8 decode routine.
 223 Returns the unicode code point value of the first character in the string C<s>
 224 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 225 C<retlen> will be set to the length, in bytes, of that character.
 226
 227 If C<s> does not point to a well-formed UTF8 character, the behaviour
 228 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 229 it is assumed that the caller will raise a warning, and this function
 230 will silently just set C<retlen> to C<-1> and return zero.  If the
 231 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 232 malformations will be given, C<retlen> will be set to the expected
 233 length of the UTF-8 character in bytes, and zero will be returned.
 234
 235 The C<flags> can also contain various flags to allow deviations from
 236 the strict UTF-8 encoding (see F<utf8.h>).
 237
 238 Most code should use utf8_to_uvchr() rather than call this directly.
 239
 240 =cut
 241 */
 242
 243 UV
 244 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 245 {
 246     UV uv = *s, ouv = 0;
 247     STRLEN len = 1;
 248     bool dowarn = ckWARN_d(WARN_UTF8);
 249     STRLEN expectlen = 0;
 250     U32 warning = 0;
 251
 252 /* This list is a superset of the UTF8_ALLOW_XXX. */
 253
 254 #define UTF8_WARN_EMPTY                          1
 255 #define UTF8_WARN_CONTINUATION                   2
 256 #define UTF8_WARN_NON_CONTINUATION               3
 257 #define UTF8_WARN_FE_FF                          4
 258 #define UTF8_WARN_SHORT                          5
 259 #define UTF8_WARN_OVERFLOW                       6
 260 #define UTF8_WARN_SURROGATE                      7
 261 #define UTF8_WARN_BOM                            8
 262 #define UTF8_WARN_LONG                           9
 263 #define UTF8_WARN_FFFF                          10
 264
 265     if (curlen == 0 &&
 266         !(flags & UTF8_ALLOW_EMPTY)) {
 267         warning = UTF8_WARN_EMPTY;
 268         goto malformed;
 269     }
 270
 271     if (UTF8_IS_INVARIANT(uv)) {
 272         if (retlen)
 273             *retlen = 1;
 274         return (UV) (NATIVE_TO_UTF(*s));
 275     }
 276
 277     if (UTF8_IS_CONTINUATION(uv) &&
 278         !(flags & UTF8_ALLOW_CONTINUATION)) {
 279         warning = UTF8_WARN_CONTINUATION;
 280         goto malformed;
 281     }
 282
 283     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 284         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 285         warning = UTF8_WARN_NON_CONTINUATION;
 286         goto malformed;
 287     }
 288
 289 #ifdef EBCDIC
 290     uv = NATIVE_TO_UTF(uv);
 291 #else
 292     if ((uv == 0xfe || uv == 0xff) &&
 293         !(flags & UTF8_ALLOW_FE_FF)) {
 294         warning = UTF8_WARN_FE_FF;
 295         goto malformed;
 296     }
 297 #endif
 298
 299     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 300     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 301     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 302     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 303 #ifdef EBCDIC
 304     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 305     else                        { len =  7; uv &= 0x01; }
 306 #else
 307     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 308     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 309     else                        { len = 13; uv = 0; } /* whoa! */
 310 #endif
 311
 312     if (retlen)
 313         *retlen = len;
 314
 315     expectlen = len;
 316
 317     if ((curlen < expectlen) &&
 318         !(flags & UTF8_ALLOW_SHORT)) {
 319         warning = UTF8_WARN_SHORT;
 320         goto malformed;
 321     }
 322
 323     len--;
 324     s++;
 325     ouv = uv;
 326
 327     while (len--) {
 328         if (!UTF8_IS_CONTINUATION(*s) &&
 329             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 330             s--;
 331             warning = UTF8_WARN_NON_CONTINUATION;
 332             goto malformed;
 333         }
 334         else
 335             uv = UTF8_ACCUMULATE(uv, *s);
 336         if (!(uv > ouv)) {
 337             /* These cannot be allowed. */
 338             if (uv == ouv) {
 339                 if (!(flags & UTF8_ALLOW_LONG)) {
 340                     warning = UTF8_WARN_LONG;
 341                     goto malformed;
 342                 }
 343             }
 344             else { /* uv < ouv */
 345                 /* This cannot be allowed. */
 346                 warning = UTF8_WARN_OVERFLOW;
 347                 goto malformed;
 348             }
 349         }
 350         s++;
 351         ouv = uv;
 352     }
 353
 354     if (UNICODE_IS_SURROGATE(uv) &&
 355         !(flags & UTF8_ALLOW_SURROGATE)) {
 356         warning = UTF8_WARN_SURROGATE;
 357         goto malformed;
 358     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 359                !(flags & UTF8_ALLOW_BOM)) {
 360         warning = UTF8_WARN_BOM;
 361         goto malformed;
 362     } else if ((expectlen > UNISKIP(uv)) &&
 363                !(flags & UTF8_ALLOW_LONG)) {
 364         warning = UTF8_WARN_LONG;
 365         goto malformed;
 366     } else if (UNICODE_IS_ILLEGAL(uv) &&
 367                !(flags & UTF8_ALLOW_FFFF)) {
 368         warning = UTF8_WARN_FFFF;
 369         goto malformed;
 370     }
 371
 372     return uv;
 373
 374 malformed:
 375
 376     if (flags & UTF8_CHECK_ONLY) {
 377         if (retlen)
 378             *retlen = -1;
 379         return 0;
 380     }
 381
 382     if (dowarn) {
 383         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 384
 385         switch (warning) {
 386         case 0: /* Intentionally empty. */ break;
 387         case UTF8_WARN_EMPTY:
 388             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 389             break;
 390         case UTF8_WARN_CONTINUATION:
 391             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 392             break;
 393         case UTF8_WARN_NON_CONTINUATION:
 394             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 395                            (UV)s[1], uv);
 396             break;
 397         case UTF8_WARN_FE_FF:
 398             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 399             break;
 400         case UTF8_WARN_SHORT:
 401             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 402                            curlen, curlen == 1 ? "" : "s", expectlen);
 403             break;
 404         case UTF8_WARN_OVERFLOW:
 405             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 406                            ouv, *s);
 407             break;
 408         case UTF8_WARN_SURROGATE:
 409             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 410             break;
 411         case UTF8_WARN_BOM:
 412             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 413             break;
 414         case UTF8_WARN_LONG:
 415             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 416                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 417             break;
 418         case UTF8_WARN_FFFF:
 419             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 420             break;
 421         default:
 422             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 423             break;
 424         }
 425
 426         if (warning) {
 427             char *s = SvPVX(sv);
 428
 429             if (PL_op)
 430                 Perl_warner(aTHX_ WARN_UTF8,
 431                             "%s in %s", s,  OP_DESC(PL_op));
 432             else
 433                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 434         }
 435     }
 436
 437     if (retlen)
 438         *retlen = expectlen ? expectlen : len;
 439
 440     return 0;
 441 }
 442
 443 /*
 444 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 445
 446 Returns the native character value of the first character in the string C<s>
 447 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 448 length, in bytes, of that character.
 449
 450 If C<s> does not point to a well-formed UTF8 character, zero is
 451 returned and retlen is set, if possible, to -1.
 452
 453 =cut
 454 */
 455
 456 UV
 457 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 458 {
 459     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 460 }
 461
 462 /*
 463 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 464
 465 Returns the Unicode code point of the first character in the string C<s>
 466 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 467 length, in bytes, of that character.
 468
 469 This function should only be used when returned UV is considered
 470 an index into the Unicode semantic tables (e.g. swashes).
 471
  472 If C<s> does not point to a well-formed UTF8 character, zero is returned
  473 and retlen is set, if possible, to the expected length of that character.
 474
 475 =cut
 476 */
 477
 478 UV
 479 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 480 {
 481     /* Call the low level routine asking for checks */
 482     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 483 }
 484
 485 /*
 486 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 487
 488 Return the length of the UTF-8 char encoded string C<s> in characters.
  489 Stops at C<e> (exclusive).  If C<e E<lt> s> or if the scan would end
 490 up past C<e>, croaks.
 491
 492 =cut
 493 */
 494
 495 STRLEN
 496 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 497 {
 498     STRLEN len = 0;
 499
 500     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 501      * the bitops (especially ~) can create illegal UTF-8.
 502      * In other words: in Perl UTF-8 is not just for Unicode. */
 503
 504     if (e < s)
 505         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 506     while (s < e) {
 507         U8 t = UTF8SKIP(s);
 508
 509         if (e - s < t)
 510             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 511         s += t;
 512         len++;
 513     }
 514
 515     return len;
 516 }
 517
 518 /*
 519 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 520
 521 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 522 and C<b>.
 523
 524 WARNING: use only if you *know* that the pointers point inside the
 525 same UTF-8 buffer.
 526
 527 =cut
 528 */
 529
 530 IV
 531 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 532 {
 533     IV off = 0;
 534
 535     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 536      * the bitops (especially ~) can create illegal UTF-8.
 537      * In other words: in Perl UTF-8 is not just for Unicode. */
 538
 539     if (a < b) {
 540         while (a < b) {
 541             U8 c = UTF8SKIP(a);
 542
 543             if (b - a < c)
 544                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 545             a += c;
 546             off--;
 547         }
 548     }
 549     else {
 550         while (b < a) {
 551             U8 c = UTF8SKIP(b);
 552
 553             if (a - b < c)
 554                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 555             b += c;
 556             off++;
 557         }
 558     }
 559
 560     return off;
 561 }
 562
 563 /*
 564 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 565
 566 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 567 forward or backward.
 568
 569 WARNING: do not use the following unless you *know* C<off> is within
 570 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 571 on the first byte of character or just after the last byte of a character.
 572
 573 =cut
 574 */
 575
 576 U8 *
 577 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 578 {
 579     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 580      * the bitops (especially ~) can create illegal UTF-8.
 581      * In other words: in Perl UTF-8 is not just for Unicode. */
 582
 583     if (off >= 0) {
 584         while (off--)
 585             s += UTF8SKIP(s);
 586     }
 587     else {
 588         while (off++) {
 589             s--;
 590             while (UTF8_IS_CONTINUATION(*s))
 591                 s--;
 592         }
 593     }
 594     return s;
 595 }
 596
 597 /*
 598 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 599
 600 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 601 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 602 updates len to contain the new length.
 603 Returns zero on failure, setting C<len> to -1.
 604
 605 =cut
 606 */
 607
 608 U8 *
 609 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 610 {
 611     U8 *send;
 612     U8 *d;
 613     U8 *save = s;
 614
 615     /* ensure valid UTF8 and chars < 256 before updating string */
 616     for (send = s + *len; s < send; ) {
 617         U8 c = *s++;
 618
 619         if (!UTF8_IS_INVARIANT(c) &&
 620             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 621              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 622             *len = -1;
 623             return 0;
 624         }
 625     }
 626
 627     d = s = save;
 628     while (s < send) {
 629         STRLEN ulen;
 630         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 631         s += ulen;
 632     }
 633     *d = '\0';
 634     *len = d - save;
 635     return save;
 636 }
 637
 638 /*
 639 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 640
 641 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
  642 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 643 the newly-created string, and updates C<len> to contain the new
 644 length.  Returns the original string if no conversion occurs, C<len>
 645 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 646 0 if C<s> is converted or contains all 7bit characters.
 647
 648 =cut
 649 */
 650
 651 U8 *
 652 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 653 {
 654     U8 *d;
 655     U8 *start = s;
 656     U8 *send;
 657     I32 count = 0;
 658
 659     if (!*is_utf8)
 660         return start;
 661
 662     /* ensure valid UTF8 and chars < 256 before converting string */
 663     for (send = s + *len; s < send;) {
 664         U8 c = *s++;
 665         if (!UTF8_IS_INVARIANT(c)) {
 666             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 667                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 668                 count++;
 669             else
 670                 return start;
 671         }
 672     }
 673
 674     *is_utf8 = 0;
 675
 676     Newz(801, d, (*len) - count + 1, U8);
 677     s = start; start = d;
 678     while (s < send) {
 679         U8 c = *s++;
 680         if (!UTF8_IS_INVARIANT(c)) {
 681             /* Then it is two-byte encoded */
 682             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 683             c = ASCII_TO_NATIVE(c);
 684         }
 685         *d++ = c;
 686     }
 687     *d = '\0';
 688     *len = d - start;
 689     return start;
 690 }
 691
 692 /*
 693 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 694
 695 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 696 Returns a pointer to the newly-created string, and sets C<len> to
 697 reflect the new length.
 698
 699 =cut
 700 */
 701
 702 U8*
 703 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 704 {
 705     U8 *send;
 706     U8 *d;
 707     U8 *dst;
 708     send = s + (*len);
 709
 710     Newz(801, d, (*len) * 2 + 1, U8);
 711     dst = d;
 712
 713     while (s < send) {
 714         UV uv = NATIVE_TO_ASCII(*s++);
 715         if (UNI_IS_INVARIANT(uv))
 716             *d++ = UTF_TO_NATIVE(uv);
 717         else {
 718             *d++ = UTF8_EIGHT_BIT_HI(uv);
 719             *d++ = UTF8_EIGHT_BIT_LO(uv);
 720         }
 721     }
 722     *d = '\0';
 723     *len = d-dst;
 724     return dst;
 725 }
 726
 727 /*
 728  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 729  *
 730  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 731  * We optimize for native, for obvious reasons. */
 732
 733 U8*
 734 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 735 {
 736     U8* pend;
 737     U8* dstart = d;
 738
 739     if (bytelen & 1)
 740         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 741
 742     pend = p + bytelen;
 743
 744     while (p < pend) {
 745         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 746         p += 2;
 747         if (uv < 0x80) {
 748             *d++ = uv;
 749             continue;
 750         }
 751         if (uv < 0x800) {
 752             *d++ = (( uv >>  6)         | 0xc0);
 753             *d++ = (( uv        & 0x3f) | 0x80);
 754             continue;
 755         }
 756         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 757             UV low = *p++;
 758             if (low < 0xdc00 || low >= 0xdfff)
 759                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 760             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 761         }
 762         if (uv < 0x10000) {
 763             *d++ = (( uv >> 12)         | 0xe0);
 764             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 765             *d++ = (( uv        & 0x3f) | 0x80);
 766             continue;
 767         }
 768         else {
 769             *d++ = (( uv >> 18)         | 0xf0);
 770             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 771             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 772             *d++ = (( uv        & 0x3f) | 0x80);
 773             continue;
 774         }
 775     }
 776     *newlen = d - dstart;
 777     return d;
 778 }
 779
 780 /* Note: this one is slightly destructive of the source. */
 781
 782 U8*
 783 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 784 {
 785     U8* s = (U8*)p;
 786     U8* send = s + bytelen;
 787     while (s < send) {
 788         U8 tmp = s[0];
 789         s[0] = s[1];
 790         s[1] = tmp;
 791         s += 2;
 792     }
 793     return utf16_to_utf8(p, d, bytelen, newlen);
 794 }
 795
 796 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 797
 798 bool
 799 Perl_is_uni_alnum(pTHX_ U32 c)
 800 {
 801     U8 tmpbuf[UTF8_MAXLEN+1];
 802     uvchr_to_utf8(tmpbuf, (UV)c);
 803     return is_utf8_alnum(tmpbuf);
 804 }
 805
 806 bool
 807 Perl_is_uni_alnumc(pTHX_ U32 c)
 808 {
 809     U8 tmpbuf[UTF8_MAXLEN+1];
 810     uvchr_to_utf8(tmpbuf, (UV)c);
 811     return is_utf8_alnumc(tmpbuf);
 812 }
 813
 814 bool
 815 Perl_is_uni_idfirst(pTHX_ U32 c)
 816 {
 817     U8 tmpbuf[UTF8_MAXLEN+1];
 818     uvchr_to_utf8(tmpbuf, (UV)c);
 819     return is_utf8_idfirst(tmpbuf);
 820 }
 821
 822 bool
 823 Perl_is_uni_alpha(pTHX_ U32 c)
 824 {
 825     U8 tmpbuf[UTF8_MAXLEN+1];
 826     uvchr_to_utf8(tmpbuf, (UV)c);
 827     return is_utf8_alpha(tmpbuf);
 828 }
 829
 830 bool
 831 Perl_is_uni_ascii(pTHX_ U32 c)
 832 {
 833     U8 tmpbuf[UTF8_MAXLEN+1];
 834     uvchr_to_utf8(tmpbuf, (UV)c);
 835     return is_utf8_ascii(tmpbuf);
 836 }
 837
 838 bool
 839 Perl_is_uni_space(pTHX_ U32 c)
 840 {
 841     U8 tmpbuf[UTF8_MAXLEN+1];
 842     uvchr_to_utf8(tmpbuf, (UV)c);
 843     return is_utf8_space(tmpbuf);
 844 }
 845
 846 bool
 847 Perl_is_uni_digit(pTHX_ U32 c)
 848 {
 849     U8 tmpbuf[UTF8_MAXLEN+1];
 850     uvchr_to_utf8(tmpbuf, (UV)c);
 851     return is_utf8_digit(tmpbuf);
 852 }
 853
 854 bool
 855 Perl_is_uni_upper(pTHX_ U32 c)
 856 {
 857     U8 tmpbuf[UTF8_MAXLEN+1];
 858     uvchr_to_utf8(tmpbuf, (UV)c);
 859     return is_utf8_upper(tmpbuf);
 860 }
 861
 862 bool
 863 Perl_is_uni_lower(pTHX_ U32 c)
 864 {
 865     U8 tmpbuf[UTF8_MAXLEN+1];
 866     uvchr_to_utf8(tmpbuf, (UV)c);
 867     return is_utf8_lower(tmpbuf);
 868 }
 869
 870 bool
 871 Perl_is_uni_cntrl(pTHX_ U32 c)
 872 {
 873     U8 tmpbuf[UTF8_MAXLEN+1];
 874     uvchr_to_utf8(tmpbuf, (UV)c);
 875     return is_utf8_cntrl(tmpbuf);
 876 }
 877
 878 bool
 879 Perl_is_uni_graph(pTHX_ U32 c)
 880 {
 881     U8 tmpbuf[UTF8_MAXLEN+1];
 882     uvchr_to_utf8(tmpbuf, (UV)c);
 883     return is_utf8_graph(tmpbuf);
 884 }
 885
 886 bool
 887 Perl_is_uni_print(pTHX_ U32 c)
 888 {
 889     U8 tmpbuf[UTF8_MAXLEN+1];
 890     uvchr_to_utf8(tmpbuf, (UV)c);
 891     return is_utf8_print(tmpbuf);
 892 }
 893
 894 bool
 895 Perl_is_uni_punct(pTHX_ U32 c)
 896 {
 897     U8 tmpbuf[UTF8_MAXLEN+1];
 898     uvchr_to_utf8(tmpbuf, (UV)c);
 899     return is_utf8_punct(tmpbuf);
 900 }
 901
 902 bool
 903 Perl_is_uni_xdigit(pTHX_ U32 c)
 904 {
 905     U8 tmpbuf[UTF8_MAXLEN*2+1];
 906     uvchr_to_utf8(tmpbuf, (UV)c);
 907     return is_utf8_xdigit(tmpbuf);
 908 }
 909
 910 U32
 911 Perl_to_uni_upper(pTHX_ U32 c, U8* p, STRLEN *lenp)
 912 {
 913     U8 tmpbuf[UTF8_MAXLEN*2+1];
 914     uvchr_to_utf8(tmpbuf, (UV)c);
 915     return to_utf8_upper(tmpbuf, p, lenp);
 916 }
 917
 918 U32
 919 Perl_to_uni_title(pTHX_ U32 c, U8* p, STRLEN *lenp)
 920 {
 921     U8 tmpbuf[UTF8_MAXLEN*2+1];
 922     uvchr_to_utf8(tmpbuf, (UV)c);
 923     return to_utf8_title(tmpbuf, p, lenp);
 924 }
 925
 926 U32
 927 Perl_to_uni_lower(pTHX_ U32 c, U8* p, STRLEN *lenp)
 928 {
 929     U8 tmpbuf[UTF8_MAXLEN+1];
 930     uvchr_to_utf8(tmpbuf, (UV)c);
 931     return to_utf8_lower(tmpbuf, p, lenp);
 932 }
 933
 934 /* for now these all assume no locale info available for Unicode > 255 */
 935
 936 bool
 937 Perl_is_uni_alnum_lc(pTHX_ U32 c)
 938 {
 939     return is_uni_alnum(c);     /* XXX no locale support yet */
 940 }
 941
 942 bool
 943 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
 944 {
 945     return is_uni_alnumc(c);    /* XXX no locale support yet */
 946 }
 947
 948 bool
 949 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
 950 {
 951     return is_uni_idfirst(c);   /* XXX no locale support yet */
 952 }
 953
 954 bool
 955 Perl_is_uni_alpha_lc(pTHX_ U32 c)
 956 {
 957     return is_uni_alpha(c);     /* XXX no locale support yet */
 958 }
 959
 960 bool
 961 Perl_is_uni_ascii_lc(pTHX_ U32 c)
 962 {
 963     return is_uni_ascii(c);     /* XXX no locale support yet */
 964 }
 965
 966 bool
 967 Perl_is_uni_space_lc(pTHX_ U32 c)
 968 {
 969     return is_uni_space(c);     /* XXX no locale support yet */
 970 }
 971
 972 bool
 973 Perl_is_uni_digit_lc(pTHX_ U32 c)
 974 {
 975     return is_uni_digit(c);     /* XXX no locale support yet */
 976 }
 977
 978 bool
 979 Perl_is_uni_upper_lc(pTHX_ U32 c)
 980 {
 981     return is_uni_upper(c);     /* XXX no locale support yet */
 982 }
 983
 984 bool
 985 Perl_is_uni_lower_lc(pTHX_ U32 c)
 986 {
 987     return is_uni_lower(c);     /* XXX no locale support yet */
 988 }
 989
 990 bool
 991 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
 992 {
 993     return is_uni_cntrl(c);     /* XXX no locale support yet */
 994 }
 995
 996 bool
 997 Perl_is_uni_graph_lc(pTHX_ U32 c)
 998 {
 999     return is_uni_graph(c);     /* XXX no locale support yet */
1000 }
1001
1002 bool
1003 Perl_is_uni_print_lc(pTHX_ U32 c)
1004 {
1005     return is_uni_print(c);     /* XXX no locale support yet */
1006 }
1007
1008 bool
1009 Perl_is_uni_punct_lc(pTHX_ U32 c)
1010 {
1011     return is_uni_punct(c);     /* XXX no locale support yet */
1012 }
1013
1014 bool
1015 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1016 {
1017     return is_uni_xdigit(c);    /* XXX no locale support yet */
1018 }
1019
1020 bool
1021 Perl_is_utf8_alnum(pTHX_ U8 *p)
1022 {
1023     if (!is_utf8_char(p))
1024         return FALSE;
1025     if (!PL_utf8_alnum)
1026         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1027          * descendant of isalnum(3), in other words, it doesn't
1028          * contain the '_'. --jhi */
1029         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1030     return swash_fetch(PL_utf8_alnum, p, TRUE);
1031 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1032 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1033     if (!PL_utf8_alnum)
1034         PL_utf8_alnum = swash_init("utf8", "",
1035             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1036     return swash_fetch(PL_utf8_alnum, p, TRUE);
1037 #endif
1038 }
1039
1040 bool
1041 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1042 {
1043     if (!is_utf8_char(p))
1044         return FALSE;
1045     if (!PL_utf8_alnum)
1046         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1047     return swash_fetch(PL_utf8_alnum, p, TRUE);
1048 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1049 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1050     if (!PL_utf8_alnum)
1051         PL_utf8_alnum = swash_init("utf8", "",
1052             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1053     return swash_fetch(PL_utf8_alnum, p, TRUE);
1054 #endif
1055 }
1056
1057 bool
1058 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1059 {
1060     return *p == '_' || is_utf8_alpha(p);
1061 }
1062
1063 bool
1064 Perl_is_utf8_alpha(pTHX_ U8 *p)
1065 {
1066     if (!is_utf8_char(p))
1067         return FALSE;
1068     if (!PL_utf8_alpha)
1069         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1070     return swash_fetch(PL_utf8_alpha, p, TRUE);
1071 }
1072
1073 bool
1074 Perl_is_utf8_ascii(pTHX_ U8 *p)
1075 {
1076     if (!is_utf8_char(p))
1077         return FALSE;
1078     if (!PL_utf8_ascii)
1079         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1080     return swash_fetch(PL_utf8_ascii, p, TRUE);
1081 }
1082
1083 bool
1084 Perl_is_utf8_space(pTHX_ U8 *p)
1085 {
1086     if (!is_utf8_char(p))
1087         return FALSE;
1088     if (!PL_utf8_space)
1089         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1090     return swash_fetch(PL_utf8_space, p, TRUE);
1091 }
1092
1093 bool
1094 Perl_is_utf8_digit(pTHX_ U8 *p)
1095 {
1096     if (!is_utf8_char(p))
1097         return FALSE;
1098     if (!PL_utf8_digit)
1099         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1100     return swash_fetch(PL_utf8_digit, p, TRUE);
1101 }
1102
1103 bool
1104 Perl_is_utf8_upper(pTHX_ U8 *p)
1105 {
1106     if (!is_utf8_char(p))
1107         return FALSE;
1108     if (!PL_utf8_upper)
1109         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1110     return swash_fetch(PL_utf8_upper, p, TRUE);
1111 }
1112
1113 bool
1114 Perl_is_utf8_lower(pTHX_ U8 *p)
1115 {
1116     if (!is_utf8_char(p))
1117         return FALSE;
1118     if (!PL_utf8_lower)
1119         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1120     return swash_fetch(PL_utf8_lower, p, TRUE);
1121 }
1122
1123 bool
1124 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1125 {
1126     if (!is_utf8_char(p))
1127         return FALSE;
1128     if (!PL_utf8_cntrl)
1129         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1130     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1131 }
1132
1133 bool
1134 Perl_is_utf8_graph(pTHX_ U8 *p)
1135 {
1136     if (!is_utf8_char(p))
1137         return FALSE;
1138     if (!PL_utf8_graph)
1139         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1140     return swash_fetch(PL_utf8_graph, p, TRUE);
1141 }
1142
1143 bool
1144 Perl_is_utf8_print(pTHX_ U8 *p)
1145 {
1146     if (!is_utf8_char(p))
1147         return FALSE;
1148     if (!PL_utf8_print)
1149         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1150     return swash_fetch(PL_utf8_print, p, TRUE);
1151 }
1152
1153 bool
1154 Perl_is_utf8_punct(pTHX_ U8 *p)
1155 {
1156     if (!is_utf8_char(p))
1157         return FALSE;
1158     if (!PL_utf8_punct)
1159         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1160     return swash_fetch(PL_utf8_punct, p, TRUE);
1161 }
1162
1163 bool
1164 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1165 {
1166     if (!is_utf8_char(p))
1167         return FALSE;
1168     if (!PL_utf8_xdigit)
1169         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1170     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1171 }
1172
1173 bool
1174 Perl_is_utf8_mark(pTHX_ U8 *p)
1175 {
1176     if (!is_utf8_char(p))
1177         return FALSE;
1178     if (!PL_utf8_mark)
1179         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1180     return swash_fetch(PL_utf8_mark, p, TRUE);
1181 }
1182
1183 /*
1184 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1185
The "p" is a pointer to the UTF-8 encoded character that is being
converted.

The "ustrp" is a pointer to the character buffer that receives the
conversion result.  The "lenp" is a pointer to where the length of
the result is stored.
1192
1193 The "swash" is a pointer to the swash to use.
1194
1195 The "normal" is a string like "ToLower" which means the swash
1196 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1197 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1198
1199 The "special" is a string like "utf8::ToSpecLower", which means
1200 the hash %utf8::ToSpecLower, which is stored in the same file,
1201 lib/unicore/To/Lower.pl, and also loaded by SWASHGET.  The access
1202 to the hash is by Perl_to_utf8_case().
1203
1204 =cut
1205  */
1206
1207 UV
1208 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
1209 {
1210     UV uv;
1211
1212     if (!*swashp)
1213         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1214     uv = swash_fetch(*swashp, p, TRUE);
1215     if (uv)
1216          uv = UNI_TO_NATIVE(uv);
1217     else {
1218          HV *hv;
1219          SV *keysv;
1220          HE *he;
1221
1222          uv = utf8_to_uvchr(p, 0);
1223
1224          if ((hv    = get_hv(special, FALSE)) &&
1225              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1226              (he    = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1227               SV *val = HeVAL(he);
1228               char *s = SvPV(val, *lenp);
1229               U8 c = *(U8*)s;
1230               if (*lenp > 1 || UNI_IS_INVARIANT(c))
1231                    Copy(s, ustrp, *lenp, U8);
1232               else {
1233                    /* something in the 0x80..0xFF range */
1234                    ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1235                    ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1236                    *lenp = 2;
1237               }
1238               return 0;
1239          }
1240     }
1241     *lenp = UNISKIP(uv);
1242     uvuni_to_utf8(ustrp, uv);
1243     return uv;
1244 }
1245
1246 UV
1247 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1248 {
1249     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1250                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1251 }
1252
1253 UV
1254 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1255 {
1256     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1257                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1258 }
1259
1260 UV
1261 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1262 {
1263     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1264                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1265 }
1266
1267 UV
1268 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1269 {
1270     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1271                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1272 }
1273
1274 /* a "swash" is a swatch hash */
1275
1276 SV*
1277 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1278 {
1279     SV* retval;
1280     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1281     dSP;
1282     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1283     SV* errsv_save;
1284
1285     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1286         ENTER;
1287         errsv_save = newSVsv(ERRSV);
1288         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1289         if (!SvTRUE(ERRSV))
1290             sv_setsv(ERRSV, errsv_save);
1291         SvREFCNT_dec(errsv_save);
1292         LEAVE;
1293     }
1294     SPAGAIN;
1295     PUSHSTACKi(PERLSI_MAGIC);
1296     PUSHMARK(SP);
1297     EXTEND(SP,5);
1298     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1299     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1300     PUSHs(listsv);
1301     PUSHs(sv_2mortal(newSViv(minbits)));
1302     PUSHs(sv_2mortal(newSViv(none)));
1303     PUTBACK;
1304     ENTER;
1305     SAVEI32(PL_hints);
1306     PL_hints = 0;
1307     save_re_context();
1308     if (PL_curcop == &PL_compiling)
1309         /* XXX ought to be handled by lex_start */
1310         sv_setpv(tokenbufsv, PL_tokenbuf);
1311     errsv_save = newSVsv(ERRSV);
1312     if (call_method("SWASHNEW", G_SCALAR))
1313         retval = newSVsv(*PL_stack_sp--);
1314     else
1315         retval = &PL_sv_undef;
1316     if (!SvTRUE(ERRSV))
1317         sv_setsv(ERRSV, errsv_save);
1318     SvREFCNT_dec(errsv_save);
1319     LEAVE;
1320     POPSTACK;
1321     if (PL_curcop == &PL_compiling) {
1322         STRLEN len;
1323         char* pv = SvPV(tokenbufsv, len);
1324
1325         Copy(pv, PL_tokenbuf, len+1, char);
1326         PL_curcop->op_private = PL_hints;
1327     }
1328     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1329         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1330     return retval;
1331 }
1332
1333
1334 /* This API is wrong for special case conversions since we may need to
1335  * return several Unicode characters for a single Unicode character
1336  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1337  * the lower-level routine, and it is similarly broken for returning
1338  * multiple values.  --jhi */
1339 UV
1340 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1341 {
1342     HV* hv = (HV*)SvRV(sv);
1343     U32 klen;
1344     U32 off;
1345     STRLEN slen;
1346     STRLEN needents;
1347     U8 *tmps = NULL;
1348     U32 bit;
1349     SV *retval;
1350     U8 tmputf8[2];
1351     UV c = NATIVE_TO_ASCII(*ptr);
1352
1353     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1354         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1355         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1356         ptr = tmputf8;
1357     }
1358     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1359      * then the "swatch" is a vec() for al the chars which start
1360      * with 0xAA..0xYY
1361      * So the key in the hash (klen) is length of encoded char -1
1362      */
1363     klen = UTF8SKIP(ptr) - 1;
1364     off  = ptr[klen];
1365
1366     if (klen == 0)
1367      {
1368       /* If char in invariant then swatch is for all the invariant chars
1369        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1370        */
1371       needents = UTF_CONTINUATION_MARK;
1372       off      = NATIVE_TO_UTF(ptr[klen]);
1373      }
1374     else
1375      {
1376       /* If char is encoded then swatch is for the prefix */
1377       needents = (1 << UTF_ACCUMULATION_SHIFT);
1378       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1379      }
1380
1381     /*
1382      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1383      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1384      * it's nothing to sniff at.)  Pity we usually come through at least
1385      * two function calls to get here...
1386      *
1387      * NB: this code assumes that swatches are never modified, once generated!
1388      */
1389
1390     if (hv   == PL_last_swash_hv &&
1391         klen == PL_last_swash_klen &&
1392         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1393     {
1394         tmps = PL_last_swash_tmps;
1395         slen = PL_last_swash_slen;
1396     }
1397     else {
1398         /* Try our second-level swatch cache, kept in a hash. */
1399         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1400
1401         /* If not cached, generate it via utf8::SWASHGET */
1402         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1403             dSP;
1404             /* We use utf8n_to_uvuni() as we want an index into
1405                Unicode tables, not a native character number.
1406              */
1407             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1408             SV *errsv_save;
1409             ENTER;
1410             SAVETMPS;
1411             save_re_context();
1412             PUSHSTACKi(PERLSI_MAGIC);
1413             PUSHMARK(SP);
1414             EXTEND(SP,3);
1415             PUSHs((SV*)sv);
1416             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1417             PUSHs(sv_2mortal(newSViv((klen) ?
1418                                      (code_point & ~(needents - 1)) : 0)));
1419             PUSHs(sv_2mortal(newSViv(needents)));
1420             PUTBACK;
1421             errsv_save = newSVsv(ERRSV);
1422             if (call_method("SWASHGET", G_SCALAR))
1423                 retval = newSVsv(*PL_stack_sp--);
1424             else
1425                 retval = &PL_sv_undef;
1426             if (!SvTRUE(ERRSV))
1427                 sv_setsv(ERRSV, errsv_save);
1428             SvREFCNT_dec(errsv_save);
1429             POPSTACK;
1430             FREETMPS;
1431             LEAVE;
1432             if (PL_curcop == &PL_compiling)
1433                 PL_curcop->op_private = PL_hints;
1434
1435             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1436
1437             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1438                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1439         }
1440
1441         PL_last_swash_hv = hv;
1442         PL_last_swash_klen = klen;
1443         PL_last_swash_tmps = tmps;
1444         PL_last_swash_slen = slen;
1445         if (klen)
1446             Copy(ptr, PL_last_swash_key, klen, U8);
1447     }
1448
1449     switch ((int)((slen << 3) / needents)) {
1450     case 1:
1451         bit = 1 << (off & 7);
1452         off >>= 3;
1453         return (tmps[off] & bit) != 0;
1454     case 8:
1455         return tmps[off];
1456     case 16:
1457         off <<= 1;
1458         return (tmps[off] << 8) + tmps[off + 1] ;
1459     case 32:
1460         off <<= 2;
1461         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1462     }
1463     Perl_croak(aTHX_ "panic: swash_fetch");
1464     return 0;
1465 }
1466
1467
1468 /*
1469 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1470
Adds the UTF8 representation of the Native codepoint C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
1475
1476     d = uvchr_to_utf8(d, uv);
1477
1478 is the recommended wide native character-aware way of saying
1479
1480     *(d++) = uv;
1481
1482 =cut
1483 */
1484
1485 /* On ASCII machines this is normally a macro but we want a
1486    real function in case XS code wants it
1487 */
1488 #undef Perl_uvchr_to_utf8
1489 U8 *
1490 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1491 {
1492     return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1493 }
1494
1495
1496 /*
1497 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1498
1499 Returns the native character value of the first character in the string C<s>
1500 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1501 length, in bytes, of that character.
1502
1503 Allows length and flags to be passed to low level routine.
1504
1505 =cut
1506 */
1507 /* On ASCII machines this is normally a macro but we want a
1508    real function in case XS code wants it
1509 */
1510 #undef Perl_utf8n_to_uvchr
1511 UV
1512 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1513 {
1514     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1515     return UNI_TO_NATIVE(uv);
1516 }
1517
1518