utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (UNI_IS_INVARIANT(uv)) {
  50         *d++ = UTF_TO_NATIVE(uv);
  51         return d;
  52     }
  53 #if defined(EBCDIC) || 1 /* always for testing */
  54     else {
  55         STRLEN len  = UNISKIP(uv);
  56         U8 *p = d+len-1;
  57         while (p > d) {
  58             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  59             uv >>= UTF_ACCUMULATION_SHIFT;
  60         }
  61         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  62         return d+len;
  63     }
  64 #else /* Non loop style */
  65     if (uv < 0x800) {
  66         *d++ = (( uv >>  6)         | 0xc0);
  67         *d++ = (( uv        & 0x3f) | 0x80);
  68         return d;
  69     }
  70     if (uv < 0x10000) {
  71         *d++ = (( uv >> 12)         | 0xe0);
  72         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  73         *d++ = (( uv        & 0x3f) | 0x80);
  74         return d;
  75     }
  76     if (uv < 0x200000) {
  77         *d++ = (( uv >> 18)         | 0xf0);
  78         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  79         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  80         *d++ = (( uv        & 0x3f) | 0x80);
  81         return d;
  82     }
  83     if (uv < 0x4000000) {
  84         *d++ = (( uv >> 24)         | 0xf8);
  85         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  86         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  87         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  88         *d++ = (( uv        & 0x3f) | 0x80);
  89         return d;
  90     }
  91     if (uv < 0x80000000) {
  92         *d++ = (( uv >> 30)         | 0xfc);
  93         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  95         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  96         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  97         *d++ = (( uv        & 0x3f) | 0x80);
  98         return d;
  99     }
 100 #ifdef HAS_QUAD
 101     if (uv < UTF8_QUAD_MAX)
 102 #endif
 103     {
 104         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 105         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 106         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 107         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 108         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 109         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 110         *d++ = (( uv        & 0x3f) | 0x80);
 111         return d;
 112     }
 113 #ifdef HAS_QUAD
 114     {
 115         *d++ =                        0xff;     /* Can't match U+FFFE! */
 116         *d++ =                        0x80;     /* 6 Reserved bits */
 117         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 118         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 119         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 120         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 121         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 122         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 123         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 124         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 125         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 126         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 127         *d++ = (( uv        & 0x3f) | 0x80);
 128         return d;
 129     }
 130 #endif
 131 #endif /* Loop style */
 132 }
 133
 134
 135
 136 /*
 137 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 138
 139 Tests if some arbitrary number of bytes begins in a valid UTF-8
 140 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 141 The actual number of bytes in the UTF-8 character will be returned if
 142 it is valid, otherwise 0.
 143
 144 =cut */
 145 STRLEN
 146 Perl_is_utf8_char(pTHX_ U8 *s)
 147 {
 148     U8 u = *s;
 149     STRLEN slen, len;
 150     UV uv, ouv;
 151
 152     if (UTF8_IS_INVARIANT(u))
 153         return 1;
 154
 155     if (!UTF8_IS_START(u))
 156         return 0;
 157
 158     len = UTF8SKIP(s);
 159
 160     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 161         return 0;
 162
 163     slen = len - 1;
 164     s++;
 165     /* The initial value is dubious */
 166     uv  = u;
 167     ouv = uv;
 168     while (slen--) {
 169         if (!UTF8_IS_CONTINUATION(*s))
 170             return 0;
 171         uv = UTF8_ACCUMULATE(uv, *s);
 172         if (uv < ouv)
 173             return 0;
 174         ouv = uv;
 175         s++;
 176     }
 177
 178     if (UNISKIP(uv) < len)
 179         return 0;
 180
 181     return len;
 182 }
 183
 184 /*
 185 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 186
 187 Returns true if first C<len> bytes of the given string form a valid UTF8
 188 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 189 'a string that contains UTF8' because a valid ASCII string is a valid
 190 UTF8 string.
 191
 192 =cut
 193 */
 194
 195 bool
 196 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 197 {
 198     U8* x = s;
 199     U8* send;
 200     STRLEN c;
 201
 202     if (!len)
 203         len = strlen((char *)s);
 204     send = s + len;
 205
 206     while (x < send) {
 207         c = is_utf8_char(x);
 208         if (!c)
 209             return FALSE;
 210         x += c;
 211     }
 212     if (x != send)
 213         return FALSE;
 214
 215     return TRUE;
 216 }
 217
 218 /*
 219 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 220
 221 Bottom level UTF-8 decode routine.
 222 Returns the unicode code point value of the first character in the string C<s>
 223 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 224 C<retlen> will be set to the length, in bytes, of that character.
 225
 226 If C<s> does not point to a well-formed UTF8 character, the behaviour
 227 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 228 it is assumed that the caller will raise a warning, and this function
 229 will silently just set C<retlen> to C<-1> and return zero.  If the
 230 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 231 malformations will be given, C<retlen> will be set to the expected
 232 length of the UTF-8 character in bytes, and zero will be returned.
 233
 234 The C<flags> can also contain various flags to allow deviations from
 235 the strict UTF-8 encoding (see F<utf8.h>).
 236
 237 Most code should use utf8_to_uvchr() rather than call this directly.
 238
 239 =cut */
 240
 241 UV
 242 Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 243 {
 244     UV uv = *s, ouv;
 245     STRLEN len = 1;
 246     bool dowarn = ckWARN_d(WARN_UTF8);
 247     STRLEN expectlen = 0;
 248     U32 warning = 0;
 249
 250 /* This list is a superset of the UTF8_ALLOW_XXX. */
 251
 252 #define UTF8_WARN_EMPTY                          1
 253 #define UTF8_WARN_CONTINUATION                   2
 254 #define UTF8_WARN_NON_CONTINUATION               3
 255 #define UTF8_WARN_FE_FF                          4
 256 #define UTF8_WARN_SHORT                          5
 257 #define UTF8_WARN_OVERFLOW                       6
 258 #define UTF8_WARN_SURROGATE                      7
 259 #define UTF8_WARN_BOM                            8
 260 #define UTF8_WARN_LONG                           9
 261 #define UTF8_WARN_FFFF                          10
 262
 263     if (curlen == 0 &&
 264         !(flags & UTF8_ALLOW_EMPTY)) {
 265         warning = UTF8_WARN_EMPTY;
 266         goto malformed;
 267     }
 268
 269     if (UTF8_IS_INVARIANT(uv)) {
 270         if (retlen)
 271             *retlen = 1;
 272         return (UV) (NATIVE_TO_UTF(*s));
 273     }
 274
 275     if (UTF8_IS_CONTINUATION(uv) &&
 276         !(flags & UTF8_ALLOW_CONTINUATION)) {
 277         warning = UTF8_WARN_CONTINUATION;
 278         goto malformed;
 279     }
 280
 281     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 282         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 283         warning = UTF8_WARN_NON_CONTINUATION;
 284         goto malformed;
 285     }
 286
 287 #ifdef EBCDIC
 288     uv = NATIVE_TO_UTF(uv);
 289 #else
 290     if ((uv == 0xfe || uv == 0xff) &&
 291         !(flags & UTF8_ALLOW_FE_FF)) {
 292         warning = UTF8_WARN_FE_FF;
 293         goto malformed;
 294     }
 295 #endif
 296
 297     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 298     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 299     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 300     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 301 #ifdef EBCDIC
 302     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 303     else                        { len =  7; uv &= 0x01; }
 304 #else
 305     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 306     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 307     else                        { len = 13; uv = 0; } /* whoa! */
 308 #endif
 309
 310     if (retlen)
 311         *retlen = len;
 312
 313     expectlen = len;
 314
 315     if ((curlen < expectlen) &&
 316         !(flags & UTF8_ALLOW_SHORT)) {
 317         warning = UTF8_WARN_SHORT;
 318         goto malformed;
 319     }
 320
 321     len--;
 322     s++;
 323     ouv = uv;
 324
 325     while (len--) {
 326         if (!UTF8_IS_CONTINUATION(*s) &&
 327             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 328             s--;
 329             warning = UTF8_WARN_NON_CONTINUATION;
 330             goto malformed;
 331         }
 332         else
 333             uv = UTF8_ACCUMULATE(uv, *s);
 334         if (!(uv > ouv)) {
 335             /* These cannot be allowed. */
 336             if (uv == ouv) {
 337                 if (!(flags & UTF8_ALLOW_LONG)) {
 338                     warning = UTF8_WARN_LONG;
 339                     goto malformed;
 340                 }
 341             }
 342             else { /* uv < ouv */
 343                 /* This cannot be allowed. */
 344                 warning = UTF8_WARN_OVERFLOW;
 345                 goto malformed;
 346             }
 347         }
 348         s++;
 349         ouv = uv;
 350     }
 351
 352     if (UNICODE_IS_SURROGATE(uv) &&
 353         !(flags & UTF8_ALLOW_SURROGATE)) {
 354         warning = UTF8_WARN_SURROGATE;
 355         goto malformed;
 356     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 357                !(flags & UTF8_ALLOW_BOM)) {
 358         warning = UTF8_WARN_BOM;
 359         goto malformed;
 360     } else if ((expectlen > UNISKIP(uv)) &&
 361                !(flags & UTF8_ALLOW_LONG)) {
 362         warning = UTF8_WARN_LONG;
 363         goto malformed;
 364     } else if (UNICODE_IS_ILLEGAL(uv) &&
 365                !(flags & UTF8_ALLOW_FFFF)) {
 366         warning = UTF8_WARN_FFFF;
 367         goto malformed;
 368     }
 369
 370     return uv;
 371
 372 malformed:
 373
 374     if (flags & UTF8_CHECK_ONLY) {
 375         if (retlen)
 376             *retlen = -1;
 377         return 0;
 378     }
 379
 380     if (dowarn) {
 381         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 382
 383         switch (warning) {
 384         case 0: /* Intentionally empty. */ break;
 385         case UTF8_WARN_EMPTY:
 386             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 387             break;
 388         case UTF8_WARN_CONTINUATION:
 389             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 390             break;
 391         case UTF8_WARN_NON_CONTINUATION:
 392             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 393                            (UV)s[1], uv);
 394             break;
 395         case UTF8_WARN_FE_FF:
 396             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 397             break;
 398         case UTF8_WARN_SHORT:
 399             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 400                            curlen, curlen == 1 ? "" : "s", expectlen);
 401             break;
 402         case UTF8_WARN_OVERFLOW:
 403             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 404                            ouv, *s);
 405             break;
 406         case UTF8_WARN_SURROGATE:
 407             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 408             break;
 409         case UTF8_WARN_BOM:
 410             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 411             break;
 412         case UTF8_WARN_LONG:
 413             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 414                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 415             break;
 416         case UTF8_WARN_FFFF:
 417             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 418             break;
 419         default:
 420             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 421             break;
 422         }
 423
 424         if (warning) {
 425             char *s = SvPVX(sv);
 426
 427             if (PL_op)
 428                 Perl_warner(aTHX_ WARN_UTF8,
 429                             "%s in %s", s,  PL_op_desc[PL_op->op_type]);
 430             else
 431                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 432         }
 433     }
 434
 435     if (retlen)
 436         *retlen = expectlen ? expectlen : len;
 437
 438     return 0;
 439 }
 440
 441 /*
 442 =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
 443
 444 Returns the native character value of the first character in the string C<s>
 445 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 446 length, in bytes, of that character.
 447
 448 If C<s> does not point to a well-formed UTF8 character, zero is
 449 returned and retlen is set, if possible, to -1.
 450
 451 =cut
 452 */
 453
 454 UV
 455 Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
 456 {
 457     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 458 }
 459
 460 /*
 461 =for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
 462
 463 Returns the Unicode code point of the first character in the string C<s>
 464 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 465 length, in bytes, of that character.
 466
 467 This function should only be used when returned UV is considered
 468 an index into the Unicode semantic tables (e.g. swashes).
 469
 470 If C<s> does not point to a well-formed UTF8 character, zero is
 471 returned and retlen is set, if possible, to -1.
 472
 473 =cut
 474 */
 475
 476 UV
 477 Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
 478 {
 479     /* Call the low level routine asking for checks */
 480     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 481 }
 482
 483 /*
 484 =for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
 485
 486 Return the length of the UTF-8 char encoded string C<s> in characters.
 487 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 488 up past C<e>, croaks.
 489
 490 =cut
 491 */
 492
 493 STRLEN
 494 Perl_utf8_length(pTHX_ U8* s, U8* e)
 495 {
 496     STRLEN len = 0;
 497
 498     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 499      * the bitops (especially ~) can create illegal UTF-8.
 500      * In other words: in Perl UTF-8 is not just for Unicode. */
 501
 502     if (e < s)
 503         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 504     while (s < e) {
 505         U8 t = UTF8SKIP(s);
 506
 507         if (e - s < t)
 508             Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
 509         s += t;
 510         len++;
 511     }
 512
 513     return len;
 514 }
 515
 516 /*
 517 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 518
 519 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 520 and C<b>.
 521
 522 WARNING: use only if you *know* that the pointers point inside the
 523 same UTF-8 buffer.
 524
 525 =cut */
 526
 527 IV
 528 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 529 {
 530     IV off = 0;
 531
 532     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 533      * the bitops (especially ~) can create illegal UTF-8.
 534      * In other words: in Perl UTF-8 is not just for Unicode. */
 535
 536     if (a < b) {
 537         while (a < b) {
 538             U8 c = UTF8SKIP(a);
 539
 540             if (b - a < c)
 541                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 542             a += c;
 543             off--;
 544         }
 545     }
 546     else {
 547         while (b < a) {
 548             U8 c = UTF8SKIP(b);
 549
 550             if (a - b < c)
 551                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 552             b += c;
 553             off++;
 554         }
 555     }
 556
 557     return off;
 558 }
 559
 560 /*
 561 =for apidoc A|U8*|utf8_hop|U8 *s|I32 off
 562
 563 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 564 forward or backward.
 565
 566 WARNING: do not use the following unless you *know* C<off> is within
 567 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 568 on the first byte of character or just after the last byte of a character.
 569
 570 =cut */
 571
 572 U8 *
 573 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 574 {
 575     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 576      * the bitops (especially ~) can create illegal UTF-8.
 577      * In other words: in Perl UTF-8 is not just for Unicode. */
 578
 579     if (off >= 0) {
 580         while (off--)
 581             s += UTF8SKIP(s);
 582     }
 583     else {
 584         while (off++) {
 585             s--;
 586             while (UTF8_IS_CONTINUATION(*s))
 587                 s--;
 588         }
 589     }
 590     return s;
 591 }
 592
 593 /*
 594 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 595
 596 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 597 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 598 updates len to contain the new length.
 599 Returns zero on failure, setting C<len> to -1.
 600
 601 =cut
 602 */
 603
 604 U8 *
 605 Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 606 {
 607     U8 *send;
 608     U8 *d;
 609     U8 *save = s;
 610
 611     /* ensure valid UTF8 and chars < 256 before updating string */
 612     for (send = s + *len; s < send; ) {
 613         U8 c = *s++;
 614
 615         if (!UTF8_IS_INVARIANT(c) &&
 616             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 617              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 618             *len = -1;
 619             return 0;
 620         }
 621     }
 622
 623     d = s = save;
 624     while (s < send) {
 625         STRLEN ulen;
 626         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 627         s += ulen;
 628     }
 629     *d = '\0';
 630     *len = d - save;
 631     return save;
 632 }
 633
 634 /*
 635 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 636
 637 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 638 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 639 the newly-created string, and updates C<len> to contain the new
 640 length.  Returns the original string if no conversion occurs, C<len>
 641 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 642 0 if C<s> is converted or contains all 7bit characters.
 643
 644 =cut */
 645
 646 U8 *
 647 Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
 648 {
 649     U8 *d;
 650     U8 *start = s;
 651     U8 *send;
 652     I32 count = 0;
 653
 654     if (!*is_utf8)
 655         return start;
 656
 657     /* ensure valid UTF8 and chars < 256 before converting string */
 658     for (send = s + *len; s < send;) {
 659         U8 c = *s++;
 660         if (!UTF8_IS_INVARIANT(c)) {
 661             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 662                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 663                 count++;
 664             else
 665                 return start;
 666         }
 667     }
 668
 669     *is_utf8 = 0;
 670
 671     Newz(801, d, (*len) - count + 1, U8);
 672     s = start; start = d;
 673     while (s < send) {
 674         U8 c = *s++;
 675         if (!UTF8_IS_INVARIANT(c)) {
 676             /* Then it is two-byte encoded */
 677             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 678             c = ASCII_TO_NATIVE(c);
 679         }
 680         *d++ = c;
 681     }
 682     *d = '\0';
 683     *len = d - start;
 684     return start;
 685 }
 686
 687 /*
 688 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 689
 690 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 691 Returns a pointer to the newly-created string, and sets C<len> to
 692 reflect the new length.
 693
 694 =cut
 695 */
 696
 697 U8*
 698 Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
 699 {
 700     U8 *send;
 701     U8 *d;
 702     U8 *dst;
 703     send = s + (*len);
 704
 705     Newz(801, d, (*len) * 2 + 1, U8);
 706     dst = d;
 707
 708     while (s < send) {
 709         UV uv = NATIVE_TO_ASCII(*s++);
 710         if (UNI_IS_INVARIANT(uv))
 711             *d++ = UTF_TO_NATIVE(uv);
 712         else {
 713             *d++ = UTF8_EIGHT_BIT_HI(uv);
 714             *d++ = UTF8_EIGHT_BIT_LO(uv);
 715         }
 716     }
 717     *d = '\0';
 718     *len = d-dst;
 719     return dst;
 720 }
 721
 722 /*
 723  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 724  *
 725  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 726  * We optimize for native, for obvious reasons. */
 727
 728 U8*
 729 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 730 {
 731     U8* pend;
 732     U8* dstart = d;
 733
 734     if (bytelen & 1)
 735         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 736
 737     pend = p + bytelen;
 738
 739     while (p < pend) {
 740         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 741         p += 2;
 742         if (uv < 0x80) {
 743             *d++ = uv;
 744             continue;
 745         }
 746         if (uv < 0x800) {
 747             *d++ = (( uv >>  6)         | 0xc0);
 748             *d++ = (( uv        & 0x3f) | 0x80);
 749             continue;
 750         }
 751         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 752             UV low = *p++;
 753             if (low < 0xdc00 || low >= 0xdfff)
 754                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 755             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 756         }
 757         if (uv < 0x10000) {
 758             *d++ = (( uv >> 12)         | 0xe0);
 759             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 760             *d++ = (( uv        & 0x3f) | 0x80);
 761             continue;
 762         }
 763         else {
 764             *d++ = (( uv >> 18)         | 0xf0);
 765             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 766             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 767             *d++ = (( uv        & 0x3f) | 0x80);
 768             continue;
 769         }
 770     }
 771     *newlen = d - dstart;
 772     return d;
 773 }
 774
 775 /* Note: this one is slightly destructive of the source. */
 776
 777 U8*
 778 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 779 {
 780     U8* s = (U8*)p;
 781     U8* send = s + bytelen;
 782     while (s < send) {
 783         U8 tmp = s[0];
 784         s[0] = s[1];
 785         s[1] = tmp;
 786         s += 2;
 787     }
 788     return utf16_to_utf8(p, d, bytelen, newlen);
 789 }
 790
 791 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 792
 793 bool
 794 Perl_is_uni_alnum(pTHX_ U32 c)
 795 {
 796     U8 tmpbuf[UTF8_MAXLEN+1];
 797     uvchr_to_utf8(tmpbuf, (UV)c);
 798     return is_utf8_alnum(tmpbuf);
 799 }
 800
 801 bool
 802 Perl_is_uni_alnumc(pTHX_ U32 c)
 803 {
 804     U8 tmpbuf[UTF8_MAXLEN+1];
 805     uvchr_to_utf8(tmpbuf, (UV)c);
 806     return is_utf8_alnumc(tmpbuf);
 807 }
 808
 809 bool
 810 Perl_is_uni_idfirst(pTHX_ U32 c)
 811 {
 812     U8 tmpbuf[UTF8_MAXLEN+1];
 813     uvchr_to_utf8(tmpbuf, (UV)c);
 814     return is_utf8_idfirst(tmpbuf);
 815 }
 816
 817 bool
 818 Perl_is_uni_alpha(pTHX_ U32 c)
 819 {
 820     U8 tmpbuf[UTF8_MAXLEN+1];
 821     uvchr_to_utf8(tmpbuf, (UV)c);
 822     return is_utf8_alpha(tmpbuf);
 823 }
 824
 825 bool
 826 Perl_is_uni_ascii(pTHX_ U32 c)
 827 {
 828     U8 tmpbuf[UTF8_MAXLEN+1];
 829     uvchr_to_utf8(tmpbuf, (UV)c);
 830     return is_utf8_ascii(tmpbuf);
 831 }
 832
 833 bool
 834 Perl_is_uni_space(pTHX_ U32 c)
 835 {
 836     U8 tmpbuf[UTF8_MAXLEN+1];
 837     uvchr_to_utf8(tmpbuf, (UV)c);
 838     return is_utf8_space(tmpbuf);
 839 }
 840
 841 bool
 842 Perl_is_uni_digit(pTHX_ U32 c)
 843 {
 844     U8 tmpbuf[UTF8_MAXLEN+1];
 845     uvchr_to_utf8(tmpbuf, (UV)c);
 846     return is_utf8_digit(tmpbuf);
 847 }
 848
 849 bool
 850 Perl_is_uni_upper(pTHX_ U32 c)
 851 {
 852     U8 tmpbuf[UTF8_MAXLEN+1];
 853     uvchr_to_utf8(tmpbuf, (UV)c);
 854     return is_utf8_upper(tmpbuf);
 855 }
 856
 857 bool
 858 Perl_is_uni_lower(pTHX_ U32 c)
 859 {
 860     U8 tmpbuf[UTF8_MAXLEN+1];
 861     uvchr_to_utf8(tmpbuf, (UV)c);
 862     return is_utf8_lower(tmpbuf);
 863 }
 864
 865 bool
 866 Perl_is_uni_cntrl(pTHX_ U32 c)
 867 {
 868     U8 tmpbuf[UTF8_MAXLEN+1];
 869     uvchr_to_utf8(tmpbuf, (UV)c);
 870     return is_utf8_cntrl(tmpbuf);
 871 }
 872
 873 bool
 874 Perl_is_uni_graph(pTHX_ U32 c)
 875 {
 876     U8 tmpbuf[UTF8_MAXLEN+1];
 877     uvchr_to_utf8(tmpbuf, (UV)c);
 878     return is_utf8_graph(tmpbuf);
 879 }
 880
 881 bool
 882 Perl_is_uni_print(pTHX_ U32 c)
 883 {
 884     U8 tmpbuf[UTF8_MAXLEN+1];
 885     uvchr_to_utf8(tmpbuf, (UV)c);
 886     return is_utf8_print(tmpbuf);
 887 }
 888
 889 bool
 890 Perl_is_uni_punct(pTHX_ U32 c)
 891 {
 892     U8 tmpbuf[UTF8_MAXLEN+1];
 893     uvchr_to_utf8(tmpbuf, (UV)c);
 894     return is_utf8_punct(tmpbuf);
 895 }
 896
 897 bool
 898 Perl_is_uni_xdigit(pTHX_ U32 c)
 899 {
 900     U8 tmpbuf[UTF8_MAXLEN+1];
 901     uvchr_to_utf8(tmpbuf, (UV)c);
 902     return is_utf8_xdigit(tmpbuf);
 903 }
 904
 905 U32
 906 Perl_to_uni_upper(pTHX_ U32 c)
 907 {
 908     U8 tmpbuf[UTF8_MAXLEN+1];
 909     uvchr_to_utf8(tmpbuf, (UV)c);
 910     return to_utf8_upper(tmpbuf);
 911 }
 912
 913 U32
 914 Perl_to_uni_title(pTHX_ U32 c)
 915 {
 916     U8 tmpbuf[UTF8_MAXLEN+1];
 917     uvchr_to_utf8(tmpbuf, (UV)c);
 918     return to_utf8_title(tmpbuf);
 919 }
 920
 921 U32
 922 Perl_to_uni_lower(pTHX_ U32 c)
 923 {
 924     U8 tmpbuf[UTF8_MAXLEN+1];
 925     uvchr_to_utf8(tmpbuf, (UV)c);
 926     return to_utf8_lower(tmpbuf);
 927 }
 928
 929 /* for now these all assume no locale info available for Unicode > 255 */
 930
 931 bool
 932 Perl_is_uni_alnum_lc(pTHX_ U32 c)
 933 {
 934     return is_uni_alnum(c);     /* XXX no locale support yet */
 935 }
 936
 937 bool
 938 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
 939 {
 940     return is_uni_alnumc(c);    /* XXX no locale support yet */
 941 }
 942
 943 bool
 944 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
 945 {
 946     return is_uni_idfirst(c);   /* XXX no locale support yet */
 947 }
 948
 949 bool
 950 Perl_is_uni_alpha_lc(pTHX_ U32 c)
 951 {
 952     return is_uni_alpha(c);     /* XXX no locale support yet */
 953 }
 954
 955 bool
 956 Perl_is_uni_ascii_lc(pTHX_ U32 c)
 957 {
 958     return is_uni_ascii(c);     /* XXX no locale support yet */
 959 }
 960
 961 bool
 962 Perl_is_uni_space_lc(pTHX_ U32 c)
 963 {
 964     return is_uni_space(c);     /* XXX no locale support yet */
 965 }
 966
 967 bool
 968 Perl_is_uni_digit_lc(pTHX_ U32 c)
 969 {
 970     return is_uni_digit(c);     /* XXX no locale support yet */
 971 }
 972
 973 bool
 974 Perl_is_uni_upper_lc(pTHX_ U32 c)
 975 {
 976     return is_uni_upper(c);     /* XXX no locale support yet */
 977 }
 978
 979 bool
 980 Perl_is_uni_lower_lc(pTHX_ U32 c)
 981 {
 982     return is_uni_lower(c);     /* XXX no locale support yet */
 983 }
 984
 985 bool
 986 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
 987 {
 988     return is_uni_cntrl(c);     /* XXX no locale support yet */
 989 }
 990
 991 bool
 992 Perl_is_uni_graph_lc(pTHX_ U32 c)
 993 {
 994     return is_uni_graph(c);     /* XXX no locale support yet */
 995 }
 996
 997 bool
 998 Perl_is_uni_print_lc(pTHX_ U32 c)
 999 {
1000     return is_uni_print(c);     /* XXX no locale support yet */
1001 }
1002
1003 bool
1004 Perl_is_uni_punct_lc(pTHX_ U32 c)
1005 {
1006     return is_uni_punct(c);     /* XXX no locale support yet */
1007 }
1008
1009 bool
1010 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1011 {
1012     return is_uni_xdigit(c);    /* XXX no locale support yet */
1013 }
1014
1015 U32
1016 Perl_to_uni_upper_lc(pTHX_ U32 c)
1017 {
1018     return to_uni_upper(c);     /* XXX no locale support yet */
1019 }
1020
1021 U32
1022 Perl_to_uni_title_lc(pTHX_ U32 c)
1023 {
1024     return to_uni_title(c);     /* XXX no locale support yet */
1025 }
1026
1027 U32
1028 Perl_to_uni_lower_lc(pTHX_ U32 c)
1029 {
1030     return to_uni_lower(c);     /* XXX no locale support yet */
1031 }
1032
1033 bool
1034 Perl_is_utf8_alnum(pTHX_ U8 *p)
1035 {
1036     if (!is_utf8_char(p))
1037         return FALSE;
1038     if (!PL_utf8_alnum)
1039         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1040          * descendant of isalnum(3), in other words, it doesn't
1041          * contain the '_'. --jhi */
1042         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1043     return swash_fetch(PL_utf8_alnum, p);
1044 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1045 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1046     if (!PL_utf8_alnum)
1047         PL_utf8_alnum = swash_init("utf8", "",
1048             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1049     return swash_fetch(PL_utf8_alnum, p);
1050 #endif
1051 }
1052
1053 bool
1054 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1055 {
1056     if (!is_utf8_char(p))
1057         return FALSE;
1058     if (!PL_utf8_alnum)
1059         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1060     return swash_fetch(PL_utf8_alnum, p);
1061 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1062 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1063     if (!PL_utf8_alnum)
1064         PL_utf8_alnum = swash_init("utf8", "",
1065             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1066     return swash_fetch(PL_utf8_alnum, p);
1067 #endif
1068 }
1069
1070 bool
1071 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1072 {
1073     return *p == '_' || is_utf8_alpha(p);
1074 }
1075
1076 bool
1077 Perl_is_utf8_alpha(pTHX_ U8 *p)
1078 {
1079     if (!is_utf8_char(p))
1080         return FALSE;
1081     if (!PL_utf8_alpha)
1082         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1083     return swash_fetch(PL_utf8_alpha, p);
1084 }
1085
1086 bool
1087 Perl_is_utf8_ascii(pTHX_ U8 *p)
1088 {
1089     if (!is_utf8_char(p))
1090         return FALSE;
1091     if (!PL_utf8_ascii)
1092         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1093     return swash_fetch(PL_utf8_ascii, p);
1094 }
1095
1096 bool
1097 Perl_is_utf8_space(pTHX_ U8 *p)
1098 {
1099     if (!is_utf8_char(p))
1100         return FALSE;
1101     if (!PL_utf8_space)
1102         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1103     return swash_fetch(PL_utf8_space, p);
1104 }
1105
1106 bool
1107 Perl_is_utf8_digit(pTHX_ U8 *p)
1108 {
1109     if (!is_utf8_char(p))
1110         return FALSE;
1111     if (!PL_utf8_digit)
1112         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1113     return swash_fetch(PL_utf8_digit, p);
1114 }
1115
1116 bool
1117 Perl_is_utf8_upper(pTHX_ U8 *p)
1118 {
1119     if (!is_utf8_char(p))
1120         return FALSE;
1121     if (!PL_utf8_upper)
1122         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1123     return swash_fetch(PL_utf8_upper, p);
1124 }
1125
1126 bool
1127 Perl_is_utf8_lower(pTHX_ U8 *p)
1128 {
1129     if (!is_utf8_char(p))
1130         return FALSE;
1131     if (!PL_utf8_lower)
1132         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1133     return swash_fetch(PL_utf8_lower, p);
1134 }
1135
1136 bool
1137 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1138 {
1139     if (!is_utf8_char(p))
1140         return FALSE;
1141     if (!PL_utf8_cntrl)
1142         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1143     return swash_fetch(PL_utf8_cntrl, p);
1144 }
1145
1146 bool
1147 Perl_is_utf8_graph(pTHX_ U8 *p)
1148 {
1149     if (!is_utf8_char(p))
1150         return FALSE;
1151     if (!PL_utf8_graph)
1152         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1153     return swash_fetch(PL_utf8_graph, p);
1154 }
1155
1156 bool
1157 Perl_is_utf8_print(pTHX_ U8 *p)
1158 {
1159     if (!is_utf8_char(p))
1160         return FALSE;
1161     if (!PL_utf8_print)
1162         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1163     return swash_fetch(PL_utf8_print, p);
1164 }
1165
1166 bool
1167 Perl_is_utf8_punct(pTHX_ U8 *p)
1168 {
1169     if (!is_utf8_char(p))
1170         return FALSE;
1171     if (!PL_utf8_punct)
1172         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1173     return swash_fetch(PL_utf8_punct, p);
1174 }
1175
1176 bool
1177 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1178 {
1179     if (!is_utf8_char(p))
1180         return FALSE;
1181     if (!PL_utf8_xdigit)
1182         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1183     return swash_fetch(PL_utf8_xdigit, p);
1184 }
1185
1186 bool
1187 Perl_is_utf8_mark(pTHX_ U8 *p)
1188 {
1189     if (!is_utf8_char(p))
1190         return FALSE;
1191     if (!PL_utf8_mark)
1192         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1193     return swash_fetch(PL_utf8_mark, p);
1194 }
1195
1196 UV
1197 Perl_to_utf8_upper(pTHX_ U8 *p)
1198 {
1199     UV uv;
1200
1201     if (!PL_utf8_toupper)
1202         PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1203     uv = swash_fetch(PL_utf8_toupper, p);
1204     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1205 }
1206
1207 UV
1208 Perl_to_utf8_title(pTHX_ U8 *p)
1209 {
1210     UV uv;
1211
1212     if (!PL_utf8_totitle)
1213         PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1214     uv = swash_fetch(PL_utf8_totitle, p);
1215     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1216 }
1217
1218 UV
1219 Perl_to_utf8_lower(pTHX_ U8 *p)
1220 {
1221     UV uv;
1222
1223     if (!PL_utf8_tolower)
1224         PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1225     uv = swash_fetch(PL_utf8_tolower, p);
1226     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1227 }
1228
1229 /* a "swash" is a swatch hash */
1230
1231 SV*
1232 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1233 {
1234     SV* retval;
1235     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1236     dSP;
1237     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1238
1239     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1240         ENTER;
1241         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1242         LEAVE;
1243     }
1244     SPAGAIN;
1245     PUSHSTACKi(PERLSI_MAGIC);
1246     PUSHMARK(SP);
1247     EXTEND(SP,5);
1248     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1249     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1250     PUSHs(listsv);
1251     PUSHs(sv_2mortal(newSViv(minbits)));
1252     PUSHs(sv_2mortal(newSViv(none)));
1253     PUTBACK;
1254     ENTER;
1255     SAVEI32(PL_hints);
1256     PL_hints = 0;
1257     save_re_context();
1258     if (PL_curcop == &PL_compiling)
1259         /* XXX ought to be handled by lex_start */
1260         sv_setpv(tokenbufsv, PL_tokenbuf);
1261     if (call_method("SWASHNEW", G_SCALAR))
1262         retval = newSVsv(*PL_stack_sp--);
1263     else
1264         retval = &PL_sv_undef;
1265     LEAVE;
1266     POPSTACK;
1267     if (PL_curcop == &PL_compiling) {
1268         STRLEN len;
1269         char* pv = SvPV(tokenbufsv, len);
1270
1271         Copy(pv, PL_tokenbuf, len+1, char);
1272         PL_curcop->op_private = PL_hints;
1273     }
1274     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1275         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1276     return retval;
1277 }
1278
1279 UV
1280 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1281 {
1282     HV* hv = (HV*)SvRV(sv);
1283     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1284        then the "swatch" is a vec() for al the chars which start
1285        with 0xAA..0xYY
1286        So the key in the hash is length of encoded char -1
1287      */
1288     U32 klen = UTF8SKIP(ptr) - 1;
1289     U32 off = ptr[klen];
1290     STRLEN slen;
1291     STRLEN needents;
1292     U8 *tmps;
1293     U32 bit;
1294     SV *retval;
1295
1296     if (klen == 0)
1297      {
1298       /* If char in invariant then swatch is for all the invariant chars
1299        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1300        */
1301       needents = UTF_CONTINUATION_MARK;
1302       off      = NATIVE_TO_UTF(ptr[klen]);
1303      }
1304     else
1305      {
1306       /* If char is encoded then swatch is for the prefix */
1307       needents = (1 << UTF_ACCUMULATION_SHIFT);
1308       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1309      }
1310
1311     /*
1312      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1313      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1314      * it's nothing to sniff at.)  Pity we usually come through at least
1315      * two function calls to get here...
1316      *
1317      * NB: this code assumes that swatches are never modified, once generated!
1318      */
1319
1320     if (hv == PL_last_swash_hv &&
1321         klen == PL_last_swash_klen &&
1322         (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
1323     {
1324         tmps = PL_last_swash_tmps;
1325         slen = PL_last_swash_slen;
1326     }
1327     else {
1328         /* Try our second-level swatch cache, kept in a hash. */
1329         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1330
1331         /* If not cached, generate it via utf8::SWASHGET */
1332         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1333             dSP;
1334             /* We use utf8n_to_uvuni() as we want an index into
1335                Unicode tables, not a native character number.
1336              */
1337             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1338             ENTER;
1339             SAVETMPS;
1340             save_re_context();
1341             PUSHSTACKi(PERLSI_MAGIC);
1342             PUSHMARK(SP);
1343             EXTEND(SP,3);
1344             PUSHs((SV*)sv);
1345             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1346             PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
1347             PUSHs(sv_2mortal(newSViv(needents)));
1348             PUTBACK;
1349             if (call_method("SWASHGET", G_SCALAR))
1350                 retval = newSVsv(*PL_stack_sp--);
1351             else
1352                 retval = &PL_sv_undef;
1353             POPSTACK;
1354             FREETMPS;
1355             LEAVE;
1356             if (PL_curcop == &PL_compiling)
1357                 PL_curcop->op_private = PL_hints;
1358
1359             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1360
1361             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1362                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1363         }
1364
1365         PL_last_swash_hv = hv;
1366         PL_last_swash_klen = klen;
1367         PL_last_swash_tmps = tmps;
1368         PL_last_swash_slen = slen;
1369         if (klen)
1370             Copy(ptr, PL_last_swash_key, klen, U8);
1371     }
1372
1373     switch ((int)((slen << 3) / needents)) {
1374     case 1:
1375         bit = 1 << (off & 7);
1376         off >>= 3;
1377         return (tmps[off] & bit) != 0;
1378     case 8:
1379         return tmps[off];
1380     case 16:
1381         off <<= 1;
1382         return (tmps[off] << 8) + tmps[off + 1] ;
1383     case 32:
1384         off <<= 2;
1385         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1386     }
1387     Perl_croak(aTHX_ "panic: swash_fetch");
1388     return 0;
1389 }
1390
1391
1392 /*
1393 =for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
1394
1395 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1396 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1397 bytes available. The return value is the pointer to the byte after the
1398 end of the new character. In other words,
1399
1400     d = uvchr_to_utf8(d, uv);
1401
1402 is the recommended wide native character-aware way of saying
1403
1404     *(d++) = uv;
1405
1406 =cut
1407 */
1408
1409 /* On ASCII machines this is normally a macro but we want a
1410    real function in case XS code wants it
1411 */
1412 #undef Perl_uvchr_to_utf8
1413 U8 *
1414 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1415 {
1416     return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1417 }
1418
1419
1420 /*
1421 =for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
1422
1423 Returns the native character value of the first character in the string C<s>
1424 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1425 length, in bytes, of that character.
1426
1427 Allows length and flags to be passed to low level routine.
1428
1429 =cut
1430 */
1431 /* On ASCII machines this is normally a macro but we want a
1432    real function in case XS code wants it
1433 */
1434 #undef Perl_utf8n_to_uvchr
1435 UV
1436 Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
1437 {
1438     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1439     return UNI_TO_NATIVE(uv);
1440 }
1441
1442