utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (uv < 0x80) {
  50         *d++ = uv;
  51         return d;
  52     }
  53     if (uv < 0x800) {
  54         *d++ = (( uv >>  6)         | 0xc0);
  55         *d++ = (( uv        & 0x3f) | 0x80);
  56         return d;
  57     }
  58     if (uv < 0x10000) {
  59         *d++ = (( uv >> 12)         | 0xe0);
  60         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  61         *d++ = (( uv        & 0x3f) | 0x80);
  62         return d;
  63     }
  64     if (uv < 0x200000) {
  65         *d++ = (( uv >> 18)         | 0xf0);
  66         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  67         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  68         *d++ = (( uv        & 0x3f) | 0x80);
  69         return d;
  70     }
  71     if (uv < 0x4000000) {
  72         *d++ = (( uv >> 24)         | 0xf8);
  73         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  74         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  75         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  76         *d++ = (( uv        & 0x3f) | 0x80);
  77         return d;
  78     }
  79     if (uv < 0x80000000) {
  80         *d++ = (( uv >> 30)         | 0xfc);
  81         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  82         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  83         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  84         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  85         *d++ = (( uv        & 0x3f) | 0x80);
  86         return d;
  87     }
  88 #ifdef HAS_QUAD
  89     if (uv < UTF8_QUAD_MAX)
  90 #endif
  91     {
  92         *d++ =                        0xfe;     /* Can't match U+FEFF! */
  93         *d++ = (((uv >> 30) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  95         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  96         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  97         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  98         *d++ = (( uv        & 0x3f) | 0x80);
  99         return d;
 100     }
 101 #ifdef HAS_QUAD
 102     {
 103         *d++ =                        0xff;     /* Can't match U+FFFE! */
 104         *d++ =                        0x80;     /* 6 Reserved bits */
 105         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 106         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 107         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 108         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 109         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 110         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 111         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 112         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 113         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 114         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 115         *d++ = (( uv        & 0x3f) | 0x80);
 116         return d;
 117     }
 118 #endif
 119 }
 120
 121 /*
 122 =for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
 123
 124 Adds the UTF8 representation of the Native codepoint C<uv> to the end
 125 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
 126 bytes available. The return value is the pointer to the byte after the
 127 end of the new character. In other words,
 128
 129     d = uvchr_to_utf8(d, uv);
 130
 131 is the recommended wide native character-aware way of saying
 132
 133     *(d++) = uv;
 134
 135 =cut
 136 */
 137
 138 U8 *
 139 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 140 {
 141     if (uv < 0x100)
 142         uv = NATIVE_TO_ASCII(uv);
 143     return Perl_uvuni_to_utf8(aTHX_ d, uv);
 144 }
 145
 146
 147 /*
 148 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 149
 150 Tests if some arbitrary number of bytes begins in a valid UTF-8
 151 character.  Note that an ASCII character is a valid UTF-8 character.
 152 The actual number of bytes in the UTF-8 character will be returned if
 153 it is valid, otherwise 0.
 154
 155 =cut */
 156 STRLEN
 157 Perl_is_utf8_char(pTHX_ U8 *s)
 158 {
 159     U8 u = *s;
 160     STRLEN slen, len;
 161     UV uv, ouv;
 162
 163     if (UTF8_IS_ASCII(u))
 164         return 1;
 165
 166     if (!UTF8_IS_START(u))
 167         return 0;
 168
 169     len = UTF8SKIP(s);
 170
 171     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 172         return 0;
 173
 174     slen = len - 1;
 175     s++;
 176     uv = u;
 177     ouv = uv;
 178     while (slen--) {
 179         if (!UTF8_IS_CONTINUATION(*s))
 180             return 0;
 181         uv = UTF8_ACCUMULATE(uv, *s);
 182         if (uv < ouv)
 183             return 0;
 184         ouv = uv;
 185         s++;
 186     }
 187
 188     if (UNISKIP(uv) < len)
 189         return 0;
 190
 191     return len;
 192 }
 193
 194 /*
 195 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 196
 197 Returns true if first C<len> bytes of the given string form a valid UTF8
 198 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 199 'a string that contains UTF8' because a valid ASCII string is a valid
 200 UTF8 string.
 201
 202 =cut
 203 */
 204
 205 bool
 206 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 207 {
 208     U8* x = s;
 209     U8* send;
 210     STRLEN c;
 211
 212     if (!len)
 213         len = strlen((char *)s);
 214     send = s + len;
 215
 216     while (x < send) {
 217         c = is_utf8_char(x);
 218         if (!c)
 219             return FALSE;
 220         x += c;
 221     }
 222     if (x != send)
 223         return FALSE;
 224
 225     return TRUE;
 226 }
 227
 228 /*
 229 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 230
 231 Bottom level UTF-8 decode routine.
 232 Returns the unicode code point value of the first character in the string C<s>
 233 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 234 C<retlen> will be set to the length, in bytes, of that character.
 235
 236 If C<s> does not point to a well-formed UTF8 character, the behaviour
 237 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 238 it is assumed that the caller will raise a warning, and this function
 239 will silently just set C<retlen> to C<-1> and return zero.  If the
 240 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 241 malformations will be given, C<retlen> will be set to the expected
 242 length of the UTF-8 character in bytes, and zero will be returned.
 243
 244 The C<flags> can also contain various flags to allow deviations from
 245 the strict UTF-8 encoding (see F<utf8.h>).
 246
 247 Most code should use utf8_to_uvchr() rather than call this directly.
 248
 249 =cut */
 250
 251 UV
 252 Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 253 {
 254     UV uv = *s, ouv;
 255     STRLEN len = 1;
 256     bool dowarn = ckWARN_d(WARN_UTF8);
 257     STRLEN expectlen = 0;
 258     U32 warning = 0;
 259
 260 /* This list is a superset of the UTF8_ALLOW_XXX. */
 261
 262 #define UTF8_WARN_EMPTY                          1
 263 #define UTF8_WARN_CONTINUATION                   2
 264 #define UTF8_WARN_NON_CONTINUATION               3
 265 #define UTF8_WARN_FE_FF                          4
 266 #define UTF8_WARN_SHORT                          5
 267 #define UTF8_WARN_OVERFLOW                       6
 268 #define UTF8_WARN_SURROGATE                      7
 269 #define UTF8_WARN_BOM                            8
 270 #define UTF8_WARN_LONG                           9
 271 #define UTF8_WARN_FFFF                          10
 272
 273     if (curlen == 0 &&
 274         !(flags & UTF8_ALLOW_EMPTY)) {
 275         warning = UTF8_WARN_EMPTY;
 276         goto malformed;
 277     }
 278
 279     if (UTF8_IS_ASCII(uv)) {
 280         if (retlen)
 281             *retlen = 1;
 282         return (UV) (*s);
 283     }
 284
 285     if (UTF8_IS_CONTINUATION(uv) &&
 286         !(flags & UTF8_ALLOW_CONTINUATION)) {
 287         warning = UTF8_WARN_CONTINUATION;
 288         goto malformed;
 289     }
 290
 291     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 292         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 293         warning = UTF8_WARN_NON_CONTINUATION;
 294         goto malformed;
 295     }
 296
 297     if ((uv == 0xfe || uv == 0xff) &&
 298         !(flags & UTF8_ALLOW_FE_FF)) {
 299         warning = UTF8_WARN_FE_FF;
 300         goto malformed;
 301     }
 302
 303     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 304     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 305     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 306     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 307     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 308     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 309     else                        { len = 13; uv = 0; } /* whoa! */
 310
 311     if (retlen)
 312         *retlen = len;
 313
 314     expectlen = len;
 315
 316     if ((curlen < expectlen) &&
 317         !(flags & UTF8_ALLOW_SHORT)) {
 318         warning = UTF8_WARN_SHORT;
 319         goto malformed;
 320     }
 321
 322     len--;
 323     s++;
 324     ouv = uv;
 325
 326     while (len--) {
 327         if (!UTF8_IS_CONTINUATION(*s) &&
 328             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 329             s--;
 330             warning = UTF8_WARN_NON_CONTINUATION;
 331             goto malformed;
 332         }
 333         else
 334             uv = UTF8_ACCUMULATE(uv, *s);
 335         if (!(uv > ouv)) {
 336             /* These cannot be allowed. */
 337             if (uv == ouv) {
 338                 if (!(flags & UTF8_ALLOW_LONG)) {
 339                     warning = UTF8_WARN_LONG;
 340                     goto malformed;
 341                 }
 342             }
 343             else { /* uv < ouv */
 344                 /* This cannot be allowed. */
 345                 warning = UTF8_WARN_OVERFLOW;
 346                 goto malformed;
 347             }
 348         }
 349         s++;
 350         ouv = uv;
 351     }
 352
 353     if (UNICODE_IS_SURROGATE(uv) &&
 354         !(flags & UTF8_ALLOW_SURROGATE)) {
 355         warning = UTF8_WARN_SURROGATE;
 356         goto malformed;
 357     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 358                !(flags & UTF8_ALLOW_BOM)) {
 359         warning = UTF8_WARN_BOM;
 360         goto malformed;
 361     } else if ((expectlen > UNISKIP(uv)) &&
 362                !(flags & UTF8_ALLOW_LONG)) {
 363         warning = UTF8_WARN_LONG;
 364         goto malformed;
 365     } else if (UNICODE_IS_ILLEGAL(uv) &&
 366                !(flags & UTF8_ALLOW_FFFF)) {
 367         warning = UTF8_WARN_FFFF;
 368         goto malformed;
 369     }
 370
 371     return uv;
 372
 373 malformed:
 374
 375     if (flags & UTF8_CHECK_ONLY) {
 376         if (retlen)
 377             *retlen = -1;
 378         return 0;
 379     }
 380
 381     if (dowarn) {
 382         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 383
 384         switch (warning) {
 385         case 0: /* Intentionally empty. */ break;
 386         case UTF8_WARN_EMPTY:
 387             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 388             break;
 389         case UTF8_WARN_CONTINUATION:
 390             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 391             break;
 392         case UTF8_WARN_NON_CONTINUATION:
 393             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 394                            (UV)s[1], uv);
 395             break;
 396         case UTF8_WARN_FE_FF:
 397             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 398             break;
 399         case UTF8_WARN_SHORT:
 400             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 401                            curlen, curlen == 1 ? "" : "s", expectlen);
 402             break;
 403         case UTF8_WARN_OVERFLOW:
 404             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 405                            ouv, *s);
 406             break;
 407         case UTF8_WARN_SURROGATE:
 408             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 409             break;
 410         case UTF8_WARN_BOM:
 411             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 412             break;
 413         case UTF8_WARN_LONG:
 414             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 415                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 416             break;
 417         case UTF8_WARN_FFFF:
 418             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 419             break;
 420         default:
 421             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 422             break;
 423         }
 424
 425         if (warning) {
 426             char *s = SvPVX(sv);
 427
 428             if (PL_op)
 429                 Perl_warner(aTHX_ WARN_UTF8,
 430                             "%s in %s", s,  PL_op_desc[PL_op->op_type]);
 431             else
 432                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 433         }
 434     }
 435
 436     if (retlen)
 437         *retlen = expectlen ? expectlen : len;
 438
 439     return 0;
 440 }
 441
 442 /*
 443 =for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
 444
 445 Returns the native character value of the first character in the string C<s>
 446 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 447 length, in bytes, of that character.
 448
 449 Allows length and flags to be passed to low level routine.
 450
 451 =cut
 452 */
 453
 454 UV
 455 Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 456 {
 457     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
 458     if (uv < 0x100)
 459         return (UV) ASCII_TO_NATIVE(uv);
 460     return uv;
 461 }
 462
 463 /*
 464 =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
 465
 466 Returns the native character value of the first character in the string C<s>
 467 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 468 length, in bytes, of that character.
 469
 470 If C<s> does not point to a well-formed UTF8 character, zero is
 471 returned and retlen is set, if possible, to -1.
 472
 473 =cut
 474 */
 475
 476 UV
 477 Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
 478 {
 479     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 480 }
 481
 482 /*
 483 =for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
 484
 485 Returns the Unicode code point of the first character in the string C<s>
 486 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 487 length, in bytes, of that character.
 488
 489 This function should only be used when returned UV is considered
 490 an index into the Unicode semantic tables (e.g. swashes).
 491
 492 If C<s> does not point to a well-formed UTF8 character, zero is
 493 returned and retlen is set, if possible, to -1.
 494
 495 =cut
 496 */
 497
 498 UV
 499 Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
 500 {
 501     /* Call the low level routine asking for checks */
 502     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 503 }
 504
 505 /*
 506 =for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
 507
 508 Return the length of the UTF-8 char encoded string C<s> in characters.
 509 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 510 up past C<e>, croaks.
 511
 512 =cut
 513 */
 514
 515 STRLEN
 516 Perl_utf8_length(pTHX_ U8* s, U8* e)
 517 {
 518     STRLEN len = 0;
 519
 520     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 521      * the bitops (especially ~) can create illegal UTF-8.
 522      * In other words: in Perl UTF-8 is not just for Unicode. */
 523
 524     if (e < s)
 525         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 526     while (s < e) {
 527         U8 t = UTF8SKIP(s);
 528
 529         if (e - s < t)
 530             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 531         s += t;
 532         len++;
 533     }
 534
 535     return len;
 536 }
 537
 538 /*
 539 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 540
 541 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 542 and C<b>.
 543
 544 WARNING: use only if you *know* that the pointers point inside the
 545 same UTF-8 buffer.
 546
 547 =cut */
 548
 549 IV
 550 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 551 {
 552     IV off = 0;
 553
 554     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 555      * the bitops (especially ~) can create illegal UTF-8.
 556      * In other words: in Perl UTF-8 is not just for Unicode. */
 557
 558     if (a < b) {
 559         while (a < b) {
 560             U8 c = UTF8SKIP(a);
 561
 562             if (b - a < c)
 563                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 564             a += c;
 565             off--;
 566         }
 567     }
 568     else {
 569         while (b < a) {
 570             U8 c = UTF8SKIP(b);
 571
 572             if (a - b < c)
 573                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 574             b += c;
 575             off++;
 576         }
 577     }
 578
 579     return off;
 580 }
 581
 582 /*
 583 =for apidoc A|U8*|utf8_hop|U8 *s|I32 off
 584
 585 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 586 forward or backward.
 587
 588 WARNING: do not use the following unless you *know* C<off> is within
 589 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 590 on the first byte of character or just after the last byte of a character.
 591
 592 =cut */
 593
 594 U8 *
 595 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 596 {
 597     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 598      * the bitops (especially ~) can create illegal UTF-8.
 599      * In other words: in Perl UTF-8 is not just for Unicode. */
 600
 601     if (off >= 0) {
 602         while (off--)
 603             s += UTF8SKIP(s);
 604     }
 605     else {
 606         while (off++) {
 607             s--;
 608             while (UTF8_IS_CONTINUATION(*s))
 609                 s--;
 610         }
 611     }
 612     return s;
 613 }
 614
 615 /*
 616 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 617
 618 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 619 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 620 updates len to contain the new length.
 621 Returns zero on failure, setting C<len> to -1.
 622
 623 =cut
 624 */
 625
 626 U8 *
 627 Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 628 {
 629     U8 *send;
 630     U8 *d;
 631     U8 *save = s;
 632
 633     /* ensure valid UTF8 and chars < 256 before updating string */
 634     for (send = s + *len; s < send; ) {
 635         U8 c = *s++;
 636
 637         if (c >= 0x80 &&
 638             ((s >= send) ||
 639              ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
 640             *len = -1;
 641             return 0;
 642         }
 643     }
 644
 645     d = s = save;
 646     while (s < send) {
 647         STRLEN ulen;
 648         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 649         s += ulen;
 650     }
 651     *d = '\0';
 652     *len = d - save;
 653     return save;
 654 }
 655
 656 /*
 657 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 658
 659 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 660 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 661 the newly-created string, and updates C<len> to contain the new
 662 length.  Returns the original string if no conversion occurs, C<len>
 663 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 664 0 if C<s> is converted or contains all 7bit characters.
 665
 666 =cut */
 667
 668 U8 *
 669 Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
 670 {
 671     U8 *d;
 672     U8 *start = s;
 673     U8 *send;
 674     I32 count = 0;
 675
 676     if (!*is_utf8)
 677         return start;
 678
 679     /* ensure valid UTF8 and chars < 256 before converting string */
 680     for (send = s + *len; s < send;) {
 681         U8 c = *s++;
 682         if (!UTF8_IS_ASCII(c)) {
 683             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 684                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 685                 count++;
 686             else
 687                 return start;
 688         }
 689     }
 690
 691     *is_utf8 = 0;
 692
 693 #ifndef EBCDIC
 694     /* Can use as-is if no high chars */
 695     if (!count)
 696         return start;
 697 #endif
 698
 699     Newz(801, d, (*len) - count + 1, U8);
 700     s = start; start = d;
 701     while (s < send) {
 702         U8 c = *s++;
 703         if (!UTF8_IS_ASCII(c))
 704             c = UTF8_ACCUMULATE(c, *s++);
 705         *d++ = ASCII_TO_NATIVE(c);
 706     }
 707     *d = '\0';
 708     *len = d - start;
 709     return start;
 710 }
 711
 712 /*
 713 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 714
 715 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 716 Returns a pointer to the newly-created string, and sets C<len> to
 717 reflect the new length.
 718
 719 =cut
 720 */
 721
 722 U8*
 723 Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
 724 {
 725     U8 *send;
 726     U8 *d;
 727     U8 *dst;
 728     send = s + (*len);
 729
 730     Newz(801, d, (*len) * 2 + 1, U8);
 731     dst = d;
 732
 733     while (s < send) {
 734         UV uv = NATIVE_TO_ASCII(*s++);
 735         if (UTF8_IS_ASCII(uv))
 736             *d++ = uv;
 737         else {
 738             *d++ = UTF8_EIGHT_BIT_HI(uv);
 739             *d++ = UTF8_EIGHT_BIT_LO(uv);
 740         }
 741     }
 742     *d = '\0';
 743     *len = d-dst;
 744     return dst;
 745 }
 746
 747 /*
 748  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 749  *
 750  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 751  * We optimize for native, for obvious reasons. */
 752
 753 U8*
 754 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 755 {
 756     U8* pend;
 757     U8* dstart = d;
 758
 759     if (bytelen & 1)
 760         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 761
 762     pend = p + bytelen;
 763
 764     while (p < pend) {
 765         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 766         p += 2;
 767         if (uv < 0x80) {
 768             *d++ = uv;
 769             continue;
 770         }
 771         if (uv < 0x800) {
 772             *d++ = (( uv >>  6)         | 0xc0);
 773             *d++ = (( uv        & 0x3f) | 0x80);
 774             continue;
 775         }
 776         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 777             UV low = *p++;
 778             if (low < 0xdc00 || low >= 0xdfff)
 779                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 780             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 781         }
 782         if (uv < 0x10000) {
 783             *d++ = (( uv >> 12)         | 0xe0);
 784             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 785             *d++ = (( uv        & 0x3f) | 0x80);
 786             continue;
 787         }
 788         else {
 789             *d++ = (( uv >> 18)         | 0xf0);
 790             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 791             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 792             *d++ = (( uv        & 0x3f) | 0x80);
 793             continue;
 794         }
 795     }
 796     *newlen = d - dstart;
 797     return d;
 798 }
 799
 800 /* Note: this one is slightly destructive of the source. */
 801
 802 U8*
 803 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 804 {
 805     U8* s = (U8*)p;
 806     U8* send = s + bytelen;
 807     while (s < send) {
 808         U8 tmp = s[0];
 809         s[0] = s[1];
 810         s[1] = tmp;
 811         s += 2;
 812     }
 813     return utf16_to_utf8(p, d, bytelen, newlen);
 814 }
 815
 816 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 817
 818 bool
 819 Perl_is_uni_alnum(pTHX_ U32 c)
 820 {
 821     U8 tmpbuf[UTF8_MAXLEN+1];
 822     uvuni_to_utf8(tmpbuf, (UV)c);
 823     return is_utf8_alnum(tmpbuf);
 824 }
 825
 826 bool
 827 Perl_is_uni_alnumc(pTHX_ U32 c)
 828 {
 829     U8 tmpbuf[UTF8_MAXLEN+1];
 830     uvuni_to_utf8(tmpbuf, (UV)c);
 831     return is_utf8_alnumc(tmpbuf);
 832 }
 833
 834 bool
 835 Perl_is_uni_idfirst(pTHX_ U32 c)
 836 {
 837     U8 tmpbuf[UTF8_MAXLEN+1];
 838     uvuni_to_utf8(tmpbuf, (UV)c);
 839     return is_utf8_idfirst(tmpbuf);
 840 }
 841
 842 bool
 843 Perl_is_uni_alpha(pTHX_ U32 c)
 844 {
 845     U8 tmpbuf[UTF8_MAXLEN+1];
 846     uvuni_to_utf8(tmpbuf, (UV)c);
 847     return is_utf8_alpha(tmpbuf);
 848 }
 849
 850 bool
 851 Perl_is_uni_ascii(pTHX_ U32 c)
 852 {
 853     U8 tmpbuf[UTF8_MAXLEN+1];
 854     uvuni_to_utf8(tmpbuf, (UV)c);
 855     return is_utf8_ascii(tmpbuf);
 856 }
 857
 858 bool
 859 Perl_is_uni_space(pTHX_ U32 c)
 860 {
 861     U8 tmpbuf[UTF8_MAXLEN+1];
 862     uvuni_to_utf8(tmpbuf, (UV)c);
 863     return is_utf8_space(tmpbuf);
 864 }
 865
 866 bool
 867 Perl_is_uni_digit(pTHX_ U32 c)
 868 {
 869     U8 tmpbuf[UTF8_MAXLEN+1];
 870     uvuni_to_utf8(tmpbuf, (UV)c);
 871     return is_utf8_digit(tmpbuf);
 872 }
 873
 874 bool
 875 Perl_is_uni_upper(pTHX_ U32 c)
 876 {
 877     U8 tmpbuf[UTF8_MAXLEN+1];
 878     uvuni_to_utf8(tmpbuf, (UV)c);
 879     return is_utf8_upper(tmpbuf);
 880 }
 881
 882 bool
 883 Perl_is_uni_lower(pTHX_ U32 c)
 884 {
 885     U8 tmpbuf[UTF8_MAXLEN+1];
 886     uvuni_to_utf8(tmpbuf, (UV)c);
 887     return is_utf8_lower(tmpbuf);
 888 }
 889
 890 bool
 891 Perl_is_uni_cntrl(pTHX_ U32 c)
 892 {
 893     U8 tmpbuf[UTF8_MAXLEN+1];
 894     uvuni_to_utf8(tmpbuf, (UV)c);
 895     return is_utf8_cntrl(tmpbuf);
 896 }
 897
 898 bool
 899 Perl_is_uni_graph(pTHX_ U32 c)
 900 {
 901     U8 tmpbuf[UTF8_MAXLEN+1];
 902     uvuni_to_utf8(tmpbuf, (UV)c);
 903     return is_utf8_graph(tmpbuf);
 904 }
 905
 906 bool
 907 Perl_is_uni_print(pTHX_ U32 c)
 908 {
 909     U8 tmpbuf[UTF8_MAXLEN+1];
 910     uvuni_to_utf8(tmpbuf, (UV)c);
 911     return is_utf8_print(tmpbuf);
 912 }
 913
 914 bool
 915 Perl_is_uni_punct(pTHX_ U32 c)
 916 {
 917     U8 tmpbuf[UTF8_MAXLEN+1];
 918     uvuni_to_utf8(tmpbuf, (UV)c);
 919     return is_utf8_punct(tmpbuf);
 920 }
 921
 922 bool
 923 Perl_is_uni_xdigit(pTHX_ U32 c)
 924 {
 925     U8 tmpbuf[UTF8_MAXLEN+1];
 926     uvuni_to_utf8(tmpbuf, (UV)c);
 927     return is_utf8_xdigit(tmpbuf);
 928 }
 929
 930 U32
 931 Perl_to_uni_upper(pTHX_ U32 c)
 932 {
 933     U8 tmpbuf[UTF8_MAXLEN+1];
 934     uvuni_to_utf8(tmpbuf, (UV)c);
 935     return to_utf8_upper(tmpbuf);
 936 }
 937
 938 U32
 939 Perl_to_uni_title(pTHX_ U32 c)
 940 {
 941     U8 tmpbuf[UTF8_MAXLEN+1];
 942     uvuni_to_utf8(tmpbuf, (UV)c);
 943     return to_utf8_title(tmpbuf);
 944 }
 945
 946 U32
 947 Perl_to_uni_lower(pTHX_ U32 c)
 948 {
 949     U8 tmpbuf[UTF8_MAXLEN+1];
 950     uvuni_to_utf8(tmpbuf, (UV)c);
 951     return to_utf8_lower(tmpbuf);
 952 }
 953
 954 /* for now these all assume no locale info available for Unicode > 255 */
 955
 956 bool
 957 Perl_is_uni_alnum_lc(pTHX_ U32 c)
 958 {
 959     return is_uni_alnum(c);     /* XXX no locale support yet */
 960 }
 961
 962 bool
 963 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
 964 {
 965     return is_uni_alnumc(c);    /* XXX no locale support yet */
 966 }
 967
 968 bool
 969 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
 970 {
 971     return is_uni_idfirst(c);   /* XXX no locale support yet */
 972 }
 973
 974 bool
 975 Perl_is_uni_alpha_lc(pTHX_ U32 c)
 976 {
 977     return is_uni_alpha(c);     /* XXX no locale support yet */
 978 }
 979
 980 bool
 981 Perl_is_uni_ascii_lc(pTHX_ U32 c)
 982 {
 983     return is_uni_ascii(c);     /* XXX no locale support yet */
 984 }
 985
 986 bool
 987 Perl_is_uni_space_lc(pTHX_ U32 c)
 988 {
 989     return is_uni_space(c);     /* XXX no locale support yet */
 990 }
 991
 992 bool
 993 Perl_is_uni_digit_lc(pTHX_ U32 c)
 994 {
 995     return is_uni_digit(c);     /* XXX no locale support yet */
 996 }
 997
 998 bool
 999 Perl_is_uni_upper_lc(pTHX_ U32 c)
1000 {
1001     return is_uni_upper(c);     /* XXX no locale support yet */
1002 }
1003
1004 bool
1005 Perl_is_uni_lower_lc(pTHX_ U32 c)
1006 {
1007     return is_uni_lower(c);     /* XXX no locale support yet */
1008 }
1009
1010 bool
1011 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1012 {
1013     return is_uni_cntrl(c);     /* XXX no locale support yet */
1014 }
1015
1016 bool
1017 Perl_is_uni_graph_lc(pTHX_ U32 c)
1018 {
1019     return is_uni_graph(c);     /* XXX no locale support yet */
1020 }
1021
1022 bool
1023 Perl_is_uni_print_lc(pTHX_ U32 c)
1024 {
1025     return is_uni_print(c);     /* XXX no locale support yet */
1026 }
1027
1028 bool
1029 Perl_is_uni_punct_lc(pTHX_ U32 c)
1030 {
1031     return is_uni_punct(c);     /* XXX no locale support yet */
1032 }
1033
1034 bool
1035 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1036 {
1037     return is_uni_xdigit(c);    /* XXX no locale support yet */
1038 }
1039
1040 U32
1041 Perl_to_uni_upper_lc(pTHX_ U32 c)
1042 {
1043     return to_uni_upper(c);     /* XXX no locale support yet */
1044 }
1045
1046 U32
1047 Perl_to_uni_title_lc(pTHX_ U32 c)
1048 {
1049     return to_uni_title(c);     /* XXX no locale support yet */
1050 }
1051
1052 U32
1053 Perl_to_uni_lower_lc(pTHX_ U32 c)
1054 {
1055     return to_uni_lower(c);     /* XXX no locale support yet */
1056 }
1057
1058 bool
1059 Perl_is_utf8_alnum(pTHX_ U8 *p)
1060 {
1061     if (!is_utf8_char(p))
1062         return FALSE;
1063     if (!PL_utf8_alnum)
1064         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1065          * descendant of isalnum(3), in other words, it doesn't
1066          * contain the '_'. --jhi */
1067         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1068     return swash_fetch(PL_utf8_alnum, p);
1069 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1070 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1071     if (!PL_utf8_alnum)
1072         PL_utf8_alnum = swash_init("utf8", "",
1073             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1074     return swash_fetch(PL_utf8_alnum, p);
1075 #endif
1076 }
1077
1078 bool
1079 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1080 {
1081     if (!is_utf8_char(p))
1082         return FALSE;
1083     if (!PL_utf8_alnum)
1084         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1085     return swash_fetch(PL_utf8_alnum, p);
1086 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1087 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1088     if (!PL_utf8_alnum)
1089         PL_utf8_alnum = swash_init("utf8", "",
1090             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1091     return swash_fetch(PL_utf8_alnum, p);
1092 #endif
1093 }
1094
1095 bool
1096 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1097 {
1098     return *p == '_' || is_utf8_alpha(p);
1099 }
1100
1101 bool
1102 Perl_is_utf8_alpha(pTHX_ U8 *p)
1103 {
1104     if (!is_utf8_char(p))
1105         return FALSE;
1106     if (!PL_utf8_alpha)
1107         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1108     return swash_fetch(PL_utf8_alpha, p);
1109 }
1110
1111 bool
1112 Perl_is_utf8_ascii(pTHX_ U8 *p)
1113 {
1114     if (!is_utf8_char(p))
1115         return FALSE;
1116     if (!PL_utf8_ascii)
1117         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1118     return swash_fetch(PL_utf8_ascii, p);
1119 }
1120
1121 bool
1122 Perl_is_utf8_space(pTHX_ U8 *p)
1123 {
1124     if (!is_utf8_char(p))
1125         return FALSE;
1126     if (!PL_utf8_space)
1127         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1128     return swash_fetch(PL_utf8_space, p);
1129 }
1130
1131 bool
1132 Perl_is_utf8_digit(pTHX_ U8 *p)
1133 {
1134     if (!is_utf8_char(p))
1135         return FALSE;
1136     if (!PL_utf8_digit)
1137         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1138     return swash_fetch(PL_utf8_digit, p);
1139 }
1140
1141 bool
1142 Perl_is_utf8_upper(pTHX_ U8 *p)
1143 {
1144     if (!is_utf8_char(p))
1145         return FALSE;
1146     if (!PL_utf8_upper)
1147         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1148     return swash_fetch(PL_utf8_upper, p);
1149 }
1150
1151 bool
1152 Perl_is_utf8_lower(pTHX_ U8 *p)
1153 {
1154     if (!is_utf8_char(p))
1155         return FALSE;
1156     if (!PL_utf8_lower)
1157         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1158     return swash_fetch(PL_utf8_lower, p);
1159 }
1160
1161 bool
1162 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1163 {
1164     if (!is_utf8_char(p))
1165         return FALSE;
1166     if (!PL_utf8_cntrl)
1167         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1168     return swash_fetch(PL_utf8_cntrl, p);
1169 }
1170
1171 bool
1172 Perl_is_utf8_graph(pTHX_ U8 *p)
1173 {
1174     if (!is_utf8_char(p))
1175         return FALSE;
1176     if (!PL_utf8_graph)
1177         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1178     return swash_fetch(PL_utf8_graph, p);
1179 }
1180
1181 bool
1182 Perl_is_utf8_print(pTHX_ U8 *p)
1183 {
1184     if (!is_utf8_char(p))
1185         return FALSE;
1186     if (!PL_utf8_print)
1187         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1188     return swash_fetch(PL_utf8_print, p);
1189 }
1190
1191 bool
1192 Perl_is_utf8_punct(pTHX_ U8 *p)
1193 {
1194     if (!is_utf8_char(p))
1195         return FALSE;
1196     if (!PL_utf8_punct)
1197         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1198     return swash_fetch(PL_utf8_punct, p);
1199 }
1200
1201 bool
1202 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1203 {
1204     if (!is_utf8_char(p))
1205         return FALSE;
1206     if (!PL_utf8_xdigit)
1207         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1208     return swash_fetch(PL_utf8_xdigit, p);
1209 }
1210
1211 bool
1212 Perl_is_utf8_mark(pTHX_ U8 *p)
1213 {
1214     if (!is_utf8_char(p))
1215         return FALSE;
1216     if (!PL_utf8_mark)
1217         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1218     return swash_fetch(PL_utf8_mark, p);
1219 }
1220
1221 UV
1222 Perl_to_utf8_upper(pTHX_ U8 *p)
1223 {
1224     UV uv;
1225
1226     if (!PL_utf8_toupper)
1227         PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1228     uv = swash_fetch(PL_utf8_toupper, p);
1229     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1230 }
1231
1232 UV
1233 Perl_to_utf8_title(pTHX_ U8 *p)
1234 {
1235     UV uv;
1236
1237     if (!PL_utf8_totitle)
1238         PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1239     uv = swash_fetch(PL_utf8_totitle, p);
1240     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1241 }
1242
1243 UV
1244 Perl_to_utf8_lower(pTHX_ U8 *p)
1245 {
1246     UV uv;
1247
1248     if (!PL_utf8_tolower)
1249         PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1250     uv = swash_fetch(PL_utf8_tolower, p);
1251     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1252 }
1253
1254 /* a "swash" is a swatch hash */
1255
1256 SV*
1257 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1258 {
1259     SV* retval;
1260     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1261     dSP;
1262
1263     if (!gv_stashpv(pkg, 0)) {  /* demand load utf8 */
1264         ENTER;
1265         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1266         LEAVE;
1267     }
1268     SPAGAIN;
1269     PUSHSTACKi(PERLSI_MAGIC);
1270     PUSHMARK(SP);
1271     EXTEND(SP,5);
1272     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1273     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1274     PUSHs(listsv);
1275     PUSHs(sv_2mortal(newSViv(minbits)));
1276     PUSHs(sv_2mortal(newSViv(none)));
1277     PUTBACK;
1278     ENTER;
1279     SAVEI32(PL_hints);
1280     PL_hints = 0;
1281     save_re_context();
1282     if (PL_curcop == &PL_compiling)
1283         /* XXX ought to be handled by lex_start */
1284         sv_setpv(tokenbufsv, PL_tokenbuf);
1285     if (call_method("SWASHNEW", G_SCALAR))
1286         retval = newSVsv(*PL_stack_sp--);
1287     else
1288         retval = &PL_sv_undef;
1289     LEAVE;
1290     POPSTACK;
1291     if (PL_curcop == &PL_compiling) {
1292         STRLEN len;
1293         char* pv = SvPV(tokenbufsv, len);
1294
1295         Copy(pv, PL_tokenbuf, len+1, char);
1296         PL_curcop->op_private = PL_hints;
1297     }
1298     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1299         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1300     return retval;
1301 }
1302
1303 UV
1304 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1305 {
1306     HV* hv = (HV*)SvRV(sv);
1307     U32 klen = UTF8SKIP(ptr) - 1;
1308     U32 off = ptr[klen] & 127;  /* NB: 64 bit always 0 when len > 1 */
1309     STRLEN slen;
1310     STRLEN needents = (klen ? 64 : 128);
1311     U8 *tmps;
1312     U32 bit;
1313     SV *retval;
1314
1315     /*
1316      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1317      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1318      * it's nothing to sniff at.)  Pity we usually come through at least
1319      * two function calls to get here...
1320      *
1321      * NB: this code assumes that swatches are never modified, once generated!
1322      */
1323
1324     if (hv == PL_last_swash_hv &&
1325         klen == PL_last_swash_klen &&
1326         (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
1327     {
1328         tmps = PL_last_swash_tmps;
1329         slen = PL_last_swash_slen;
1330     }
1331     else {
1332         /* Try our second-level swatch cache, kept in a hash. */
1333         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1334
1335         /* If not cached, generate it via utf8::SWASHGET */
1336         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1337             dSP;
1338             ENTER;
1339             SAVETMPS;
1340             save_re_context();
1341             PUSHSTACKi(PERLSI_MAGIC);
1342             PUSHMARK(SP);
1343             EXTEND(SP,3);
1344             PUSHs((SV*)sv);
1345             /* We call utf8_to_uni as we want and index into Unicode tables,
1346                not a native character number.
1347              */
1348             PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
1349             PUSHs(sv_2mortal(newSViv(needents)));
1350             PUTBACK;
1351             if (call_method("SWASHGET", G_SCALAR))
1352                 retval = newSVsv(*PL_stack_sp--);
1353             else
1354                 retval = &PL_sv_undef;
1355             POPSTACK;
1356             FREETMPS;
1357             LEAVE;
1358             if (PL_curcop == &PL_compiling)
1359                 PL_curcop->op_private = PL_hints;
1360
1361             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1362
1363             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
1364                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1365         }
1366
1367         PL_last_swash_hv = hv;
1368         PL_last_swash_klen = klen;
1369         PL_last_swash_tmps = tmps;
1370         PL_last_swash_slen = slen;
1371         if (klen)
1372             Copy(ptr, PL_last_swash_key, klen, U8);
1373     }
1374
1375     switch ((int)((slen << 3) / needents)) {
1376     case 1:
1377         bit = 1 << (off & 7);
1378         off >>= 3;
1379         return (tmps[off] & bit) != 0;
1380     case 8:
1381         return tmps[off];
1382     case 16:
1383         off <<= 1;
1384         return (tmps[off] << 8) + tmps[off + 1] ;
1385     case 32:
1386         off <<= 2;
1387         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1388     }
1389     Perl_croak(aTHX_ "panic: swash_fetch");
1390     return 0;
1391 }