utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (uv < 0x80) {
  50         *d++ = uv;
  51         return d;
  52     }
  53     if (uv < 0x800) {
  54         *d++ = (( uv >>  6)         | 0xc0);
  55         *d++ = (( uv        & 0x3f) | 0x80);
  56         return d;
  57     }
  58     if (uv < 0x10000) {
  59         *d++ = (( uv >> 12)         | 0xe0);
  60         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  61         *d++ = (( uv        & 0x3f) | 0x80);
  62         return d;
  63     }
  64     if (uv < 0x200000) {
  65         *d++ = (( uv >> 18)         | 0xf0);
  66         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  67         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  68         *d++ = (( uv        & 0x3f) | 0x80);
  69         return d;
  70     }
  71     if (uv < 0x4000000) {
  72         *d++ = (( uv >> 24)         | 0xf8);
  73         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  74         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  75         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  76         *d++ = (( uv        & 0x3f) | 0x80);
  77         return d;
  78     }
  79     if (uv < 0x80000000) {
  80         *d++ = (( uv >> 30)         | 0xfc);
  81         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  82         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  83         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  84         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  85         *d++ = (( uv        & 0x3f) | 0x80);
  86         return d;
  87     }
  88 #ifdef HAS_QUAD
  89     if (uv < UTF8_QUAD_MAX)
  90 #endif
  91     {
  92         *d++ =                        0xfe;     /* Can't match U+FEFF! */
  93         *d++ = (((uv >> 30) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  95         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  96         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  97         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  98         *d++ = (( uv        & 0x3f) | 0x80);
  99         return d;
 100     }
 101 #ifdef HAS_QUAD
 102     {
 103         *d++ =                        0xff;     /* Can't match U+FFFE! */
 104         *d++ =                        0x80;     /* 6 Reserved bits */
 105         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 106         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 107         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 108         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 109         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 110         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 111         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 112         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 113         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 114         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 115         *d++ = (( uv        & 0x3f) | 0x80);
 116         return d;
 117     }
 118 #endif
 119 }
 120
 121 /*
 122 =for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
 123
 124 Adds the UTF8 representation of the Native codepoint C<uv> to the end
 125 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
 126 bytes available. The return value is the pointer to the byte after the
 127 end of the new character. In other words,
 128
 129     d = uvchr_to_utf8(d, uv);
 130
 131 is the recommended wide native character-aware way of saying
 132
 133     *(d++) = uv;
 134
 135 =cut
 136 */
 137
 138 U8 *
 139 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 140 {
 141     if (uv < 0x100)
 142         uv = NATIVE_TO_ASCII(uv);
 143     return Perl_uvuni_to_utf8(aTHX_ d, uv);
 144 }
 145
 146
 147 /*
 148 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 149
 150 Tests if some arbitrary number of bytes begins in a valid UTF-8
 151 character.  Note that an ASCII character is a valid UTF-8 character.
 152 The actual number of bytes in the UTF-8 character will be returned if
 153 it is valid, otherwise 0.
 154
 155 =cut */
 156 STRLEN
 157 Perl_is_utf8_char(pTHX_ U8 *s)
 158 {
 159     U8 u = *s;
 160     STRLEN slen, len;
 161     UV uv, ouv;
 162
 163     if (UTF8_IS_ASCII(u))
 164         return 1;
 165
 166     if (!UTF8_IS_START(u))
 167         return 0;
 168
 169     len = UTF8SKIP(s);
 170
 171     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 172         return 0;
 173
 174     slen = len - 1;
 175     s++;
 176     uv = u;
 177     ouv = uv;
 178     while (slen--) {
 179         if (!UTF8_IS_CONTINUATION(*s))
 180             return 0;
 181         uv = UTF8_ACCUMULATE(uv, *s);
 182         if (uv < ouv)
 183             return 0;
 184         ouv = uv;
 185         s++;
 186     }
 187
 188     if (UNISKIP(uv) < len)
 189         return 0;
 190
 191     return len;
 192 }
 193
 194 /*
 195 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 196
 197 Returns true if first C<len> bytes of the given string form a valid UTF8
 198 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 199 'a string that contains UTF8' because a valid ASCII string is a valid
 200 UTF8 string.
 201
 202 =cut
 203 */
 204
 205 bool
 206 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 207 {
 208     U8* x = s;
 209     U8* send;
 210     STRLEN c;
 211
 212     if (!len)
 213         len = strlen((char *)s);
 214     send = s + len;
 215
 216     while (x < send) {
 217         c = is_utf8_char(x);
 218         if (!c)
 219             return FALSE;
 220         x += c;
 221     }
 222     if (x != send)
 223         return FALSE;
 224
 225     return TRUE;
 226 }
 227
 228 /*
 229 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 230
 231 Bottom level UTF-8 decode routine.
 232 Returns the unicode code point value of the first character in the string C<s>
 233 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 234 C<retlen> will be set to the length, in bytes, of that character.
 235
 236 If C<s> does not point to a well-formed UTF8 character, the behaviour
 237 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 238 it is assumed that the caller will raise a warning, and this function
 239 will silently just set C<retlen> to C<-1> and return zero.  If the
 240 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 241 malformations will be given, C<retlen> will be set to the expected
 242 length of the UTF-8 character in bytes, and zero will be returned.
 243
 244 The C<flags> can also contain various flags to allow deviations from
 245 the strict UTF-8 encoding (see F<utf8.h>).
 246
 247 Most code should use utf8_to_uvchr() rather than call this directly.
 248
 249 =cut */
 250
 251 UV
 252 Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 253 {
 254     UV uv = *s, ouv;
 255     STRLEN len = 1;
 256 #ifdef EBCDIC
 257     bool dowarn = 0;
 258 #else
 259     bool dowarn = ckWARN_d(WARN_UTF8);
 260 #endif
 261     STRLEN expectlen = 0;
 262     U32 warning = 0;
 263
 264 /* This list is a superset of the UTF8_ALLOW_XXX. */
 265
 266 #define UTF8_WARN_EMPTY                          1
 267 #define UTF8_WARN_CONTINUATION                   2
 268 #define UTF8_WARN_NON_CONTINUATION               3
 269 #define UTF8_WARN_FE_FF                          4
 270 #define UTF8_WARN_SHORT                          5
 271 #define UTF8_WARN_OVERFLOW                       6
 272 #define UTF8_WARN_SURROGATE                      7
 273 #define UTF8_WARN_BOM                            8
 274 #define UTF8_WARN_LONG                           9
 275 #define UTF8_WARN_FFFF                          10
 276
 277     if (curlen == 0 &&
 278         !(flags & UTF8_ALLOW_EMPTY)) {
 279         warning = UTF8_WARN_EMPTY;
 280         goto malformed;
 281     }
 282
 283     if (UTF8_IS_ASCII(uv)) {
 284         if (retlen)
 285             *retlen = 1;
 286         return (UV) (*s);
 287     }
 288
 289     if (UTF8_IS_CONTINUATION(uv) &&
 290         !(flags & UTF8_ALLOW_CONTINUATION)) {
 291         warning = UTF8_WARN_CONTINUATION;
 292         goto malformed;
 293     }
 294
 295     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 296         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 297         warning = UTF8_WARN_NON_CONTINUATION;
 298         goto malformed;
 299     }
 300
 301     if ((uv == 0xfe || uv == 0xff) &&
 302         !(flags & UTF8_ALLOW_FE_FF)) {
 303         warning = UTF8_WARN_FE_FF;
 304         goto malformed;
 305     }
 306
 307     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 308     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 309     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 310     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 311     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 312     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 313     else                        { len = 13; uv = 0; } /* whoa! */
 314
 315     if (retlen)
 316         *retlen = len;
 317
 318     expectlen = len;
 319
 320     if ((curlen < expectlen) &&
 321         !(flags & UTF8_ALLOW_SHORT)) {
 322         warning = UTF8_WARN_SHORT;
 323         goto malformed;
 324     }
 325
 326     len--;
 327     s++;
 328     ouv = uv;
 329
 330     while (len--) {
 331         if (!UTF8_IS_CONTINUATION(*s) &&
 332             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 333             s--;
 334             warning = UTF8_WARN_NON_CONTINUATION;
 335             goto malformed;
 336         }
 337         else
 338             uv = UTF8_ACCUMULATE(uv, *s);
 339         if (!(uv > ouv)) {
 340             /* These cannot be allowed. */
 341             if (uv == ouv) {
 342                 if (!(flags & UTF8_ALLOW_LONG)) {
 343                     warning = UTF8_WARN_LONG;
 344                     goto malformed;
 345                 }
 346             }
 347             else { /* uv < ouv */
 348                 /* This cannot be allowed. */
 349                 warning = UTF8_WARN_OVERFLOW;
 350                 goto malformed;
 351             }
 352         }
 353         s++;
 354         ouv = uv;
 355     }
 356
 357     if (UNICODE_IS_SURROGATE(uv) &&
 358         !(flags & UTF8_ALLOW_SURROGATE)) {
 359         warning = UTF8_WARN_SURROGATE;
 360         goto malformed;
 361     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 362                !(flags & UTF8_ALLOW_BOM)) {
 363         warning = UTF8_WARN_BOM;
 364         goto malformed;
 365     } else if ((expectlen > UNISKIP(uv)) &&
 366                !(flags & UTF8_ALLOW_LONG)) {
 367         warning = UTF8_WARN_LONG;
 368         goto malformed;
 369     } else if (UNICODE_IS_ILLEGAL(uv) &&
 370                !(flags & UTF8_ALLOW_FFFF)) {
 371         warning = UTF8_WARN_FFFF;
 372         goto malformed;
 373     }
 374
 375     return uv;
 376
 377 malformed:
 378
 379     if (flags & UTF8_CHECK_ONLY) {
 380         if (retlen)
 381             *retlen = -1;
 382         return 0;
 383     }
 384
 385     if (dowarn) {
 386         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 387
 388         switch (warning) {
 389         case 0: /* Intentionally empty. */ break;
 390         case UTF8_WARN_EMPTY:
 391             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 392             break;
 393         case UTF8_WARN_CONTINUATION:
 394             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 395             break;
 396         case UTF8_WARN_NON_CONTINUATION:
 397             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 398                            (UV)s[1], uv);
 399             break;
 400         case UTF8_WARN_FE_FF:
 401             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 402             break;
 403         case UTF8_WARN_SHORT:
 404             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 405                            curlen, curlen == 1 ? "" : "s", expectlen);
 406             break;
 407         case UTF8_WARN_OVERFLOW:
 408             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 409                            ouv, *s);
 410             break;
 411         case UTF8_WARN_SURROGATE:
 412             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 413             break;
 414         case UTF8_WARN_BOM:
 415             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 416             break;
 417         case UTF8_WARN_LONG:
 418             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 419                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 420             break;
 421         case UTF8_WARN_FFFF:
 422             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 423             break;
 424         default:
 425             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 426             break;
 427         }
 428
 429         if (warning) {
 430             char *s = SvPVX(sv);
 431
 432             if (PL_op)
 433                 Perl_warner(aTHX_ WARN_UTF8,
 434                             "%s in %s", s,  PL_op_desc[PL_op->op_type]);
 435             else
 436                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 437         }
 438     }
 439
 440     if (retlen)
 441         *retlen = expectlen ? expectlen : len;
 442
 443     return 0;
 444 }
 445
 446 /*
 447 =for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
 448
 449 Returns the native character value of the first character in the string C<s>
 450 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 451 length, in bytes, of that character.
 452
 453 Allows length and flags to be passed to low level routine.
 454
 455 =cut
 456 */
 457
 458 UV
 459 Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 460 {
 461     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
 462     if (uv < 0x100)
 463         return (UV) ASCII_TO_NATIVE(uv);
 464     return uv;
 465 }
 466
 467 /*
 468 =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
 469
 470 Returns the native character value of the first character in the string C<s>
 471 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 472 length, in bytes, of that character.
 473
 474 If C<s> does not point to a well-formed UTF8 character, zero is
 475 returned and retlen is set, if possible, to -1.
 476
 477 =cut
 478 */
 479
 480 UV
 481 Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
 482 {
 483     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 484 }
 485
 486 /*
 487 =for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
 488
 489 Returns the Unicode code point of the first character in the string C<s>
 490 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 491 length, in bytes, of that character.
 492
 493 This function should only be used when returned UV is considered
 494 an index into the Unicode semantic tables (e.g. swashes).
 495
 496 If C<s> does not point to a well-formed UTF8 character, zero is
 497 returned and retlen is set, if possible, to -1.
 498
 499 =cut
 500 */
 501
 502 UV
 503 Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
 504 {
 505     /* Call the low level routine asking for checks */
 506     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 507 }
 508
 509 /*
 510 =for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
 511
 512 Return the length of the UTF-8 char encoded string C<s> in characters.
 513 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 514 up past C<e>, croaks.
 515
 516 =cut
 517 */
 518
 519 STRLEN
 520 Perl_utf8_length(pTHX_ U8* s, U8* e)
 521 {
 522     STRLEN len = 0;
 523
 524     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 525      * the bitops (especially ~) can create illegal UTF-8.
 526      * In other words: in Perl UTF-8 is not just for Unicode. */
 527
 528     if (e < s)
 529         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 530     while (s < e) {
 531         U8 t = UTF8SKIP(s);
 532
 533         if (e - s < t)
 534             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 535         s += t;
 536         len++;
 537     }
 538
 539     return len;
 540 }
 541
 542 /*
 543 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 544
 545 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 546 and C<b>.
 547
 548 WARNING: use only if you *know* that the pointers point inside the
 549 same UTF-8 buffer.
 550
 551 =cut */
 552
 553 IV
 554 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 555 {
 556     IV off = 0;
 557
 558     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 559      * the bitops (especially ~) can create illegal UTF-8.
 560      * In other words: in Perl UTF-8 is not just for Unicode. */
 561
 562     if (a < b) {
 563         while (a < b) {
 564             U8 c = UTF8SKIP(a);
 565
 566             if (b - a < c)
 567                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 568             a += c;
 569             off--;
 570         }
 571     }
 572     else {
 573         while (b < a) {
 574             U8 c = UTF8SKIP(b);
 575
 576             if (a - b < c)
 577                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 578             b += c;
 579             off++;
 580         }
 581     }
 582
 583     return off;
 584 }
 585
 586 /*
 587 =for apidoc A|U8*|utf8_hop|U8 *s|I32 off
 588
 589 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 590 forward or backward.
 591
 592 WARNING: do not use the following unless you *know* C<off> is within
 593 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 594 on the first byte of character or just after the last byte of a character.
 595
 596 =cut */
 597
 598 U8 *
 599 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 600 {
 601     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 602      * the bitops (especially ~) can create illegal UTF-8.
 603      * In other words: in Perl UTF-8 is not just for Unicode. */
 604
 605     if (off >= 0) {
 606         while (off--)
 607             s += UTF8SKIP(s);
 608     }
 609     else {
 610         while (off++) {
 611             s--;
 612             while (UTF8_IS_CONTINUATION(*s))
 613                 s--;
 614         }
 615     }
 616     return s;
 617 }
 618
 619 /*
 620 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 621
 622 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 623 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 624 updates len to contain the new length.
 625 Returns zero on failure, setting C<len> to -1.
 626
 627 =cut
 628 */
 629
 630 U8 *
 631 Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 632 {
 633     U8 *send;
 634     U8 *d;
 635     U8 *save = s;
 636
 637     /* ensure valid UTF8 and chars < 256 before updating string */
 638     for (send = s + *len; s < send; ) {
 639         U8 c = *s++;
 640
 641         if (c >= 0x80 &&
 642             ((s >= send) ||
 643              ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
 644             *len = -1;
 645             return 0;
 646         }
 647     }
 648
 649     d = s = save;
 650     while (s < send) {
 651         STRLEN ulen;
 652         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 653         s += ulen;
 654     }
 655     *d = '\0';
 656     *len = d - save;
 657     return save;
 658 }
 659
 660 /*
 661 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 662
 663 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 664 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 665 the newly-created string, and updates C<len> to contain the new
 666 length.  Returns the original string if no conversion occurs, C<len>
 667 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 668 0 if C<s> is converted or contains all 7bit characters.
 669
 670 =cut */
 671
 672 U8 *
 673 Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
 674 {
 675     U8 *send;
 676     U8 *d;
 677     U8 *start = s;
 678     I32 count = 0;
 679
 680     if (!*is_utf8)
 681         return start;
 682
 683     /* ensure valid UTF8 and chars < 256 before converting string */
 684     for (send = s + *len; s < send;) {
 685         U8 c = *s++;
 686         if (!UTF8_IS_ASCII(c)) {
 687             if (UTF8_IS_CONTINUATION(c) || s >= send ||
 688                 !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c))
 689                 return start;
 690             s++, count++;
 691         }
 692     }
 693
 694     *is_utf8 = 0;
 695
 696     if (!count)
 697         return start;
 698
 699     Newz(801, d, (*len) - count + 1, U8);
 700     s = start; start = d;
 701     while (s < send) {
 702         U8 c = *s++;
 703
 704         if (UTF8_IS_ASCII(c))
 705             *d++ = c;
 706         else
 707             *d++ = UTF8_ACCUMULATE(c, *s++);
 708     }
 709     *d = '\0';
 710     *len = d - start;
 711     return start;
 712 }
 713
 714 /*
 715 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 716
 717 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 718 Returns a pointer to the newly-created string, and sets C<len> to
 719 reflect the new length.
 720
 721 =cut
 722 */
 723
 724 U8*
 725 Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
 726 {
 727     U8 *send;
 728     U8 *d;
 729     U8 *dst;
 730     send = s + (*len);
 731
 732     Newz(801, d, (*len) * 2 + 1, U8);
 733     dst = d;
 734
 735     while (s < send) {
 736         if (UTF8_IS_ASCII(*s))
 737             *d++ = *s++;
 738         else {
 739             UV uv = *s++;
 740
 741             *d++ = UTF8_EIGHT_BIT_HI(uv);
 742             *d++ = UTF8_EIGHT_BIT_LO(uv);
 743         }
 744     }
 745     *d = '\0';
 746     *len = d-dst;
 747     return dst;
 748 }
 749
 750 /*
 751  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 752  *
 753  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 754  * We optimize for native, for obvious reasons. */
 755
 756 U8*
 757 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 758 {
 759     U8* pend;
 760     U8* dstart = d;
 761
 762     if (bytelen & 1)
 763         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 764
 765     pend = p + bytelen;
 766
 767     while (p < pend) {
 768         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 769         p += 2;
 770         if (uv < 0x80) {
 771             *d++ = uv;
 772             continue;
 773         }
 774         if (uv < 0x800) {
 775             *d++ = (( uv >>  6)         | 0xc0);
 776             *d++ = (( uv        & 0x3f) | 0x80);
 777             continue;
 778         }
 779         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 780             UV low = *p++;
 781             if (low < 0xdc00 || low >= 0xdfff)
 782                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 783             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 784         }
 785         if (uv < 0x10000) {
 786             *d++ = (( uv >> 12)         | 0xe0);
 787             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 788             *d++ = (( uv        & 0x3f) | 0x80);
 789             continue;
 790         }
 791         else {
 792             *d++ = (( uv >> 18)         | 0xf0);
 793             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 794             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 795             *d++ = (( uv        & 0x3f) | 0x80);
 796             continue;
 797         }
 798     }
 799     *newlen = d - dstart;
 800     return d;
 801 }
 802
 803 /* Note: this one is slightly destructive of the source. */
 804
 805 U8*
 806 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 807 {
 808     U8* s = (U8*)p;
 809     U8* send = s + bytelen;
 810     while (s < send) {
 811         U8 tmp = s[0];
 812         s[0] = s[1];
 813         s[1] = tmp;
 814         s += 2;
 815     }
 816     return utf16_to_utf8(p, d, bytelen, newlen);
 817 }
 818
 819 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 820
 821 bool
 822 Perl_is_uni_alnum(pTHX_ U32 c)
 823 {
 824     U8 tmpbuf[UTF8_MAXLEN+1];
 825     uvuni_to_utf8(tmpbuf, (UV)c);
 826     return is_utf8_alnum(tmpbuf);
 827 }
 828
 829 bool
 830 Perl_is_uni_alnumc(pTHX_ U32 c)
 831 {
 832     U8 tmpbuf[UTF8_MAXLEN+1];
 833     uvuni_to_utf8(tmpbuf, (UV)c);
 834     return is_utf8_alnumc(tmpbuf);
 835 }
 836
 837 bool
 838 Perl_is_uni_idfirst(pTHX_ U32 c)
 839 {
 840     U8 tmpbuf[UTF8_MAXLEN+1];
 841     uvuni_to_utf8(tmpbuf, (UV)c);
 842     return is_utf8_idfirst(tmpbuf);
 843 }
 844
 845 bool
 846 Perl_is_uni_alpha(pTHX_ U32 c)
 847 {
 848     U8 tmpbuf[UTF8_MAXLEN+1];
 849     uvuni_to_utf8(tmpbuf, (UV)c);
 850     return is_utf8_alpha(tmpbuf);
 851 }
 852
 853 bool
 854 Perl_is_uni_ascii(pTHX_ U32 c)
 855 {
 856     U8 tmpbuf[UTF8_MAXLEN+1];
 857     uvuni_to_utf8(tmpbuf, (UV)c);
 858     return is_utf8_ascii(tmpbuf);
 859 }
 860
 861 bool
 862 Perl_is_uni_space(pTHX_ U32 c)
 863 {
 864     U8 tmpbuf[UTF8_MAXLEN+1];
 865     uvuni_to_utf8(tmpbuf, (UV)c);
 866     return is_utf8_space(tmpbuf);
 867 }
 868
 869 bool
 870 Perl_is_uni_digit(pTHX_ U32 c)
 871 {
 872     U8 tmpbuf[UTF8_MAXLEN+1];
 873     uvuni_to_utf8(tmpbuf, (UV)c);
 874     return is_utf8_digit(tmpbuf);
 875 }
 876
 877 bool
 878 Perl_is_uni_upper(pTHX_ U32 c)
 879 {
 880     U8 tmpbuf[UTF8_MAXLEN+1];
 881     uvuni_to_utf8(tmpbuf, (UV)c);
 882     return is_utf8_upper(tmpbuf);
 883 }
 884
 885 bool
 886 Perl_is_uni_lower(pTHX_ U32 c)
 887 {
 888     U8 tmpbuf[UTF8_MAXLEN+1];
 889     uvuni_to_utf8(tmpbuf, (UV)c);
 890     return is_utf8_lower(tmpbuf);
 891 }
 892
 893 bool
 894 Perl_is_uni_cntrl(pTHX_ U32 c)
 895 {
 896     U8 tmpbuf[UTF8_MAXLEN+1];
 897     uvuni_to_utf8(tmpbuf, (UV)c);
 898     return is_utf8_cntrl(tmpbuf);
 899 }
 900
 901 bool
 902 Perl_is_uni_graph(pTHX_ U32 c)
 903 {
 904     U8 tmpbuf[UTF8_MAXLEN+1];
 905     uvuni_to_utf8(tmpbuf, (UV)c);
 906     return is_utf8_graph(tmpbuf);
 907 }
 908
 909 bool
 910 Perl_is_uni_print(pTHX_ U32 c)
 911 {
 912     U8 tmpbuf[UTF8_MAXLEN+1];
 913     uvuni_to_utf8(tmpbuf, (UV)c);
 914     return is_utf8_print(tmpbuf);
 915 }
 916
 917 bool
 918 Perl_is_uni_punct(pTHX_ U32 c)
 919 {
 920     U8 tmpbuf[UTF8_MAXLEN+1];
 921     uvuni_to_utf8(tmpbuf, (UV)c);
 922     return is_utf8_punct(tmpbuf);
 923 }
 924
 925 bool
 926 Perl_is_uni_xdigit(pTHX_ U32 c)
 927 {
 928     U8 tmpbuf[UTF8_MAXLEN+1];
 929     uvuni_to_utf8(tmpbuf, (UV)c);
 930     return is_utf8_xdigit(tmpbuf);
 931 }
 932
 933 U32
 934 Perl_to_uni_upper(pTHX_ U32 c)
 935 {
 936     U8 tmpbuf[UTF8_MAXLEN+1];
 937     uvuni_to_utf8(tmpbuf, (UV)c);
 938     return to_utf8_upper(tmpbuf);
 939 }
 940
 941 U32
 942 Perl_to_uni_title(pTHX_ U32 c)
 943 {
 944     U8 tmpbuf[UTF8_MAXLEN+1];
 945     uvuni_to_utf8(tmpbuf, (UV)c);
 946     return to_utf8_title(tmpbuf);
 947 }
 948
 949 U32
 950 Perl_to_uni_lower(pTHX_ U32 c)
 951 {
 952     U8 tmpbuf[UTF8_MAXLEN+1];
 953     uvuni_to_utf8(tmpbuf, (UV)c);
 954     return to_utf8_lower(tmpbuf);
 955 }
 956
 957 /* for now these all assume no locale info available for Unicode > 255 */
 958
 959 bool
 960 Perl_is_uni_alnum_lc(pTHX_ U32 c)
 961 {
 962     return is_uni_alnum(c);     /* XXX no locale support yet */
 963 }
 964
 965 bool
 966 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
 967 {
 968     return is_uni_alnumc(c);    /* XXX no locale support yet */
 969 }
 970
 971 bool
 972 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
 973 {
 974     return is_uni_idfirst(c);   /* XXX no locale support yet */
 975 }
 976
 977 bool
 978 Perl_is_uni_alpha_lc(pTHX_ U32 c)
 979 {
 980     return is_uni_alpha(c);     /* XXX no locale support yet */
 981 }
 982
 983 bool
 984 Perl_is_uni_ascii_lc(pTHX_ U32 c)
 985 {
 986     return is_uni_ascii(c);     /* XXX no locale support yet */
 987 }
 988
 989 bool
 990 Perl_is_uni_space_lc(pTHX_ U32 c)
 991 {
 992     return is_uni_space(c);     /* XXX no locale support yet */
 993 }
 994
 995 bool
 996 Perl_is_uni_digit_lc(pTHX_ U32 c)
 997 {
 998     return is_uni_digit(c);     /* XXX no locale support yet */
 999 }
1000
1001 bool
1002 Perl_is_uni_upper_lc(pTHX_ U32 c)
1003 {
1004     return is_uni_upper(c);     /* XXX no locale support yet */
1005 }
1006
1007 bool
1008 Perl_is_uni_lower_lc(pTHX_ U32 c)
1009 {
1010     return is_uni_lower(c);     /* XXX no locale support yet */
1011 }
1012
1013 bool
1014 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1015 {
1016     return is_uni_cntrl(c);     /* XXX no locale support yet */
1017 }
1018
1019 bool
1020 Perl_is_uni_graph_lc(pTHX_ U32 c)
1021 {
1022     return is_uni_graph(c);     /* XXX no locale support yet */
1023 }
1024
1025 bool
1026 Perl_is_uni_print_lc(pTHX_ U32 c)
1027 {
1028     return is_uni_print(c);     /* XXX no locale support yet */
1029 }
1030
1031 bool
1032 Perl_is_uni_punct_lc(pTHX_ U32 c)
1033 {
1034     return is_uni_punct(c);     /* XXX no locale support yet */
1035 }
1036
1037 bool
1038 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1039 {
1040     return is_uni_xdigit(c);    /* XXX no locale support yet */
1041 }
1042
1043 U32
1044 Perl_to_uni_upper_lc(pTHX_ U32 c)
1045 {
1046     return to_uni_upper(c);     /* XXX no locale support yet */
1047 }
1048
1049 U32
1050 Perl_to_uni_title_lc(pTHX_ U32 c)
1051 {
1052     return to_uni_title(c);     /* XXX no locale support yet */
1053 }
1054
1055 U32
1056 Perl_to_uni_lower_lc(pTHX_ U32 c)
1057 {
1058     return to_uni_lower(c);     /* XXX no locale support yet */
1059 }
1060
1061 bool
1062 Perl_is_utf8_alnum(pTHX_ U8 *p)
1063 {
1064     if (!is_utf8_char(p))
1065         return FALSE;
1066     if (!PL_utf8_alnum)
1067         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1068          * descendant of isalnum(3), in other words, it doesn't
1069          * contain the '_'. --jhi */
1070         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1071     return swash_fetch(PL_utf8_alnum, p);
1072 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1073 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1074     if (!PL_utf8_alnum)
1075         PL_utf8_alnum = swash_init("utf8", "",
1076             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1077     return swash_fetch(PL_utf8_alnum, p);
1078 #endif
1079 }
1080
1081 bool
1082 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1083 {
1084     if (!is_utf8_char(p))
1085         return FALSE;
1086     if (!PL_utf8_alnum)
1087         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1088     return swash_fetch(PL_utf8_alnum, p);
1089 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1090 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1091     if (!PL_utf8_alnum)
1092         PL_utf8_alnum = swash_init("utf8", "",
1093             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1094     return swash_fetch(PL_utf8_alnum, p);
1095 #endif
1096 }
1097
1098 bool
1099 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1100 {
1101     return *p == '_' || is_utf8_alpha(p);
1102 }
1103
1104 bool
1105 Perl_is_utf8_alpha(pTHX_ U8 *p)
1106 {
1107     if (!is_utf8_char(p))
1108         return FALSE;
1109     if (!PL_utf8_alpha)
1110         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1111     return swash_fetch(PL_utf8_alpha, p);
1112 }
1113
1114 bool
1115 Perl_is_utf8_ascii(pTHX_ U8 *p)
1116 {
1117     if (!is_utf8_char(p))
1118         return FALSE;
1119     if (!PL_utf8_ascii)
1120         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1121     return swash_fetch(PL_utf8_ascii, p);
1122 }
1123
1124 bool
1125 Perl_is_utf8_space(pTHX_ U8 *p)
1126 {
1127     if (!is_utf8_char(p))
1128         return FALSE;
1129     if (!PL_utf8_space)
1130         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1131     return swash_fetch(PL_utf8_space, p);
1132 }
1133
1134 bool
1135 Perl_is_utf8_digit(pTHX_ U8 *p)
1136 {
1137     if (!is_utf8_char(p))
1138         return FALSE;
1139     if (!PL_utf8_digit)
1140         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1141     return swash_fetch(PL_utf8_digit, p);
1142 }
1143
1144 bool
1145 Perl_is_utf8_upper(pTHX_ U8 *p)
1146 {
1147     if (!is_utf8_char(p))
1148         return FALSE;
1149     if (!PL_utf8_upper)
1150         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1151     return swash_fetch(PL_utf8_upper, p);
1152 }
1153
1154 bool
1155 Perl_is_utf8_lower(pTHX_ U8 *p)
1156 {
1157     if (!is_utf8_char(p))
1158         return FALSE;
1159     if (!PL_utf8_lower)
1160         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1161     return swash_fetch(PL_utf8_lower, p);
1162 }
1163
1164 bool
1165 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1166 {
1167     if (!is_utf8_char(p))
1168         return FALSE;
1169     if (!PL_utf8_cntrl)
1170         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1171     return swash_fetch(PL_utf8_cntrl, p);
1172 }
1173
1174 bool
1175 Perl_is_utf8_graph(pTHX_ U8 *p)
1176 {
1177     if (!is_utf8_char(p))
1178         return FALSE;
1179     if (!PL_utf8_graph)
1180         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1181     return swash_fetch(PL_utf8_graph, p);
1182 }
1183
1184 bool
1185 Perl_is_utf8_print(pTHX_ U8 *p)
1186 {
1187     if (!is_utf8_char(p))
1188         return FALSE;
1189     if (!PL_utf8_print)
1190         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1191     return swash_fetch(PL_utf8_print, p);
1192 }
1193
1194 bool
1195 Perl_is_utf8_punct(pTHX_ U8 *p)
1196 {
1197     if (!is_utf8_char(p))
1198         return FALSE;
1199     if (!PL_utf8_punct)
1200         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1201     return swash_fetch(PL_utf8_punct, p);
1202 }
1203
1204 bool
1205 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1206 {
1207     if (!is_utf8_char(p))
1208         return FALSE;
1209     if (!PL_utf8_xdigit)
1210         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1211     return swash_fetch(PL_utf8_xdigit, p);
1212 }
1213
1214 bool
1215 Perl_is_utf8_mark(pTHX_ U8 *p)
1216 {
1217     if (!is_utf8_char(p))
1218         return FALSE;
1219     if (!PL_utf8_mark)
1220         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1221     return swash_fetch(PL_utf8_mark, p);
1222 }
1223
1224 UV
1225 Perl_to_utf8_upper(pTHX_ U8 *p)
1226 {
1227     UV uv;
1228
1229     if (!PL_utf8_toupper)
1230         PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1231     uv = swash_fetch(PL_utf8_toupper, p);
1232     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1233 }
1234
1235 UV
1236 Perl_to_utf8_title(pTHX_ U8 *p)
1237 {
1238     UV uv;
1239
1240     if (!PL_utf8_totitle)
1241         PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1242     uv = swash_fetch(PL_utf8_totitle, p);
1243     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1244 }
1245
1246 UV
1247 Perl_to_utf8_lower(pTHX_ U8 *p)
1248 {
1249     UV uv;
1250
1251     if (!PL_utf8_tolower)
1252         PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1253     uv = swash_fetch(PL_utf8_tolower, p);
1254     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1255 }
1256
1257 /* a "swash" is a swatch hash */
1258
1259 SV*
1260 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1261 {
1262     SV* retval;
1263     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1264     dSP;
1265
1266     if (!gv_stashpv(pkg, 0)) {  /* demand load utf8 */
1267         ENTER;
1268         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1269         LEAVE;
1270     }
1271     SPAGAIN;
1272     PUSHSTACKi(PERLSI_MAGIC);
1273     PUSHMARK(SP);
1274     EXTEND(SP,5);
1275     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1276     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1277     PUSHs(listsv);
1278     PUSHs(sv_2mortal(newSViv(minbits)));
1279     PUSHs(sv_2mortal(newSViv(none)));
1280     PUTBACK;
1281     ENTER;
1282     SAVEI32(PL_hints);
1283     PL_hints = 0;
1284     save_re_context();
1285     if (PL_curcop == &PL_compiling)
1286         /* XXX ought to be handled by lex_start */
1287         sv_setpv(tokenbufsv, PL_tokenbuf);
1288     if (call_method("SWASHNEW", G_SCALAR))
1289         retval = newSVsv(*PL_stack_sp--);
1290     else
1291         retval = &PL_sv_undef;
1292     LEAVE;
1293     POPSTACK;
1294     if (PL_curcop == &PL_compiling) {
1295         STRLEN len;
1296         char* pv = SvPV(tokenbufsv, len);
1297
1298         Copy(pv, PL_tokenbuf, len+1, char);
1299         PL_curcop->op_private = PL_hints;
1300     }
1301     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1302         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1303     return retval;
1304 }
1305
1306 UV
1307 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1308 {
1309     HV* hv = (HV*)SvRV(sv);
1310     U32 klen = UTF8SKIP(ptr) - 1;
1311     U32 off = ptr[klen] & 127;  /* NB: 64 bit always 0 when len > 1 */
1312     STRLEN slen;
1313     STRLEN needents = (klen ? 64 : 128);
1314     U8 *tmps;
1315     U32 bit;
1316     SV *retval;
1317
1318     /*
1319      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1320      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1321      * it's nothing to sniff at.)  Pity we usually come through at least
1322      * two function calls to get here...
1323      *
1324      * NB: this code assumes that swatches are never modified, once generated!
1325      */
1326
1327     if (hv == PL_last_swash_hv &&
1328         klen == PL_last_swash_klen &&
1329         (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
1330     {
1331         tmps = PL_last_swash_tmps;
1332         slen = PL_last_swash_slen;
1333     }
1334     else {
1335         /* Try our second-level swatch cache, kept in a hash. */
1336         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1337
1338         /* If not cached, generate it via utf8::SWASHGET */
1339         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1340             dSP;
1341             ENTER;
1342             SAVETMPS;
1343             save_re_context();
1344             PUSHSTACKi(PERLSI_MAGIC);
1345             PUSHMARK(SP);
1346             EXTEND(SP,3);
1347             PUSHs((SV*)sv);
1348             /* We call utf8_to_uni as we want and index into Unicode tables,
1349                not a native character number.
1350              */
1351             PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
1352             PUSHs(sv_2mortal(newSViv(needents)));
1353             PUTBACK;
1354             if (call_method("SWASHGET", G_SCALAR))
1355                 retval = newSVsv(*PL_stack_sp--);
1356             else
1357                 retval = &PL_sv_undef;
1358             POPSTACK;
1359             FREETMPS;
1360             LEAVE;
1361             if (PL_curcop == &PL_compiling)
1362                 PL_curcop->op_private = PL_hints;
1363
1364             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1365
1366             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
1367                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1368         }
1369
1370         PL_last_swash_hv = hv;
1371         PL_last_swash_klen = klen;
1372         PL_last_swash_tmps = tmps;
1373         PL_last_swash_slen = slen;
1374         if (klen)
1375             Copy(ptr, PL_last_swash_key, klen, U8);
1376     }
1377
1378     switch ((int)((slen << 3) / needents)) {
1379     case 1:
1380         bit = 1 << (off & 7);
1381         off >>= 3;
1382         return (tmps[off] & bit) != 0;
1383     case 8:
1384         return tmps[off];
1385     case 16:
1386         off <<= 1;
1387         return (tmps[off] << 8) + tmps[off + 1] ;
1388     case 32:
1389         off <<= 2;
1390         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1391     }
1392     Perl_croak(aTHX_ "panic: swash_fetch");
1393     return 0;
1394 }