utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (uv < 0x80) {
  50         *d++ = uv;
  51         return d;
  52     }
  53     if (uv < 0x800) {
  54         *d++ = (( uv >>  6)         | 0xc0);
  55         *d++ = (( uv        & 0x3f) | 0x80);
  56         return d;
  57     }
  58     if (uv < 0x10000) {
  59         *d++ = (( uv >> 12)         | 0xe0);
  60         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  61         *d++ = (( uv        & 0x3f) | 0x80);
  62         return d;
  63     }
  64     if (uv < 0x200000) {
  65         *d++ = (( uv >> 18)         | 0xf0);
  66         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  67         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  68         *d++ = (( uv        & 0x3f) | 0x80);
  69         return d;
  70     }
  71     if (uv < 0x4000000) {
  72         *d++ = (( uv >> 24)         | 0xf8);
  73         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  74         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  75         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  76         *d++ = (( uv        & 0x3f) | 0x80);
  77         return d;
  78     }
  79     if (uv < 0x80000000) {
  80         *d++ = (( uv >> 30)         | 0xfc);
  81         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  82         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  83         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  84         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  85         *d++ = (( uv        & 0x3f) | 0x80);
  86         return d;
  87     }
  88 #ifdef HAS_QUAD
  89     if (uv < UTF8_QUAD_MAX)
  90 #endif
  91     {
  92         *d++ =                        0xfe;     /* Can't match U+FEFF! */
  93         *d++ = (((uv >> 30) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  95         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  96         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  97         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  98         *d++ = (( uv        & 0x3f) | 0x80);
  99         return d;
 100     }
 101 #ifdef HAS_QUAD
 102     {
 103         *d++ =                        0xff;     /* Can't match U+FFFE! */
 104         *d++ =                        0x80;     /* 6 Reserved bits */
 105         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 106         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 107         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 108         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 109         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 110         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 111         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 112         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 113         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 114         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 115         *d++ = (( uv        & 0x3f) | 0x80);
 116         return d;
 117     }
 118 #endif
 119 }
 120
 121 /*
 122 =for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
 123
 124 Adds the UTF8 representation of the Native codepoint C<uv> to the end
 125 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
 126 bytes available. The return value is the pointer to the byte after the
 127 end of the new character. In other words,
 128
 129     d = uvchr_to_utf8(d, uv);
 130
 131 is the recommended wide native character-aware way of saying
 132
 133     *(d++) = uv;
 134
 135 =cut
 136 */
 137
 138 U8 *
 139 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 140 {
 141     if (uv < 0x100)
 142         uv = NATIVE_TO_ASCII(uv);
 143     return Perl_uvuni_to_utf8(aTHX_ d, uv);
 144 }
 145
 146
 147 /*
 148 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 149
 150 Tests if some arbitrary number of bytes begins in a valid UTF-8
 151 character.  Note that an ASCII character is a valid UTF-8 character.
 152 The actual number of bytes in the UTF-8 character will be returned if
 153 it is valid, otherwise 0.
 154
 155 =cut */
 156 STRLEN
 157 Perl_is_utf8_char(pTHX_ U8 *s)
 158 {
 159     U8 u = *s;
 160     STRLEN slen, len;
 161     UV uv, ouv;
 162
 163     if (UTF8_IS_ASCII(u))
 164         return 1;
 165
 166     if (!UTF8_IS_START(u))
 167         return 0;
 168
 169     len = UTF8SKIP(s);
 170
 171     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 172         return 0;
 173
 174     slen = len - 1;
 175     s++;
 176     uv = u;
 177     ouv = uv;
 178     while (slen--) {
 179         if (!UTF8_IS_CONTINUATION(*s))
 180             return 0;
 181         uv = UTF8_ACCUMULATE(uv, *s);
 182         if (uv < ouv)
 183             return 0;
 184         ouv = uv;
 185         s++;
 186     }
 187
 188     if (UNISKIP(uv) < len)
 189         return 0;
 190
 191     return len;
 192 }
 193
 194 /*
 195 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 196
 197 Returns true if first C<len> bytes of the given string form a valid UTF8
 198 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 199 'a string that contains UTF8' because a valid ASCII string is a valid
 200 UTF8 string.
 201
 202 =cut
 203 */
 204
 205 bool
 206 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 207 {
 208     U8* x = s;
 209     U8* send;
 210     STRLEN c;
 211
 212     if (!len)
 213         len = strlen((char *)s);
 214     send = s + len;
 215
 216     while (x < send) {
 217         c = is_utf8_char(x);
 218         if (!c)
 219             return FALSE;
 220         x += c;
 221     }
 222     if (x != send)
 223         return FALSE;
 224
 225     return TRUE;
 226 }
 227
 228 /*
 229 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 230
 231 Bottom level UTF-8 decode routine.
 232 Returns the unicode code point value of the first character in the string C<s>
 233 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 234 C<retlen> will be set to the length, in bytes, of that character.
 235
 236 If C<s> does not point to a well-formed UTF8 character, the behaviour
 237 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 238 it is assumed that the caller will raise a warning, and this function
 239 will silently just set C<retlen> to C<-1> and return zero.  If the
 240 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 241 malformations will be given, C<retlen> will be set to the expected
 242 length of the UTF-8 character in bytes, and zero will be returned.
 243
 244 The C<flags> can also contain various flags to allow deviations from
 245 the strict UTF-8 encoding (see F<utf8.h>).
 246
 247 Most code should use utf8_to_uvchr() rather than call this directly.
 248
 249 =cut */
 250
 251 UV
 252 Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 253 {
 254     UV uv = *s, ouv;
 255     STRLEN len = 1;
 256     bool dowarn = ckWARN_d(WARN_UTF8);
 257     STRLEN expectlen = 0;
 258     U32 warning = 0;
 259
 260 /* This list is a superset of the UTF8_ALLOW_XXX. */
 261
 262 #define UTF8_WARN_EMPTY                          1
 263 #define UTF8_WARN_CONTINUATION                   2
 264 #define UTF8_WARN_NON_CONTINUATION               3
 265 #define UTF8_WARN_FE_FF                          4
 266 #define UTF8_WARN_SHORT                          5
 267 #define UTF8_WARN_OVERFLOW                       6
 268 #define UTF8_WARN_SURROGATE                      7
 269 #define UTF8_WARN_BOM                            8
 270 #define UTF8_WARN_LONG                           9
 271 #define UTF8_WARN_FFFF                          10
 272
 273     if (curlen == 0 &&
 274         !(flags & UTF8_ALLOW_EMPTY)) {
 275         warning = UTF8_WARN_EMPTY;
 276         goto malformed;
 277     }
 278
 279     if (UTF8_IS_ASCII(uv)) {
 280         if (retlen)
 281             *retlen = 1;
 282         return (UV) (*s);
 283     }
 284
 285     if (UTF8_IS_CONTINUATION(uv) &&
 286         !(flags & UTF8_ALLOW_CONTINUATION)) {
 287         warning = UTF8_WARN_CONTINUATION;
 288         goto malformed;
 289     }
 290
 291     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 292         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 293         warning = UTF8_WARN_NON_CONTINUATION;
 294         goto malformed;
 295     }
 296
 297     if ((uv == 0xfe || uv == 0xff) &&
 298         !(flags & UTF8_ALLOW_FE_FF)) {
 299         warning = UTF8_WARN_FE_FF;
 300         goto malformed;
 301     }
 302
 303     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 304     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 305     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 306     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 307     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 308     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 309     else                        { len = 13; uv = 0; } /* whoa! */
 310
 311     if (retlen)
 312         *retlen = len;
 313
 314     expectlen = len;
 315
 316     if ((curlen < expectlen) &&
 317         !(flags & UTF8_ALLOW_SHORT)) {
 318         warning = UTF8_WARN_SHORT;
 319         goto malformed;
 320     }
 321
 322     len--;
 323     s++;
 324     ouv = uv;
 325
 326     while (len--) {
 327         if (!UTF8_IS_CONTINUATION(*s) &&
 328             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 329             s--;
 330             warning = UTF8_WARN_NON_CONTINUATION;
 331             goto malformed;
 332         }
 333         else
 334             uv = UTF8_ACCUMULATE(uv, *s);
 335         if (!(uv > ouv)) {
 336             /* These cannot be allowed. */
 337             if (uv == ouv) {
 338                 if (!(flags & UTF8_ALLOW_LONG)) {
 339                     warning = UTF8_WARN_LONG;
 340                     goto malformed;
 341                 }
 342             }
 343             else { /* uv < ouv */
 344                 /* This cannot be allowed. */
 345                 warning = UTF8_WARN_OVERFLOW;
 346                 goto malformed;
 347             }
 348         }
 349         s++;
 350         ouv = uv;
 351     }
 352
 353     if (UNICODE_IS_SURROGATE(uv) &&
 354         !(flags & UTF8_ALLOW_SURROGATE)) {
 355         warning = UTF8_WARN_SURROGATE;
 356         goto malformed;
 357     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 358                !(flags & UTF8_ALLOW_BOM)) {
 359         warning = UTF8_WARN_BOM;
 360         goto malformed;
 361     } else if ((expectlen > UNISKIP(uv)) &&
 362                !(flags & UTF8_ALLOW_LONG)) {
 363         warning = UTF8_WARN_LONG;
 364         goto malformed;
 365     } else if (UNICODE_IS_ILLEGAL(uv) &&
 366                !(flags & UTF8_ALLOW_FFFF)) {
 367         warning = UTF8_WARN_FFFF;
 368         goto malformed;
 369     }
 370
 371     return uv;
 372
 373 malformed:
 374
 375     if (flags & UTF8_CHECK_ONLY) {
 376         if (retlen)
 377             *retlen = -1;
 378         return 0;
 379     }
 380
 381     if (dowarn) {
 382         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 383
 384         switch (warning) {
 385         case 0: /* Intentionally empty. */ break;
 386         case UTF8_WARN_EMPTY:
 387             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 388             break;
 389         case UTF8_WARN_CONTINUATION:
 390             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 391             break;
 392         case UTF8_WARN_NON_CONTINUATION:
 393             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 394                            (UV)s[1], uv);
 395             break;
 396         case UTF8_WARN_FE_FF:
 397             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 398             break;
 399         case UTF8_WARN_SHORT:
 400             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 401                            curlen, curlen == 1 ? "" : "s", expectlen);
 402             break;
 403         case UTF8_WARN_OVERFLOW:
 404             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 405                            ouv, *s);
 406             break;
 407         case UTF8_WARN_SURROGATE:
 408             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 409             break;
 410         case UTF8_WARN_BOM:
 411             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 412             break;
 413         case UTF8_WARN_LONG:
 414             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 415                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 416             break;
 417         case UTF8_WARN_FFFF:
 418             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 419             break;
 420         default:
 421             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 422             break;
 423         }
 424
 425         if (warning) {
 426             char *s = SvPVX(sv);
 427
 428             if (PL_op)
 429                 Perl_warner(aTHX_ WARN_UTF8,
 430                             "%s in %s", s,  PL_op_desc[PL_op->op_type]);
 431             else
 432                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 433         }
 434     }
 435
 436     if (retlen)
 437         *retlen = expectlen ? expectlen : len;
 438
 439     return 0;
 440 }
 441
 442 /*
 443 =for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
 444
 445 Returns the native character value of the first character in the string C<s>
 446 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 447 length, in bytes, of that character.
 448
 449 Allows length and flags to be passed to low level routine.
 450
 451 =cut
 452 */
 453
 454 UV
 455 Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 456 {
 457     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
 458     if (uv < 0x100)
 459         return (UV) ASCII_TO_NATIVE(uv);
 460     return uv;
 461 }
 462
 463 /*
 464 =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
 465
 466 Returns the native character value of the first character in the string C<s>
 467 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 468 length, in bytes, of that character.
 469
 470 If C<s> does not point to a well-formed UTF8 character, zero is
 471 returned and retlen is set, if possible, to -1.
 472
 473 =cut
 474 */
 475
 476 UV
 477 Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
 478 {
 479     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 480 }
 481
 482 /*
 483 =for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
 484
 485 Returns the Unicode code point of the first character in the string C<s>
 486 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 487 length, in bytes, of that character.
 488
 489 This function should only be used when returned UV is considered
 490 an index into the Unicode semantic tables (e.g. swashes).
 491
 492 If C<s> does not point to a well-formed UTF8 character, zero is
 493 returned and retlen is set, if possible, to -1.
 494
 495 =cut
 496 */
 497
 498 UV
 499 Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
 500 {
 501     /* Call the low level routine asking for checks */
 502     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 503 }
 504
 505 /*
 506 =for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
 507
 508 Return the length of the UTF-8 char encoded string C<s> in characters.
 509 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 510 up past C<e>, croaks.
 511
 512 =cut
 513 */
 514
 515 STRLEN
 516 Perl_utf8_length(pTHX_ U8* s, U8* e)
 517 {
 518     STRLEN len = 0;
 519
 520     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 521      * the bitops (especially ~) can create illegal UTF-8.
 522      * In other words: in Perl UTF-8 is not just for Unicode. */
 523
 524     if (e < s)
 525         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 526     while (s < e) {
 527         U8 t = UTF8SKIP(s);
 528
 529         if (e - s < t)
 530             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 531         s += t;
 532         len++;
 533     }
 534
 535     return len;
 536 }
 537
 538 /*
 539 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 540
 541 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 542 and C<b>.
 543
 544 WARNING: use only if you *know* that the pointers point inside the
 545 same UTF-8 buffer.
 546
 547 =cut */
 548
 549 IV
 550 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 551 {
 552     IV off = 0;
 553
 554     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 555      * the bitops (especially ~) can create illegal UTF-8.
 556      * In other words: in Perl UTF-8 is not just for Unicode. */
 557
 558     if (a < b) {
 559         while (a < b) {
 560             U8 c = UTF8SKIP(a);
 561
 562             if (b - a < c)
 563                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 564             a += c;
 565             off--;
 566         }
 567     }
 568     else {
 569         while (b < a) {
 570             U8 c = UTF8SKIP(b);
 571
 572             if (a - b < c)
 573                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 574             b += c;
 575             off++;
 576         }
 577     }
 578
 579     return off;
 580 }
 581
 582 /*
 583 =for apidoc A|U8*|utf8_hop|U8 *s|I32 off
 584
 585 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 586 forward or backward.
 587
 588 WARNING: do not use the following unless you *know* C<off> is within
 589 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 590 on the first byte of character or just after the last byte of a character.
 591
 592 =cut */
 593
 594 U8 *
 595 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 596 {
 597     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 598      * the bitops (especially ~) can create illegal UTF-8.
 599      * In other words: in Perl UTF-8 is not just for Unicode. */
 600
 601     if (off >= 0) {
 602         while (off--)
 603             s += UTF8SKIP(s);
 604     }
 605     else {
 606         while (off++) {
 607             s--;
 608             while (UTF8_IS_CONTINUATION(*s))
 609                 s--;
 610         }
 611     }
 612     return s;
 613 }
 614
 615 /*
 616 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 617
 618 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 619 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 620 updates len to contain the new length.
 621 Returns zero on failure, setting C<len> to -1.
 622
 623 =cut
 624 */
 625
 626 U8 *
 627 Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 628 {
 629     U8 *send;
 630     U8 *d;
 631     U8 *save = s;
 632
 633     /* ensure valid UTF8 and chars < 256 before updating string */
 634     for (send = s + *len; s < send; ) {
 635         U8 c = *s++;
 636
 637         if (c >= 0x80 &&
 638             ((s >= send) ||
 639              ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
 640             *len = -1;
 641             return 0;
 642         }
 643     }
 644
 645     d = s = save;
 646     while (s < send) {
 647         STRLEN ulen;
 648         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 649         s += ulen;
 650     }
 651     *d = '\0';
 652     *len = d - save;
 653     return save;
 654 }
 655
 656 /*
 657 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 658
 659 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 660 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 661 the newly-created string, and updates C<len> to contain the new
 662 length.  Returns the original string if no conversion occurs, C<len>
 663 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 664 0 if C<s> is converted or contains all 7bit characters.
 665
 666 =cut */
 667
 668 U8 *
 669 Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
 670 {
 671     U8 *send;
 672     U8 *d;
 673     U8 *start = s;
 674     I32 count = 0;
 675
 676     if (!*is_utf8)
 677         return start;
 678
 679     /* ensure valid UTF8 and chars < 256 before converting string */
 680     for (send = s + *len; s < send;) {
 681         U8 c = *s++;
 682         if (!UTF8_IS_ASCII(c)) {
 683             if (UTF8_IS_CONTINUATION(c) || s >= send ||
 684                 !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c))
 685                 return start;
 686             s++, count++;
 687         }
 688     }
 689
 690     *is_utf8 = 0;
 691
 692     if (!count)
 693         return start;
 694
 695     Newz(801, d, (*len) - count + 1, U8);
 696     s = start; start = d;
 697     while (s < send) {
 698         U8 c = *s++;
 699
 700         if (UTF8_IS_ASCII(c))
 701             *d++ = c;
 702         else
 703             *d++ = UTF8_ACCUMULATE(c, *s++);
 704     }
 705     *d = '\0';
 706     *len = d - start;
 707     return start;
 708 }
 709
 710 /*
 711 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 712
 713 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 714 Returns a pointer to the newly-created string, and sets C<len> to
 715 reflect the new length.
 716
 717 =cut
 718 */
 719
 720 U8*
 721 Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
 722 {
 723     U8 *send;
 724     U8 *d;
 725     U8 *dst;
 726     send = s + (*len);
 727
 728     Newz(801, d, (*len) * 2 + 1, U8);
 729     dst = d;
 730
 731     while (s < send) {
 732         if (UTF8_IS_ASCII(*s))
 733             *d++ = *s++;
 734         else {
 735             UV uv = *s++;
 736
 737             *d++ = UTF8_EIGHT_BIT_HI(uv);
 738             *d++ = UTF8_EIGHT_BIT_LO(uv);
 739         }
 740     }
 741     *d = '\0';
 742     *len = d-dst;
 743     return dst;
 744 }
 745
 746 /*
 747  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 748  *
 749  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 750  * We optimize for native, for obvious reasons. */
 751
 752 U8*
 753 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 754 {
 755     U8* pend;
 756     U8* dstart = d;
 757
 758     if (bytelen & 1)
 759         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 760
 761     pend = p + bytelen;
 762
 763     while (p < pend) {
 764         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 765         p += 2;
 766         if (uv < 0x80) {
 767             *d++ = uv;
 768             continue;
 769         }
 770         if (uv < 0x800) {
 771             *d++ = (( uv >>  6)         | 0xc0);
 772             *d++ = (( uv        & 0x3f) | 0x80);
 773             continue;
 774         }
 775         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 776             UV low = *p++;
 777             if (low < 0xdc00 || low >= 0xdfff)
 778                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 779             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 780         }
 781         if (uv < 0x10000) {
 782             *d++ = (( uv >> 12)         | 0xe0);
 783             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 784             *d++ = (( uv        & 0x3f) | 0x80);
 785             continue;
 786         }
 787         else {
 788             *d++ = (( uv >> 18)         | 0xf0);
 789             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 790             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 791             *d++ = (( uv        & 0x3f) | 0x80);
 792             continue;
 793         }
 794     }
 795     *newlen = d - dstart;
 796     return d;
 797 }
 798
 799 /* Note: this one is slightly destructive of the source. */
 800
 801 U8*
 802 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 803 {
 804     U8* s = (U8*)p;
 805     U8* send = s + bytelen;
 806     while (s < send) {
 807         U8 tmp = s[0];
 808         s[0] = s[1];
 809         s[1] = tmp;
 810         s += 2;
 811     }
 812     return utf16_to_utf8(p, d, bytelen, newlen);
 813 }
 814
 815 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 816
 817 bool
 818 Perl_is_uni_alnum(pTHX_ U32 c)
 819 {
 820     U8 tmpbuf[UTF8_MAXLEN+1];
 821     uvuni_to_utf8(tmpbuf, (UV)c);
 822     return is_utf8_alnum(tmpbuf);
 823 }
 824
 825 bool
 826 Perl_is_uni_alnumc(pTHX_ U32 c)
 827 {
 828     U8 tmpbuf[UTF8_MAXLEN+1];
 829     uvuni_to_utf8(tmpbuf, (UV)c);
 830     return is_utf8_alnumc(tmpbuf);
 831 }
 832
 833 bool
 834 Perl_is_uni_idfirst(pTHX_ U32 c)
 835 {
 836     U8 tmpbuf[UTF8_MAXLEN+1];
 837     uvuni_to_utf8(tmpbuf, (UV)c);
 838     return is_utf8_idfirst(tmpbuf);
 839 }
 840
 841 bool
 842 Perl_is_uni_alpha(pTHX_ U32 c)
 843 {
 844     U8 tmpbuf[UTF8_MAXLEN+1];
 845     uvuni_to_utf8(tmpbuf, (UV)c);
 846     return is_utf8_alpha(tmpbuf);
 847 }
 848
 849 bool
 850 Perl_is_uni_ascii(pTHX_ U32 c)
 851 {
 852     U8 tmpbuf[UTF8_MAXLEN+1];
 853     uvuni_to_utf8(tmpbuf, (UV)c);
 854     return is_utf8_ascii(tmpbuf);
 855 }
 856
 857 bool
 858 Perl_is_uni_space(pTHX_ U32 c)
 859 {
 860     U8 tmpbuf[UTF8_MAXLEN+1];
 861     uvuni_to_utf8(tmpbuf, (UV)c);
 862     return is_utf8_space(tmpbuf);
 863 }
 864
 865 bool
 866 Perl_is_uni_digit(pTHX_ U32 c)
 867 {
 868     U8 tmpbuf[UTF8_MAXLEN+1];
 869     uvuni_to_utf8(tmpbuf, (UV)c);
 870     return is_utf8_digit(tmpbuf);
 871 }
 872
 873 bool
 874 Perl_is_uni_upper(pTHX_ U32 c)
 875 {
 876     U8 tmpbuf[UTF8_MAXLEN+1];
 877     uvuni_to_utf8(tmpbuf, (UV)c);
 878     return is_utf8_upper(tmpbuf);
 879 }
 880
 881 bool
 882 Perl_is_uni_lower(pTHX_ U32 c)
 883 {
 884     U8 tmpbuf[UTF8_MAXLEN+1];
 885     uvuni_to_utf8(tmpbuf, (UV)c);
 886     return is_utf8_lower(tmpbuf);
 887 }
 888
 889 bool
 890 Perl_is_uni_cntrl(pTHX_ U32 c)
 891 {
 892     U8 tmpbuf[UTF8_MAXLEN+1];
 893     uvuni_to_utf8(tmpbuf, (UV)c);
 894     return is_utf8_cntrl(tmpbuf);
 895 }
 896
 897 bool
 898 Perl_is_uni_graph(pTHX_ U32 c)
 899 {
 900     U8 tmpbuf[UTF8_MAXLEN+1];
 901     uvuni_to_utf8(tmpbuf, (UV)c);
 902     return is_utf8_graph(tmpbuf);
 903 }
 904
 905 bool
 906 Perl_is_uni_print(pTHX_ U32 c)
 907 {
 908     U8 tmpbuf[UTF8_MAXLEN+1];
 909     uvuni_to_utf8(tmpbuf, (UV)c);
 910     return is_utf8_print(tmpbuf);
 911 }
 912
 913 bool
 914 Perl_is_uni_punct(pTHX_ U32 c)
 915 {
 916     U8 tmpbuf[UTF8_MAXLEN+1];
 917     uvuni_to_utf8(tmpbuf, (UV)c);
 918     return is_utf8_punct(tmpbuf);
 919 }
 920
 921 bool
 922 Perl_is_uni_xdigit(pTHX_ U32 c)
 923 {
 924     U8 tmpbuf[UTF8_MAXLEN+1];
 925     uvuni_to_utf8(tmpbuf, (UV)c);
 926     return is_utf8_xdigit(tmpbuf);
 927 }
 928
 929 U32
 930 Perl_to_uni_upper(pTHX_ U32 c)
 931 {
 932     U8 tmpbuf[UTF8_MAXLEN+1];
 933     uvuni_to_utf8(tmpbuf, (UV)c);
 934     return to_utf8_upper(tmpbuf);
 935 }
 936
 937 U32
 938 Perl_to_uni_title(pTHX_ U32 c)
 939 {
 940     U8 tmpbuf[UTF8_MAXLEN+1];
 941     uvuni_to_utf8(tmpbuf, (UV)c);
 942     return to_utf8_title(tmpbuf);
 943 }
 944
 945 U32
 946 Perl_to_uni_lower(pTHX_ U32 c)
 947 {
 948     U8 tmpbuf[UTF8_MAXLEN+1];
 949     uvuni_to_utf8(tmpbuf, (UV)c);
 950     return to_utf8_lower(tmpbuf);
 951 }
 952
 953 /* for now these all assume no locale info available for Unicode > 255 */
 954
 955 bool
 956 Perl_is_uni_alnum_lc(pTHX_ U32 c)
 957 {
 958     return is_uni_alnum(c);     /* XXX no locale support yet */
 959 }
 960
 961 bool
 962 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
 963 {
 964     return is_uni_alnumc(c);    /* XXX no locale support yet */
 965 }
 966
 967 bool
 968 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
 969 {
 970     return is_uni_idfirst(c);   /* XXX no locale support yet */
 971 }
 972
 973 bool
 974 Perl_is_uni_alpha_lc(pTHX_ U32 c)
 975 {
 976     return is_uni_alpha(c);     /* XXX no locale support yet */
 977 }
 978
 979 bool
 980 Perl_is_uni_ascii_lc(pTHX_ U32 c)
 981 {
 982     return is_uni_ascii(c);     /* XXX no locale support yet */
 983 }
 984
 985 bool
 986 Perl_is_uni_space_lc(pTHX_ U32 c)
 987 {
 988     return is_uni_space(c);     /* XXX no locale support yet */
 989 }
 990
 991 bool
 992 Perl_is_uni_digit_lc(pTHX_ U32 c)
 993 {
 994     return is_uni_digit(c);     /* XXX no locale support yet */
 995 }
 996
 997 bool
 998 Perl_is_uni_upper_lc(pTHX_ U32 c)
 999 {
1000     return is_uni_upper(c);     /* XXX no locale support yet */
1001 }
1002
1003 bool
1004 Perl_is_uni_lower_lc(pTHX_ U32 c)
1005 {
1006     return is_uni_lower(c);     /* XXX no locale support yet */
1007 }
1008
1009 bool
1010 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1011 {
1012     return is_uni_cntrl(c);     /* XXX no locale support yet */
1013 }
1014
1015 bool
1016 Perl_is_uni_graph_lc(pTHX_ U32 c)
1017 {
1018     return is_uni_graph(c);     /* XXX no locale support yet */
1019 }
1020
1021 bool
1022 Perl_is_uni_print_lc(pTHX_ U32 c)
1023 {
1024     return is_uni_print(c);     /* XXX no locale support yet */
1025 }
1026
1027 bool
1028 Perl_is_uni_punct_lc(pTHX_ U32 c)
1029 {
1030     return is_uni_punct(c);     /* XXX no locale support yet */
1031 }
1032
1033 bool
1034 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1035 {
1036     return is_uni_xdigit(c);    /* XXX no locale support yet */
1037 }
1038
1039 U32
1040 Perl_to_uni_upper_lc(pTHX_ U32 c)
1041 {
1042     return to_uni_upper(c);     /* XXX no locale support yet */
1043 }
1044
1045 U32
1046 Perl_to_uni_title_lc(pTHX_ U32 c)
1047 {
1048     return to_uni_title(c);     /* XXX no locale support yet */
1049 }
1050
1051 U32
1052 Perl_to_uni_lower_lc(pTHX_ U32 c)
1053 {
1054     return to_uni_lower(c);     /* XXX no locale support yet */
1055 }
1056
1057 bool
1058 Perl_is_utf8_alnum(pTHX_ U8 *p)
1059 {
1060     if (!is_utf8_char(p))
1061         return FALSE;
1062     if (!PL_utf8_alnum)
1063         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1064          * descendant of isalnum(3), in other words, it doesn't
1065          * contain the '_'. --jhi */
1066         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1067     return swash_fetch(PL_utf8_alnum, p);
1068 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1069 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1070     if (!PL_utf8_alnum)
1071         PL_utf8_alnum = swash_init("utf8", "",
1072             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1073     return swash_fetch(PL_utf8_alnum, p);
1074 #endif
1075 }
1076
1077 bool
1078 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1079 {
1080     if (!is_utf8_char(p))
1081         return FALSE;
1082     if (!PL_utf8_alnum)
1083         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1084     return swash_fetch(PL_utf8_alnum, p);
1085 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1086 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1087     if (!PL_utf8_alnum)
1088         PL_utf8_alnum = swash_init("utf8", "",
1089             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1090     return swash_fetch(PL_utf8_alnum, p);
1091 #endif
1092 }
1093
1094 bool
1095 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1096 {
1097     return *p == '_' || is_utf8_alpha(p);
1098 }
1099
1100 bool
1101 Perl_is_utf8_alpha(pTHX_ U8 *p)
1102 {
1103     if (!is_utf8_char(p))
1104         return FALSE;
1105     if (!PL_utf8_alpha)
1106         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1107     return swash_fetch(PL_utf8_alpha, p);
1108 }
1109
1110 bool
1111 Perl_is_utf8_ascii(pTHX_ U8 *p)
1112 {
1113     if (!is_utf8_char(p))
1114         return FALSE;
1115     if (!PL_utf8_ascii)
1116         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1117     return swash_fetch(PL_utf8_ascii, p);
1118 }
1119
1120 bool
1121 Perl_is_utf8_space(pTHX_ U8 *p)
1122 {
1123     if (!is_utf8_char(p))
1124         return FALSE;
1125     if (!PL_utf8_space)
1126         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1127     return swash_fetch(PL_utf8_space, p);
1128 }
1129
1130 bool
1131 Perl_is_utf8_digit(pTHX_ U8 *p)
1132 {
1133     if (!is_utf8_char(p))
1134         return FALSE;
1135     if (!PL_utf8_digit)
1136         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1137     return swash_fetch(PL_utf8_digit, p);
1138 }
1139
1140 bool
1141 Perl_is_utf8_upper(pTHX_ U8 *p)
1142 {
1143     if (!is_utf8_char(p))
1144         return FALSE;
1145     if (!PL_utf8_upper)
1146         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1147     return swash_fetch(PL_utf8_upper, p);
1148 }
1149
1150 bool
1151 Perl_is_utf8_lower(pTHX_ U8 *p)
1152 {
1153     if (!is_utf8_char(p))
1154         return FALSE;
1155     if (!PL_utf8_lower)
1156         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1157     return swash_fetch(PL_utf8_lower, p);
1158 }
1159
1160 bool
1161 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1162 {
1163     if (!is_utf8_char(p))
1164         return FALSE;
1165     if (!PL_utf8_cntrl)
1166         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1167     return swash_fetch(PL_utf8_cntrl, p);
1168 }
1169
1170 bool
1171 Perl_is_utf8_graph(pTHX_ U8 *p)
1172 {
1173     if (!is_utf8_char(p))
1174         return FALSE;
1175     if (!PL_utf8_graph)
1176         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1177     return swash_fetch(PL_utf8_graph, p);
1178 }
1179
1180 bool
1181 Perl_is_utf8_print(pTHX_ U8 *p)
1182 {
1183     if (!is_utf8_char(p))
1184         return FALSE;
1185     if (!PL_utf8_print)
1186         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1187     return swash_fetch(PL_utf8_print, p);
1188 }
1189
1190 bool
1191 Perl_is_utf8_punct(pTHX_ U8 *p)
1192 {
1193     if (!is_utf8_char(p))
1194         return FALSE;
1195     if (!PL_utf8_punct)
1196         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1197     return swash_fetch(PL_utf8_punct, p);
1198 }
1199
1200 bool
1201 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1202 {
1203     if (!is_utf8_char(p))
1204         return FALSE;
1205     if (!PL_utf8_xdigit)
1206         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1207     return swash_fetch(PL_utf8_xdigit, p);
1208 }
1209
1210 bool
1211 Perl_is_utf8_mark(pTHX_ U8 *p)
1212 {
1213     if (!is_utf8_char(p))
1214         return FALSE;
1215     if (!PL_utf8_mark)
1216         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1217     return swash_fetch(PL_utf8_mark, p);
1218 }
1219
1220 UV
1221 Perl_to_utf8_upper(pTHX_ U8 *p)
1222 {
1223     UV uv;
1224
1225     if (!PL_utf8_toupper)
1226         PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1227     uv = swash_fetch(PL_utf8_toupper, p);
1228     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1229 }
1230
1231 UV
1232 Perl_to_utf8_title(pTHX_ U8 *p)
1233 {
1234     UV uv;
1235
1236     if (!PL_utf8_totitle)
1237         PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1238     uv = swash_fetch(PL_utf8_totitle, p);
1239     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1240 }
1241
1242 UV
1243 Perl_to_utf8_lower(pTHX_ U8 *p)
1244 {
1245     UV uv;
1246
1247     if (!PL_utf8_tolower)
1248         PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1249     uv = swash_fetch(PL_utf8_tolower, p);
1250     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1251 }
1252
1253 /* a "swash" is a swatch hash */
1254
1255 SV*
1256 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1257 {
1258     SV* retval;
1259     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1260     dSP;
1261
1262     if (!gv_stashpv(pkg, 0)) {  /* demand load utf8 */
1263         ENTER;
1264         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1265         LEAVE;
1266     }
1267     SPAGAIN;
1268     PUSHSTACKi(PERLSI_MAGIC);
1269     PUSHMARK(SP);
1270     EXTEND(SP,5);
1271     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1272     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1273     PUSHs(listsv);
1274     PUSHs(sv_2mortal(newSViv(minbits)));
1275     PUSHs(sv_2mortal(newSViv(none)));
1276     PUTBACK;
1277     ENTER;
1278     SAVEI32(PL_hints);
1279     PL_hints = 0;
1280     save_re_context();
1281     if (PL_curcop == &PL_compiling)
1282         /* XXX ought to be handled by lex_start */
1283         sv_setpv(tokenbufsv, PL_tokenbuf);
1284     if (call_method("SWASHNEW", G_SCALAR))
1285         retval = newSVsv(*PL_stack_sp--);
1286     else
1287         retval = &PL_sv_undef;
1288     LEAVE;
1289     POPSTACK;
1290     if (PL_curcop == &PL_compiling) {
1291         STRLEN len;
1292         char* pv = SvPV(tokenbufsv, len);
1293
1294         Copy(pv, PL_tokenbuf, len+1, char);
1295         PL_curcop->op_private = PL_hints;
1296     }
1297     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1298         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1299     return retval;
1300 }
1301
1302 UV
1303 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1304 {
1305     HV* hv = (HV*)SvRV(sv);
1306     U32 klen = UTF8SKIP(ptr) - 1;
1307     U32 off = ptr[klen] & 127;  /* NB: 64 bit always 0 when len > 1 */
1308     STRLEN slen;
1309     STRLEN needents = (klen ? 64 : 128);
1310     U8 *tmps;
1311     U32 bit;
1312     SV *retval;
1313
1314     /*
1315      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1316      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1317      * it's nothing to sniff at.)  Pity we usually come through at least
1318      * two function calls to get here...
1319      *
1320      * NB: this code assumes that swatches are never modified, once generated!
1321      */
1322
1323     if (hv == PL_last_swash_hv &&
1324         klen == PL_last_swash_klen &&
1325         (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
1326     {
1327         tmps = PL_last_swash_tmps;
1328         slen = PL_last_swash_slen;
1329     }
1330     else {
1331         /* Try our second-level swatch cache, kept in a hash. */
1332         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1333
1334         /* If not cached, generate it via utf8::SWASHGET */
1335         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1336             dSP;
1337             ENTER;
1338             SAVETMPS;
1339             save_re_context();
1340             PUSHSTACKi(PERLSI_MAGIC);
1341             PUSHMARK(SP);
1342             EXTEND(SP,3);
1343             PUSHs((SV*)sv);
1344             /* We call utf8_to_uni as we want and index into Unicode tables,
1345                not a native character number.
1346              */
1347             PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
1348             PUSHs(sv_2mortal(newSViv(needents)));
1349             PUTBACK;
1350             if (call_method("SWASHGET", G_SCALAR))
1351                 retval = newSVsv(*PL_stack_sp--);
1352             else
1353                 retval = &PL_sv_undef;
1354             POPSTACK;
1355             FREETMPS;
1356             LEAVE;
1357             if (PL_curcop == &PL_compiling)
1358                 PL_curcop->op_private = PL_hints;
1359
1360             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1361
1362             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
1363                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1364         }
1365
1366         PL_last_swash_hv = hv;
1367         PL_last_swash_klen = klen;
1368         PL_last_swash_tmps = tmps;
1369         PL_last_swash_slen = slen;
1370         if (klen)
1371             Copy(ptr, PL_last_swash_key, klen, U8);
1372     }
1373
1374     switch ((int)((slen << 3) / needents)) {
1375     case 1:
1376         bit = 1 << (off & 7);
1377         off >>= 3;
1378         return (tmps[off] & bit) != 0;
1379     case 8:
1380         return tmps[off];
1381     case 16:
1382         off <<= 1;
1383         return (tmps[off] << 8) + tmps[off + 1] ;
1384     case 32:
1385         off <<= 2;
1386         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1387     }
1388     Perl_croak(aTHX_ "panic: swash_fetch");
1389     return 0;
1390 }