utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (UNI_IS_INVARIANT(uv)) {
  50         *d++ = UTF_TO_NATIVE(uv);
  51         return d;
  52     }
  53 #if defined(EBCDIC) || 1 /* always for testing */
  54     else {
  55         STRLEN len  = UNISKIP(uv);
  56         U8 *p = d+len-1;
  57         while (p > d) {
  58             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  59             uv >>= UTF_ACCUMULATION_SHIFT;
  60         }
  61         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  62         return d+len;
  63     }
  64 #else /* Non loop style */
  65     if (uv < 0x800) {
  66         *d++ = (( uv >>  6)         | 0xc0);
  67         *d++ = (( uv        & 0x3f) | 0x80);
  68         return d;
  69     }
  70     if (uv < 0x10000) {
  71         *d++ = (( uv >> 12)         | 0xe0);
  72         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  73         *d++ = (( uv        & 0x3f) | 0x80);
  74         return d;
  75     }
  76     if (uv < 0x200000) {
  77         *d++ = (( uv >> 18)         | 0xf0);
  78         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  79         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  80         *d++ = (( uv        & 0x3f) | 0x80);
  81         return d;
  82     }
  83     if (uv < 0x4000000) {
  84         *d++ = (( uv >> 24)         | 0xf8);
  85         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  86         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  87         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  88         *d++ = (( uv        & 0x3f) | 0x80);
  89         return d;
  90     }
  91     if (uv < 0x80000000) {
  92         *d++ = (( uv >> 30)         | 0xfc);
  93         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  95         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  96         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  97         *d++ = (( uv        & 0x3f) | 0x80);
  98         return d;
  99     }
 100 #ifdef HAS_QUAD
 101     if (uv < UTF8_QUAD_MAX)
 102 #endif
 103     {
 104         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 105         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 106         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 107         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 108         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 109         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 110         *d++ = (( uv        & 0x3f) | 0x80);
 111         return d;
 112     }
 113 #ifdef HAS_QUAD
 114     {
 115         *d++ =                        0xff;     /* Can't match U+FFFE! */
 116         *d++ =                        0x80;     /* 6 Reserved bits */
 117         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 118         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 119         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 120         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 121         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 122         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 123         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 124         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 125         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 126         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 127         *d++ = (( uv        & 0x3f) | 0x80);
 128         return d;
 129     }
 130 #endif
 131 #endif /* Loop style */
 132 }
 133
 134 /*
 135 =for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
 136
 137 Adds the UTF8 representation of the Native codepoint C<uv> to the end
 138 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
 139 bytes available. The return value is the pointer to the byte after the
 140 end of the new character. In other words,
 141
 142     d = uvchr_to_utf8(d, uv);
 143
 144 is the recommended wide native character-aware way of saying
 145
 146     *(d++) = uv;
 147
 148 =cut
 149 */
 150
 151 U8 *
 152 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 153 {
 154     return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
 155 }
 156
 157
 158 /*
 159 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 160
 161 Tests if some arbitrary number of bytes begins in a valid UTF-8
 162 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 163 The actual number of bytes in the UTF-8 character will be returned if
 164 it is valid, otherwise 0.
 165
 166 =cut */
 167 STRLEN
 168 Perl_is_utf8_char(pTHX_ U8 *s)
 169 {
 170     U8 u = *s;
 171     STRLEN slen, len;
 172     UV uv, ouv;
 173
 174     if (UTF8_IS_INVARIANT(u))
 175         return 1;
 176
 177     if (!UTF8_IS_START(u))
 178         return 0;
 179
 180     len = UTF8SKIP(s);
 181
 182     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 183         return 0;
 184
 185     slen = len - 1;
 186     s++;
 187     /* The initial value is dubious */
 188     uv  = u;
 189     ouv = uv;
 190     while (slen--) {
 191         if (!UTF8_IS_CONTINUATION(*s))
 192             return 0;
 193         uv = UTF8_ACCUMULATE(uv, *s);
 194         if (uv < ouv)
 195             return 0;
 196         ouv = uv;
 197         s++;
 198     }
 199
 200     if (UNISKIP(uv) < len)
 201         return 0;
 202
 203     return len;
 204 }
 205
 206 /*
 207 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 208
 209 Returns true if first C<len> bytes of the given string form a valid UTF8
 210 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 211 'a string that contains UTF8' because a valid ASCII string is a valid
 212 UTF8 string.
 213
 214 =cut
 215 */
 216
 217 bool
 218 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 219 {
 220     U8* x = s;
 221     U8* send;
 222     STRLEN c;
 223
 224     if (!len)
 225         len = strlen((char *)s);
 226     send = s + len;
 227
 228     while (x < send) {
 229         c = is_utf8_char(x);
 230         if (!c)
 231             return FALSE;
 232         x += c;
 233     }
 234     if (x != send)
 235         return FALSE;
 236
 237     return TRUE;
 238 }
 239
 240 /*
 241 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 242
 243 Bottom level UTF-8 decode routine.
 244 Returns the unicode code point value of the first character in the string C<s>
 245 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 246 C<retlen> will be set to the length, in bytes, of that character.
 247
 248 If C<s> does not point to a well-formed UTF8 character, the behaviour
 249 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 250 it is assumed that the caller will raise a warning, and this function
 251 will silently just set C<retlen> to C<-1> and return zero.  If the
 252 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 253 malformations will be given, C<retlen> will be set to the expected
 254 length of the UTF-8 character in bytes, and zero will be returned.
 255
 256 The C<flags> can also contain various flags to allow deviations from
 257 the strict UTF-8 encoding (see F<utf8.h>).
 258
 259 Most code should use utf8_to_uvchr() rather than call this directly.
 260
 261 =cut */
 262
 263 UV
 264 Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 265 {
 266     UV uv = *s, ouv;
 267     STRLEN len = 1;
 268     bool dowarn = ckWARN_d(WARN_UTF8);
 269     STRLEN expectlen = 0;
 270     U32 warning = 0;
 271
 272 /* This list is a superset of the UTF8_ALLOW_XXX. */
 273
 274 #define UTF8_WARN_EMPTY                          1
 275 #define UTF8_WARN_CONTINUATION                   2
 276 #define UTF8_WARN_NON_CONTINUATION               3
 277 #define UTF8_WARN_FE_FF                          4
 278 #define UTF8_WARN_SHORT                          5
 279 #define UTF8_WARN_OVERFLOW                       6
 280 #define UTF8_WARN_SURROGATE                      7
 281 #define UTF8_WARN_BOM                            8
 282 #define UTF8_WARN_LONG                           9
 283 #define UTF8_WARN_FFFF                          10
 284
 285     if (curlen == 0 &&
 286         !(flags & UTF8_ALLOW_EMPTY)) {
 287         warning = UTF8_WARN_EMPTY;
 288         goto malformed;
 289     }
 290
 291     if (UTF8_IS_INVARIANT(uv)) {
 292         if (retlen)
 293             *retlen = 1;
 294         return (UV) (NATIVE_TO_UTF(*s));
 295     }
 296
 297     if (UTF8_IS_CONTINUATION(uv) &&
 298         !(flags & UTF8_ALLOW_CONTINUATION)) {
 299         warning = UTF8_WARN_CONTINUATION;
 300         goto malformed;
 301     }
 302
 303     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 304         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 305         warning = UTF8_WARN_NON_CONTINUATION;
 306         goto malformed;
 307     }
 308
 309 #ifdef EBCDIC
 310     uv = NATIVE_TO_UTF(uv);
 311 #else
 312     if ((uv == 0xfe || uv == 0xff) &&
 313         !(flags & UTF8_ALLOW_FE_FF)) {
 314         warning = UTF8_WARN_FE_FF;
 315         goto malformed;
 316     }
 317 #endif
 318
 319     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 320     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 321     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 322     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 323 #ifdef EBCDIC
 324     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 325     else                        { len =  7; uv &= 0x01; }
 326 #else
 327     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 328     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 329     else                        { len = 13; uv = 0; } /* whoa! */
 330 #endif
 331
 332     if (retlen)
 333         *retlen = len;
 334
 335     expectlen = len;
 336
 337     if ((curlen < expectlen) &&
 338         !(flags & UTF8_ALLOW_SHORT)) {
 339         warning = UTF8_WARN_SHORT;
 340         goto malformed;
 341     }
 342
 343     len--;
 344     s++;
 345     ouv = uv;
 346
 347     while (len--) {
 348         if (!UTF8_IS_CONTINUATION(*s) &&
 349             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 350             s--;
 351             warning = UTF8_WARN_NON_CONTINUATION;
 352             goto malformed;
 353         }
 354         else
 355             uv = UTF8_ACCUMULATE(uv, *s);
 356         if (!(uv > ouv)) {
 357             /* These cannot be allowed. */
 358             if (uv == ouv) {
 359                 if (!(flags & UTF8_ALLOW_LONG)) {
 360                     warning = UTF8_WARN_LONG;
 361                     goto malformed;
 362                 }
 363             }
 364             else { /* uv < ouv */
 365                 /* This cannot be allowed. */
 366                 warning = UTF8_WARN_OVERFLOW;
 367                 goto malformed;
 368             }
 369         }
 370         s++;
 371         ouv = uv;
 372     }
 373
 374     if (UNICODE_IS_SURROGATE(uv) &&
 375         !(flags & UTF8_ALLOW_SURROGATE)) {
 376         warning = UTF8_WARN_SURROGATE;
 377         goto malformed;
 378     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 379                !(flags & UTF8_ALLOW_BOM)) {
 380         warning = UTF8_WARN_BOM;
 381         goto malformed;
 382     } else if ((expectlen > UNISKIP(uv)) &&
 383                !(flags & UTF8_ALLOW_LONG)) {
 384         warning = UTF8_WARN_LONG;
 385         goto malformed;
 386     } else if (UNICODE_IS_ILLEGAL(uv) &&
 387                !(flags & UTF8_ALLOW_FFFF)) {
 388         warning = UTF8_WARN_FFFF;
 389         goto malformed;
 390     }
 391
 392     return uv;
 393
 394 malformed:
 395
 396     if (flags & UTF8_CHECK_ONLY) {
 397         if (retlen)
 398             *retlen = -1;
 399         return 0;
 400     }
 401
 402     if (dowarn) {
 403         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 404
 405         switch (warning) {
 406         case 0: /* Intentionally empty. */ break;
 407         case UTF8_WARN_EMPTY:
 408             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 409             break;
 410         case UTF8_WARN_CONTINUATION:
 411             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 412             break;
 413         case UTF8_WARN_NON_CONTINUATION:
 414             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 415                            (UV)s[1], uv);
 416             break;
 417         case UTF8_WARN_FE_FF:
 418             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 419             break;
 420         case UTF8_WARN_SHORT:
 421             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 422                            curlen, curlen == 1 ? "" : "s", expectlen);
 423             break;
 424         case UTF8_WARN_OVERFLOW:
 425             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 426                            ouv, *s);
 427             break;
 428         case UTF8_WARN_SURROGATE:
 429             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 430             break;
 431         case UTF8_WARN_BOM:
 432             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 433             break;
 434         case UTF8_WARN_LONG:
 435             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 436                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 437             break;
 438         case UTF8_WARN_FFFF:
 439             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 440             break;
 441         default:
 442             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 443             break;
 444         }
 445
 446         if (warning) {
 447             char *s = SvPVX(sv);
 448
 449             if (PL_op)
 450                 Perl_warner(aTHX_ WARN_UTF8,
 451                             "%s in %s", s,  PL_op_desc[PL_op->op_type]);
 452             else
 453                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 454         }
 455     }
 456
 457     if (retlen)
 458         *retlen = expectlen ? expectlen : len;
 459
 460     return 0;
 461 }
 462
 463 /*
 464 =for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
 465
 466 Returns the native character value of the first character in the string C<s>
 467 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 468 length, in bytes, of that character.
 469
 470 Allows length and flags to be passed to low level routine.
 471
 472 =cut
 473 */
 474
 475 UV
 476 Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 477 {
 478     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
 479     return UNI_TO_NATIVE(uv);
 480 }
 481
 482 /*
 483 =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
 484
 485 Returns the native character value of the first character in the string C<s>
 486 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 487 length, in bytes, of that character.
 488
 489 If C<s> does not point to a well-formed UTF8 character, zero is
 490 returned and retlen is set, if possible, to -1.
 491
 492 =cut
 493 */
 494
 495 UV
 496 Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
 497 {
 498     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 499 }
 500
 501 /*
 502 =for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
 503
 504 Returns the Unicode code point of the first character in the string C<s>
 505 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 506 length, in bytes, of that character.
 507
 508 This function should only be used when returned UV is considered
 509 an index into the Unicode semantic tables (e.g. swashes).
 510
 511 If C<s> does not point to a well-formed UTF8 character, zero is
 512 returned and retlen is set, if possible, to -1.
 513
 514 =cut
 515 */
 516
 517 UV
 518 Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
 519 {
 520     /* Call the low level routine asking for checks */
 521     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 522 }
 523
 524 /*
 525 =for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
 526
 527 Return the length of the UTF-8 char encoded string C<s> in characters.
 528 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 529 up past C<e>, croaks.
 530
 531 =cut
 532 */
 533
 534 STRLEN
 535 Perl_utf8_length(pTHX_ U8* s, U8* e)
 536 {
 537     STRLEN len = 0;
 538
 539     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 540      * the bitops (especially ~) can create illegal UTF-8.
 541      * In other words: in Perl UTF-8 is not just for Unicode. */
 542
 543     if (e < s)
 544         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 545     while (s < e) {
 546         U8 t = UTF8SKIP(s);
 547
 548         if (e - s < t)
 549             Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
 550         s += t;
 551         len++;
 552     }
 553
 554     return len;
 555 }
 556
 557 /*
 558 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 559
 560 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 561 and C<b>.
 562
 563 WARNING: use only if you *know* that the pointers point inside the
 564 same UTF-8 buffer.
 565
 566 =cut */
 567
 568 IV
 569 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 570 {
 571     IV off = 0;
 572
 573     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 574      * the bitops (especially ~) can create illegal UTF-8.
 575      * In other words: in Perl UTF-8 is not just for Unicode. */
 576
 577     if (a < b) {
 578         while (a < b) {
 579             U8 c = UTF8SKIP(a);
 580
 581             if (b - a < c)
 582                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 583             a += c;
 584             off--;
 585         }
 586     }
 587     else {
 588         while (b < a) {
 589             U8 c = UTF8SKIP(b);
 590
 591             if (a - b < c)
 592                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 593             b += c;
 594             off++;
 595         }
 596     }
 597
 598     return off;
 599 }
 600
 601 /*
 602 =for apidoc A|U8*|utf8_hop|U8 *s|I32 off
 603
 604 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 605 forward or backward.
 606
 607 WARNING: do not use the following unless you *know* C<off> is within
 608 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 609 on the first byte of character or just after the last byte of a character.
 610
 611 =cut */
 612
 613 U8 *
 614 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 615 {
 616     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 617      * the bitops (especially ~) can create illegal UTF-8.
 618      * In other words: in Perl UTF-8 is not just for Unicode. */
 619
 620     if (off >= 0) {
 621         while (off--)
 622             s += UTF8SKIP(s);
 623     }
 624     else {
 625         while (off++) {
 626             s--;
 627             while (UTF8_IS_CONTINUATION(*s))
 628                 s--;
 629         }
 630     }
 631     return s;
 632 }
 633
 634 /*
 635 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 636
 637 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 638 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 639 updates len to contain the new length.
 640 Returns zero on failure, setting C<len> to -1.
 641
 642 =cut
 643 */
 644
 645 U8 *
 646 Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 647 {
 648     U8 *send;
 649     U8 *d;
 650     U8 *save = s;
 651
 652     /* ensure valid UTF8 and chars < 256 before updating string */
 653     for (send = s + *len; s < send; ) {
 654         U8 c = *s++;
 655
 656         if (!UTF8_IS_INVARIANT(c) &&
 657             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 658              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 659             *len = -1;
 660             return 0;
 661         }
 662     }
 663
 664     d = s = save;
 665     while (s < send) {
 666         STRLEN ulen;
 667         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 668         s += ulen;
 669     }
 670     *d = '\0';
 671     *len = d - save;
 672     return save;
 673 }
 674
 675 /*
 676 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 677
 678 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 679 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 680 the newly-created string, and updates C<len> to contain the new
 681 length.  Returns the original string if no conversion occurs, C<len>
 682 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 683 0 if C<s> is converted or contains all 7bit characters.
 684
 685 =cut */
 686
 687 U8 *
 688 Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
 689 {
 690     U8 *d;
 691     U8 *start = s;
 692     U8 *send;
 693     I32 count = 0;
 694
 695     if (!*is_utf8)
 696         return start;
 697
 698     /* ensure valid UTF8 and chars < 256 before converting string */
 699     for (send = s + *len; s < send;) {
 700         U8 c = *s++;
 701         if (!UTF8_IS_INVARIANT(c)) {
 702             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 703                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 704                 count++;
 705             else
 706                 return start;
 707         }
 708     }
 709
 710     *is_utf8 = 0;
 711
 712     Newz(801, d, (*len) - count + 1, U8);
 713     s = start; start = d;
 714     while (s < send) {
 715         U8 c = *s++;
 716         if (!UTF8_IS_INVARIANT(c)) {
 717             /* Then it is two-byte encoded */
 718             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 719             c = ASCII_TO_NATIVE(c);
 720         }
 721         *d++ = c;
 722     }
 723     *d = '\0';
 724     *len = d - start;
 725     return start;
 726 }
 727
 728 /*
 729 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 730
 731 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 732 Returns a pointer to the newly-created string, and sets C<len> to
 733 reflect the new length.
 734
 735 =cut
 736 */
 737
 738 U8*
 739 Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
 740 {
 741     U8 *send;
 742     U8 *d;
 743     U8 *dst;
 744     send = s + (*len);
 745
 746     Newz(801, d, (*len) * 2 + 1, U8);
 747     dst = d;
 748
 749     while (s < send) {
 750         UV uv = NATIVE_TO_ASCII(*s++);
 751         if (UNI_IS_INVARIANT(uv))
 752             *d++ = UTF_TO_NATIVE(uv);
 753         else {
 754             *d++ = UTF8_EIGHT_BIT_HI(uv);
 755             *d++ = UTF8_EIGHT_BIT_LO(uv);
 756         }
 757     }
 758     *d = '\0';
 759     *len = d-dst;
 760     return dst;
 761 }
 762
 763 /*
 764  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 765  *
 766  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 767  * We optimize for native, for obvious reasons. */
 768
 769 U8*
 770 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 771 {
 772     U8* pend;
 773     U8* dstart = d;
 774
 775     if (bytelen & 1)
 776         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 777
 778     pend = p + bytelen;
 779
 780     while (p < pend) {
 781         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 782         p += 2;
 783         if (uv < 0x80) {
 784             *d++ = uv;
 785             continue;
 786         }
 787         if (uv < 0x800) {
 788             *d++ = (( uv >>  6)         | 0xc0);
 789             *d++ = (( uv        & 0x3f) | 0x80);
 790             continue;
 791         }
 792         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 793             UV low = *p++;
 794             if (low < 0xdc00 || low >= 0xdfff)
 795                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 796             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 797         }
 798         if (uv < 0x10000) {
 799             *d++ = (( uv >> 12)         | 0xe0);
 800             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 801             *d++ = (( uv        & 0x3f) | 0x80);
 802             continue;
 803         }
 804         else {
 805             *d++ = (( uv >> 18)         | 0xf0);
 806             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 807             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 808             *d++ = (( uv        & 0x3f) | 0x80);
 809             continue;
 810         }
 811     }
 812     *newlen = d - dstart;
 813     return d;
 814 }
 815
 816 /* Note: this one is slightly destructive of the source. */
 817
 818 U8*
 819 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 820 {
 821     U8* s = (U8*)p;
 822     U8* send = s + bytelen;
 823     while (s < send) {
 824         U8 tmp = s[0];
 825         s[0] = s[1];
 826         s[1] = tmp;
 827         s += 2;
 828     }
 829     return utf16_to_utf8(p, d, bytelen, newlen);
 830 }
 831
 832 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 833
 834 bool
 835 Perl_is_uni_alnum(pTHX_ U32 c)
 836 {
 837     U8 tmpbuf[UTF8_MAXLEN+1];
 838     uvuni_to_utf8(tmpbuf, (UV)c);
 839     return is_utf8_alnum(tmpbuf);
 840 }
 841
 842 bool
 843 Perl_is_uni_alnumc(pTHX_ U32 c)
 844 {
 845     U8 tmpbuf[UTF8_MAXLEN+1];
 846     uvuni_to_utf8(tmpbuf, (UV)c);
 847     return is_utf8_alnumc(tmpbuf);
 848 }
 849
 850 bool
 851 Perl_is_uni_idfirst(pTHX_ U32 c)
 852 {
 853     U8 tmpbuf[UTF8_MAXLEN+1];
 854     uvuni_to_utf8(tmpbuf, (UV)c);
 855     return is_utf8_idfirst(tmpbuf);
 856 }
 857
 858 bool
 859 Perl_is_uni_alpha(pTHX_ U32 c)
 860 {
 861     U8 tmpbuf[UTF8_MAXLEN+1];
 862     uvuni_to_utf8(tmpbuf, (UV)c);
 863     return is_utf8_alpha(tmpbuf);
 864 }
 865
 866 bool
 867 Perl_is_uni_ascii(pTHX_ U32 c)
 868 {
 869     U8 tmpbuf[UTF8_MAXLEN+1];
 870     uvuni_to_utf8(tmpbuf, (UV)c);
 871     return is_utf8_ascii(tmpbuf);
 872 }
 873
 874 bool
 875 Perl_is_uni_space(pTHX_ U32 c)
 876 {
 877     U8 tmpbuf[UTF8_MAXLEN+1];
 878     uvuni_to_utf8(tmpbuf, (UV)c);
 879     return is_utf8_space(tmpbuf);
 880 }
 881
 882 bool
 883 Perl_is_uni_digit(pTHX_ U32 c)
 884 {
 885     U8 tmpbuf[UTF8_MAXLEN+1];
 886     uvuni_to_utf8(tmpbuf, (UV)c);
 887     return is_utf8_digit(tmpbuf);
 888 }
 889
 890 bool
 891 Perl_is_uni_upper(pTHX_ U32 c)
 892 {
 893     U8 tmpbuf[UTF8_MAXLEN+1];
 894     uvuni_to_utf8(tmpbuf, (UV)c);
 895     return is_utf8_upper(tmpbuf);
 896 }
 897
 898 bool
 899 Perl_is_uni_lower(pTHX_ U32 c)
 900 {
 901     U8 tmpbuf[UTF8_MAXLEN+1];
 902     uvuni_to_utf8(tmpbuf, (UV)c);
 903     return is_utf8_lower(tmpbuf);
 904 }
 905
 906 bool
 907 Perl_is_uni_cntrl(pTHX_ U32 c)
 908 {
 909     U8 tmpbuf[UTF8_MAXLEN+1];
 910     uvuni_to_utf8(tmpbuf, (UV)c);
 911     return is_utf8_cntrl(tmpbuf);
 912 }
 913
 914 bool
 915 Perl_is_uni_graph(pTHX_ U32 c)
 916 {
 917     U8 tmpbuf[UTF8_MAXLEN+1];
 918     uvuni_to_utf8(tmpbuf, (UV)c);
 919     return is_utf8_graph(tmpbuf);
 920 }
 921
 922 bool
 923 Perl_is_uni_print(pTHX_ U32 c)
 924 {
 925     U8 tmpbuf[UTF8_MAXLEN+1];
 926     uvuni_to_utf8(tmpbuf, (UV)c);
 927     return is_utf8_print(tmpbuf);
 928 }
 929
 930 bool
 931 Perl_is_uni_punct(pTHX_ U32 c)
 932 {
 933     U8 tmpbuf[UTF8_MAXLEN+1];
 934     uvuni_to_utf8(tmpbuf, (UV)c);
 935     return is_utf8_punct(tmpbuf);
 936 }
 937
 938 bool
 939 Perl_is_uni_xdigit(pTHX_ U32 c)
 940 {
 941     U8 tmpbuf[UTF8_MAXLEN+1];
 942     uvuni_to_utf8(tmpbuf, (UV)c);
 943     return is_utf8_xdigit(tmpbuf);
 944 }
 945
 946 U32
 947 Perl_to_uni_upper(pTHX_ U32 c)
 948 {
 949     U8 tmpbuf[UTF8_MAXLEN+1];
 950     uvuni_to_utf8(tmpbuf, (UV)c);
 951     return to_utf8_upper(tmpbuf);
 952 }
 953
 954 U32
 955 Perl_to_uni_title(pTHX_ U32 c)
 956 {
 957     U8 tmpbuf[UTF8_MAXLEN+1];
 958     uvuni_to_utf8(tmpbuf, (UV)c);
 959     return to_utf8_title(tmpbuf);
 960 }
 961
 962 U32
 963 Perl_to_uni_lower(pTHX_ U32 c)
 964 {
 965     U8 tmpbuf[UTF8_MAXLEN+1];
 966     uvuni_to_utf8(tmpbuf, (UV)c);
 967     return to_utf8_lower(tmpbuf);
 968 }
 969
 970 /* for now these all assume no locale info available for Unicode > 255 */
 971
 972 bool
 973 Perl_is_uni_alnum_lc(pTHX_ U32 c)
 974 {
 975     return is_uni_alnum(c);     /* XXX no locale support yet */
 976 }
 977
 978 bool
 979 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
 980 {
 981     return is_uni_alnumc(c);    /* XXX no locale support yet */
 982 }
 983
 984 bool
 985 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
 986 {
 987     return is_uni_idfirst(c);   /* XXX no locale support yet */
 988 }
 989
 990 bool
 991 Perl_is_uni_alpha_lc(pTHX_ U32 c)
 992 {
 993     return is_uni_alpha(c);     /* XXX no locale support yet */
 994 }
 995
 996 bool
 997 Perl_is_uni_ascii_lc(pTHX_ U32 c)
 998 {
 999     return is_uni_ascii(c);     /* XXX no locale support yet */
1000 }
1001
1002 bool
1003 Perl_is_uni_space_lc(pTHX_ U32 c)
1004 {
1005     return is_uni_space(c);     /* XXX no locale support yet */
1006 }
1007
1008 bool
1009 Perl_is_uni_digit_lc(pTHX_ U32 c)
1010 {
1011     return is_uni_digit(c);     /* XXX no locale support yet */
1012 }
1013
1014 bool
1015 Perl_is_uni_upper_lc(pTHX_ U32 c)
1016 {
1017     return is_uni_upper(c);     /* XXX no locale support yet */
1018 }
1019
1020 bool
1021 Perl_is_uni_lower_lc(pTHX_ U32 c)
1022 {
1023     return is_uni_lower(c);     /* XXX no locale support yet */
1024 }
1025
1026 bool
1027 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1028 {
1029     return is_uni_cntrl(c);     /* XXX no locale support yet */
1030 }
1031
1032 bool
1033 Perl_is_uni_graph_lc(pTHX_ U32 c)
1034 {
1035     return is_uni_graph(c);     /* XXX no locale support yet */
1036 }
1037
1038 bool
1039 Perl_is_uni_print_lc(pTHX_ U32 c)
1040 {
1041     return is_uni_print(c);     /* XXX no locale support yet */
1042 }
1043
1044 bool
1045 Perl_is_uni_punct_lc(pTHX_ U32 c)
1046 {
1047     return is_uni_punct(c);     /* XXX no locale support yet */
1048 }
1049
1050 bool
1051 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1052 {
1053     return is_uni_xdigit(c);    /* XXX no locale support yet */
1054 }
1055
1056 U32
1057 Perl_to_uni_upper_lc(pTHX_ U32 c)
1058 {
1059     return to_uni_upper(c);     /* XXX no locale support yet */
1060 }
1061
1062 U32
1063 Perl_to_uni_title_lc(pTHX_ U32 c)
1064 {
1065     return to_uni_title(c);     /* XXX no locale support yet */
1066 }
1067
1068 U32
1069 Perl_to_uni_lower_lc(pTHX_ U32 c)
1070 {
1071     return to_uni_lower(c);     /* XXX no locale support yet */
1072 }
1073
1074 bool
1075 Perl_is_utf8_alnum(pTHX_ U8 *p)
1076 {
1077     if (!is_utf8_char(p))
1078         return FALSE;
1079     if (!PL_utf8_alnum)
1080         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1081          * descendant of isalnum(3), in other words, it doesn't
1082          * contain the '_'. --jhi */
1083         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1084     return swash_fetch(PL_utf8_alnum, p);
1085 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1086 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1087     if (!PL_utf8_alnum)
1088         PL_utf8_alnum = swash_init("utf8", "",
1089             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1090     return swash_fetch(PL_utf8_alnum, p);
1091 #endif
1092 }
1093
1094 bool
1095 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1096 {
1097     if (!is_utf8_char(p))
1098         return FALSE;
1099     if (!PL_utf8_alnum)
1100         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1101     return swash_fetch(PL_utf8_alnum, p);
1102 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1103 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1104     if (!PL_utf8_alnum)
1105         PL_utf8_alnum = swash_init("utf8", "",
1106             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1107     return swash_fetch(PL_utf8_alnum, p);
1108 #endif
1109 }
1110
1111 bool
1112 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1113 {
1114     return *p == '_' || is_utf8_alpha(p);
1115 }
1116
1117 bool
1118 Perl_is_utf8_alpha(pTHX_ U8 *p)
1119 {
1120     if (!is_utf8_char(p))
1121         return FALSE;
1122     if (!PL_utf8_alpha)
1123         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1124     return swash_fetch(PL_utf8_alpha, p);
1125 }
1126
1127 bool
1128 Perl_is_utf8_ascii(pTHX_ U8 *p)
1129 {
1130     if (!is_utf8_char(p))
1131         return FALSE;
1132     if (!PL_utf8_ascii)
1133         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1134     return swash_fetch(PL_utf8_ascii, p);
1135 }
1136
1137 bool
1138 Perl_is_utf8_space(pTHX_ U8 *p)
1139 {
1140     if (!is_utf8_char(p))
1141         return FALSE;
1142     if (!PL_utf8_space)
1143         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1144     return swash_fetch(PL_utf8_space, p);
1145 }
1146
1147 bool
1148 Perl_is_utf8_digit(pTHX_ U8 *p)
1149 {
1150     if (!is_utf8_char(p))
1151         return FALSE;
1152     if (!PL_utf8_digit)
1153         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1154     return swash_fetch(PL_utf8_digit, p);
1155 }
1156
1157 bool
1158 Perl_is_utf8_upper(pTHX_ U8 *p)
1159 {
1160     if (!is_utf8_char(p))
1161         return FALSE;
1162     if (!PL_utf8_upper)
1163         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1164     return swash_fetch(PL_utf8_upper, p);
1165 }
1166
1167 bool
1168 Perl_is_utf8_lower(pTHX_ U8 *p)
1169 {
1170     if (!is_utf8_char(p))
1171         return FALSE;
1172     if (!PL_utf8_lower)
1173         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1174     return swash_fetch(PL_utf8_lower, p);
1175 }
1176
1177 bool
1178 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1179 {
1180     if (!is_utf8_char(p))
1181         return FALSE;
1182     if (!PL_utf8_cntrl)
1183         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1184     return swash_fetch(PL_utf8_cntrl, p);
1185 }
1186
1187 bool
1188 Perl_is_utf8_graph(pTHX_ U8 *p)
1189 {
1190     if (!is_utf8_char(p))
1191         return FALSE;
1192     if (!PL_utf8_graph)
1193         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1194     return swash_fetch(PL_utf8_graph, p);
1195 }
1196
1197 bool
1198 Perl_is_utf8_print(pTHX_ U8 *p)
1199 {
1200     if (!is_utf8_char(p))
1201         return FALSE;
1202     if (!PL_utf8_print)
1203         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1204     return swash_fetch(PL_utf8_print, p);
1205 }
1206
1207 bool
1208 Perl_is_utf8_punct(pTHX_ U8 *p)
1209 {
1210     if (!is_utf8_char(p))
1211         return FALSE;
1212     if (!PL_utf8_punct)
1213         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1214     return swash_fetch(PL_utf8_punct, p);
1215 }
1216
1217 bool
1218 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1219 {
1220     if (!is_utf8_char(p))
1221         return FALSE;
1222     if (!PL_utf8_xdigit)
1223         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1224     return swash_fetch(PL_utf8_xdigit, p);
1225 }
1226
1227 bool
1228 Perl_is_utf8_mark(pTHX_ U8 *p)
1229 {
1230     if (!is_utf8_char(p))
1231         return FALSE;
1232     if (!PL_utf8_mark)
1233         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1234     return swash_fetch(PL_utf8_mark, p);
1235 }
1236
1237 UV
1238 Perl_to_utf8_upper(pTHX_ U8 *p)
1239 {
1240     UV uv;
1241
1242     if (!PL_utf8_toupper)
1243         PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1244     uv = swash_fetch(PL_utf8_toupper, p);
1245     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1246 }
1247
1248 UV
1249 Perl_to_utf8_title(pTHX_ U8 *p)
1250 {
1251     UV uv;
1252
1253     if (!PL_utf8_totitle)
1254         PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1255     uv = swash_fetch(PL_utf8_totitle, p);
1256     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1257 }
1258
1259 UV
1260 Perl_to_utf8_lower(pTHX_ U8 *p)
1261 {
1262     UV uv;
1263
1264     if (!PL_utf8_tolower)
1265         PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1266     uv = swash_fetch(PL_utf8_tolower, p);
1267     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1268 }
1269
1270 /* a "swash" is a swatch hash */
1271
1272 SV*
1273 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1274 {
1275     SV* retval;
1276     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1277     dSP;
1278     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1279
1280     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1281         ENTER;
1282         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1283         LEAVE;
1284     }
1285     SPAGAIN;
1286     PUSHSTACKi(PERLSI_MAGIC);
1287     PUSHMARK(SP);
1288     EXTEND(SP,5);
1289     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1290     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1291     PUSHs(listsv);
1292     PUSHs(sv_2mortal(newSViv(minbits)));
1293     PUSHs(sv_2mortal(newSViv(none)));
1294     PUTBACK;
1295     ENTER;
1296     SAVEI32(PL_hints);
1297     PL_hints = 0;
1298     save_re_context();
1299     if (PL_curcop == &PL_compiling)
1300         /* XXX ought to be handled by lex_start */
1301         sv_setpv(tokenbufsv, PL_tokenbuf);
1302     if (call_method("SWASHNEW", G_SCALAR))
1303         retval = newSVsv(*PL_stack_sp--);
1304     else
1305         retval = &PL_sv_undef;
1306     LEAVE;
1307     POPSTACK;
1308     if (PL_curcop == &PL_compiling) {
1309         STRLEN len;
1310         char* pv = SvPV(tokenbufsv, len);
1311
1312         Copy(pv, PL_tokenbuf, len+1, char);
1313         PL_curcop->op_private = PL_hints;
1314     }
1315     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1316         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1317     return retval;
1318 }
1319
1320 UV
1321 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1322 {
1323     HV* hv = (HV*)SvRV(sv);
1324     U32 klen = UTF8SKIP(ptr) - 1;
1325     U32 off = ptr[klen] & 127;  /* NB: 64 bit always 0 when len > 1 */
1326     STRLEN slen;
1327     STRLEN needents = (klen ? 64 : 128);
1328     U8 *tmps;
1329     U32 bit;
1330     SV *retval;
1331
1332     /*
1333      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1334      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1335      * it's nothing to sniff at.)  Pity we usually come through at least
1336      * two function calls to get here...
1337      *
1338      * NB: this code assumes that swatches are never modified, once generated!
1339      */
1340
1341     if (hv == PL_last_swash_hv &&
1342         klen == PL_last_swash_klen &&
1343         (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
1344     {
1345         tmps = PL_last_swash_tmps;
1346         slen = PL_last_swash_slen;
1347     }
1348     else {
1349         /* Try our second-level swatch cache, kept in a hash. */
1350         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1351
1352         /* If not cached, generate it via utf8::SWASHGET */
1353         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1354             dSP;
1355             ENTER;
1356             SAVETMPS;
1357             save_re_context();
1358             PUSHSTACKi(PERLSI_MAGIC);
1359             PUSHMARK(SP);
1360             EXTEND(SP,3);
1361             PUSHs((SV*)sv);
1362             /* We call utf8_to_uni as we want and index into Unicode tables,
1363                not a native character number.
1364              */
1365             PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
1366             PUSHs(sv_2mortal(newSViv(needents)));
1367             PUTBACK;
1368             if (call_method("SWASHGET", G_SCALAR))
1369                 retval = newSVsv(*PL_stack_sp--);
1370             else
1371                 retval = &PL_sv_undef;
1372             POPSTACK;
1373             FREETMPS;
1374             LEAVE;
1375             if (PL_curcop == &PL_compiling)
1376                 PL_curcop->op_private = PL_hints;
1377
1378             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1379
1380             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
1381                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1382         }
1383
1384         PL_last_swash_hv = hv;
1385         PL_last_swash_klen = klen;
1386         PL_last_swash_tmps = tmps;
1387         PL_last_swash_slen = slen;
1388         if (klen)
1389             Copy(ptr, PL_last_swash_key, klen, U8);
1390     }
1391
1392     switch ((int)((slen << 3) / needents)) {
1393     case 1:
1394         bit = 1 << (off & 7);
1395         off >>= 3;
1396         return (tmps[off] & bit) != 0;
1397     case 8:
1398         return tmps[off];
1399     case 16:
1400         off <<= 1;
1401         return (tmps[off] << 8) + tmps[off + 1] ;
1402     case 32:
1403         off <<= 2;
1404         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1405     }
1406     Perl_croak(aTHX_ "panic: swash_fetch");
1407     return 0;
1408 }