utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (UTF8_IS_INVARIANT(uv)) {
  50         *d++ = uv;
  51         return d;
  52     }
  53 #if defined(EBCDIC) || 1 /* always for testing */
  54     else {
  55         STRLEN len  = UNISKIP(uv);
  56         U8 *p = d+len-1;
  57         while (p > d) {
  58             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  59             uv >>= UTF_ACCUMULATION_SHIFT;
  60         }
  61         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  62         return d+len;
  63     }
  64 #else /* Non loop style */
  65     if (uv < 0x800) {
  66         *d++ = (( uv >>  6)         | 0xc0);
  67         *d++ = (( uv        & 0x3f) | 0x80);
  68         return d;
  69     }
  70     if (uv < 0x10000) {
  71         *d++ = (( uv >> 12)         | 0xe0);
  72         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  73         *d++ = (( uv        & 0x3f) | 0x80);
  74         return d;
  75     }
  76     if (uv < 0x200000) {
  77         *d++ = (( uv >> 18)         | 0xf0);
  78         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  79         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  80         *d++ = (( uv        & 0x3f) | 0x80);
  81         return d;
  82     }
  83     if (uv < 0x4000000) {
  84         *d++ = (( uv >> 24)         | 0xf8);
  85         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  86         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  87         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  88         *d++ = (( uv        & 0x3f) | 0x80);
  89         return d;
  90     }
  91     if (uv < 0x80000000) {
  92         *d++ = (( uv >> 30)         | 0xfc);
  93         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  95         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  96         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  97         *d++ = (( uv        & 0x3f) | 0x80);
  98         return d;
  99     }
 100 #ifdef HAS_QUAD
 101     if (uv < UTF8_QUAD_MAX)
 102 #endif
 103     {
 104         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 105         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 106         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 107         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 108         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 109         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 110         *d++ = (( uv        & 0x3f) | 0x80);
 111         return d;
 112     }
 113 #ifdef HAS_QUAD
 114     {
 115         *d++ =                        0xff;     /* Can't match U+FFFE! */
 116         *d++ =                        0x80;     /* 6 Reserved bits */
 117         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 118         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 119         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 120         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 121         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 122         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 123         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 124         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 125         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 126         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 127         *d++ = (( uv        & 0x3f) | 0x80);
 128         return d;
 129     }
 130 #endif
 131 #endif /* Loop style */
 132 }
 133
 134 /*
 135 =for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
 136
 137 Adds the UTF8 representation of the Native codepoint C<uv> to the end
 138 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
 139 bytes available. The return value is the pointer to the byte after the
 140 end of the new character. In other words,
 141
 142     d = uvchr_to_utf8(d, uv);
 143
 144 is the recommended wide native character-aware way of saying
 145
 146     *(d++) = uv;
 147
 148 =cut
 149 */
 150
 151 U8 *
 152 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 153 {
 154     if (uv < 0x100)
 155         uv = NATIVE_TO_ASCII(uv);
 156     return Perl_uvuni_to_utf8(aTHX_ d, uv);
 157 }
 158
 159
 160 /*
 161 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 162
 163 Tests if some arbitrary number of bytes begins in a valid UTF-8
 164 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 165 The actual number of bytes in the UTF-8 character will be returned if
 166 it is valid, otherwise 0.
 167
 168 =cut */
 169 STRLEN
 170 Perl_is_utf8_char(pTHX_ U8 *s)
 171 {
 172     U8 u = *s;
 173     STRLEN slen, len;
 174     UV uv, ouv;
 175
 176     if (UTF8_IS_INVARIANT(u))
 177         return 1;
 178
 179     if (!UTF8_IS_START(u))
 180         return 0;
 181
 182     len = UTF8SKIP(s);
 183
 184     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 185         return 0;
 186
 187     slen = len - 1;
 188     s++;
 189     /* The initial value is dubious */
 190     uv  = u;
 191     ouv = uv;
 192     while (slen--) {
 193         if (!UTF8_IS_CONTINUATION(*s))
 194             return 0;
 195         uv = UTF8_ACCUMULATE(uv, *s);
 196         if (uv < ouv)
 197             return 0;
 198         ouv = uv;
 199         s++;
 200     }
 201
 202     if (UNISKIP(uv) < len)
 203         return 0;
 204
 205     return len;
 206 }
 207
 208 /*
 209 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 210
 211 Returns true if first C<len> bytes of the given string form a valid UTF8
 212 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 213 'a string that contains UTF8' because a valid ASCII string is a valid
 214 UTF8 string.
 215
 216 =cut
 217 */
 218
 219 bool
 220 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 221 {
 222     U8* x = s;
 223     U8* send;
 224     STRLEN c;
 225
 226     if (!len)
 227         len = strlen((char *)s);
 228     send = s + len;
 229
 230     while (x < send) {
 231         c = is_utf8_char(x);
 232         if (!c)
 233             return FALSE;
 234         x += c;
 235     }
 236     if (x != send)
 237         return FALSE;
 238
 239     return TRUE;
 240 }
 241
 242 /*
 243 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 244
 245 Bottom level UTF-8 decode routine.
 246 Returns the unicode code point value of the first character in the string C<s>
 247 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 248 C<retlen> will be set to the length, in bytes, of that character.
 249
 250 If C<s> does not point to a well-formed UTF8 character, the behaviour
 251 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 252 it is assumed that the caller will raise a warning, and this function
 253 will silently just set C<retlen> to C<-1> and return zero.  If the
 254 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 255 malformations will be given, C<retlen> will be set to the expected
 256 length of the UTF-8 character in bytes, and zero will be returned.
 257
 258 The C<flags> can also contain various flags to allow deviations from
 259 the strict UTF-8 encoding (see F<utf8.h>).
 260
 261 Most code should use utf8_to_uvchr() rather than call this directly.
 262
 263 =cut */
 264
 265 UV
 266 Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 267 {
 268     UV uv = *s, ouv;
 269     STRLEN len = 1;
 270     bool dowarn = ckWARN_d(WARN_UTF8);
 271     STRLEN expectlen = 0;
 272     U32 warning = 0;
 273
 274 /* This list is a superset of the UTF8_ALLOW_XXX. */
 275
 276 #define UTF8_WARN_EMPTY                          1
 277 #define UTF8_WARN_CONTINUATION                   2
 278 #define UTF8_WARN_NON_CONTINUATION               3
 279 #define UTF8_WARN_FE_FF                          4
 280 #define UTF8_WARN_SHORT                          5
 281 #define UTF8_WARN_OVERFLOW                       6
 282 #define UTF8_WARN_SURROGATE                      7
 283 #define UTF8_WARN_BOM                            8
 284 #define UTF8_WARN_LONG                           9
 285 #define UTF8_WARN_FFFF                          10
 286
 287     if (curlen == 0 &&
 288         !(flags & UTF8_ALLOW_EMPTY)) {
 289         warning = UTF8_WARN_EMPTY;
 290         goto malformed;
 291     }
 292
 293     if (UTF8_IS_INVARIANT(uv)) {
 294         if (retlen)
 295             *retlen = 1;
 296         return (UV) (*s);
 297     }
 298
 299     if (UTF8_IS_CONTINUATION(uv) &&
 300         !(flags & UTF8_ALLOW_CONTINUATION)) {
 301         warning = UTF8_WARN_CONTINUATION;
 302         goto malformed;
 303     }
 304
 305     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 306         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 307         warning = UTF8_WARN_NON_CONTINUATION;
 308         goto malformed;
 309     }
 310
 311 #ifdef EBCDIC
 312     uv = NATIVE_TO_UTF(uv);
 313 #else
 314     if ((uv == 0xfe || uv == 0xff) &&
 315         !(flags & UTF8_ALLOW_FE_FF)) {
 316         warning = UTF8_WARN_FE_FF;
 317         goto malformed;
 318     }
 319 #endif
 320
 321     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 322     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 323     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 324     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 325 #ifdef EBCDIC
 326     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 327     else                        { len =  7; uv &= 0x01; }
 328 #else
 329     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 330     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 331     else                        { len = 13; uv = 0; } /* whoa! */
 332 #endif
 333
 334     if (retlen)
 335         *retlen = len;
 336
 337     expectlen = len;
 338
 339     if ((curlen < expectlen) &&
 340         !(flags & UTF8_ALLOW_SHORT)) {
 341         warning = UTF8_WARN_SHORT;
 342         goto malformed;
 343     }
 344
 345     len--;
 346     s++;
 347     ouv = uv;
 348
 349     while (len--) {
 350         if (!UTF8_IS_CONTINUATION(*s) &&
 351             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 352             s--;
 353             warning = UTF8_WARN_NON_CONTINUATION;
 354             goto malformed;
 355         }
 356         else
 357             uv = UTF8_ACCUMULATE(uv, *s);
 358         if (!(uv > ouv)) {
 359             /* These cannot be allowed. */
 360             if (uv == ouv) {
 361                 if (!(flags & UTF8_ALLOW_LONG)) {
 362                     warning = UTF8_WARN_LONG;
 363                     goto malformed;
 364                 }
 365             }
 366             else { /* uv < ouv */
 367                 /* This cannot be allowed. */
 368                 warning = UTF8_WARN_OVERFLOW;
 369                 goto malformed;
 370             }
 371         }
 372         s++;
 373         ouv = uv;
 374     }
 375
 376     if (UNICODE_IS_SURROGATE(uv) &&
 377         !(flags & UTF8_ALLOW_SURROGATE)) {
 378         warning = UTF8_WARN_SURROGATE;
 379         goto malformed;
 380     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 381                !(flags & UTF8_ALLOW_BOM)) {
 382         warning = UTF8_WARN_BOM;
 383         goto malformed;
 384     } else if ((expectlen > UNISKIP(uv)) &&
 385                !(flags & UTF8_ALLOW_LONG)) {
 386         warning = UTF8_WARN_LONG;
 387         goto malformed;
 388     } else if (UNICODE_IS_ILLEGAL(uv) &&
 389                !(flags & UTF8_ALLOW_FFFF)) {
 390         warning = UTF8_WARN_FFFF;
 391         goto malformed;
 392     }
 393
 394     return uv;
 395
 396 malformed:
 397
 398     if (flags & UTF8_CHECK_ONLY) {
 399         if (retlen)
 400             *retlen = -1;
 401         return 0;
 402     }
 403
 404     if (dowarn) {
 405         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 406
 407         switch (warning) {
 408         case 0: /* Intentionally empty. */ break;
 409         case UTF8_WARN_EMPTY:
 410             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 411             break;
 412         case UTF8_WARN_CONTINUATION:
 413             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 414             break;
 415         case UTF8_WARN_NON_CONTINUATION:
 416             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 417                            (UV)s[1], uv);
 418             break;
 419         case UTF8_WARN_FE_FF:
 420             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 421             break;
 422         case UTF8_WARN_SHORT:
 423             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 424                            curlen, curlen == 1 ? "" : "s", expectlen);
 425             break;
 426         case UTF8_WARN_OVERFLOW:
 427             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 428                            ouv, *s);
 429             break;
 430         case UTF8_WARN_SURROGATE:
 431             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 432             break;
 433         case UTF8_WARN_BOM:
 434             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 435             break;
 436         case UTF8_WARN_LONG:
 437             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 438                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 439             break;
 440         case UTF8_WARN_FFFF:
 441             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 442             break;
 443         default:
 444             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 445             break;
 446         }
 447
 448         if (warning) {
 449             char *s = SvPVX(sv);
 450
 451             if (PL_op)
 452                 Perl_warner(aTHX_ WARN_UTF8,
 453                             "%s in %s", s,  PL_op_desc[PL_op->op_type]);
 454             else
 455                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 456         }
 457     }
 458
 459     if (retlen)
 460         *retlen = expectlen ? expectlen : len;
 461
 462     return 0;
 463 }
 464
 465 /*
 466 =for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
 467
 468 Returns the native character value of the first character in the string C<s>
 469 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 470 length, in bytes, of that character.
 471
 472 Allows length and flags to be passed to low level routine.
 473
 474 =cut
 475 */
 476
 477 UV
 478 Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 479 {
 480     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
 481     if (uv < 0x100)
 482         return (UV) ASCII_TO_NATIVE(uv);
 483     return uv;
 484 }
 485
 486 /*
 487 =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
 488
 489 Returns the native character value of the first character in the string C<s>
 490 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 491 length, in bytes, of that character.
 492
 493 If C<s> does not point to a well-formed UTF8 character, zero is
 494 returned and retlen is set, if possible, to -1.
 495
 496 =cut
 497 */
 498
 499 UV
 500 Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
 501 {
 502     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 503 }
 504
 505 /*
 506 =for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
 507
 508 Returns the Unicode code point of the first character in the string C<s>
 509 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 510 length, in bytes, of that character.
 511
 512 This function should only be used when returned UV is considered
 513 an index into the Unicode semantic tables (e.g. swashes).
 514
 515 If C<s> does not point to a well-formed UTF8 character, zero is
 516 returned and retlen is set, if possible, to -1.
 517
 518 =cut
 519 */
 520
 521 UV
 522 Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
 523 {
 524     /* Call the low level routine asking for checks */
 525     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 526 }
 527
 528 /*
 529 =for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
 530
 531 Return the length of the UTF-8 char encoded string C<s> in characters.
 532 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 533 up past C<e>, croaks.
 534
 535 =cut
 536 */
 537
 538 STRLEN
 539 Perl_utf8_length(pTHX_ U8* s, U8* e)
 540 {
 541     STRLEN len = 0;
 542
 543     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 544      * the bitops (especially ~) can create illegal UTF-8.
 545      * In other words: in Perl UTF-8 is not just for Unicode. */
 546
 547     if (e < s)
 548         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 549     while (s < e) {
 550         U8 t = UTF8SKIP(s);
 551
 552         if (e - s < t)
 553             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 554         s += t;
 555         len++;
 556     }
 557
 558     return len;
 559 }
 560
 561 /*
 562 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 563
 564 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 565 and C<b>.
 566
 567 WARNING: use only if you *know* that the pointers point inside the
 568 same UTF-8 buffer.
 569
 570 =cut */
 571
 572 IV
 573 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 574 {
 575     IV off = 0;
 576
 577     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 578      * the bitops (especially ~) can create illegal UTF-8.
 579      * In other words: in Perl UTF-8 is not just for Unicode. */
 580
 581     if (a < b) {
 582         while (a < b) {
 583             U8 c = UTF8SKIP(a);
 584
 585             if (b - a < c)
 586                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 587             a += c;
 588             off--;
 589         }
 590     }
 591     else {
 592         while (b < a) {
 593             U8 c = UTF8SKIP(b);
 594
 595             if (a - b < c)
 596                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 597             b += c;
 598             off++;
 599         }
 600     }
 601
 602     return off;
 603 }
 604
 605 /*
 606 =for apidoc A|U8*|utf8_hop|U8 *s|I32 off
 607
 608 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 609 forward or backward.
 610
 611 WARNING: do not use the following unless you *know* C<off> is within
 612 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 613 on the first byte of character or just after the last byte of a character.
 614
 615 =cut */
 616
 617 U8 *
 618 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 619 {
 620     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 621      * the bitops (especially ~) can create illegal UTF-8.
 622      * In other words: in Perl UTF-8 is not just for Unicode. */
 623
 624     if (off >= 0) {
 625         while (off--)
 626             s += UTF8SKIP(s);
 627     }
 628     else {
 629         while (off++) {
 630             s--;
 631             while (UTF8_IS_CONTINUATION(*s))
 632                 s--;
 633         }
 634     }
 635     return s;
 636 }
 637
 638 /*
 639 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 640
 641 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 642 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 643 updates len to contain the new length.
 644 Returns zero on failure, setting C<len> to -1.
 645
 646 =cut
 647 */
 648
 649 U8 *
 650 Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 651 {
 652     U8 *send;
 653     U8 *d;
 654     U8 *save = s;
 655
 656     /* ensure valid UTF8 and chars < 256 before updating string */
 657     for (send = s + *len; s < send; ) {
 658         U8 c = *s++;
 659
 660         if (!UTF8_IS_INVARIANT(c) &&
 661             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 662              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 663             *len = -1;
 664             return 0;
 665         }
 666     }
 667
 668     d = s = save;
 669     while (s < send) {
 670         STRLEN ulen;
 671         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 672         s += ulen;
 673     }
 674     *d = '\0';
 675     *len = d - save;
 676     return save;
 677 }
 678
 679 /*
 680 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 681
 682 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 683 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 684 the newly-created string, and updates C<len> to contain the new
 685 length.  Returns the original string if no conversion occurs, C<len>
 686 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 687 0 if C<s> is converted or contains all 7bit characters.
 688
 689 =cut */
 690
 691 U8 *
 692 Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
 693 {
 694     U8 *d;
 695     U8 *start = s;
 696     U8 *send;
 697     I32 count = 0;
 698
 699     if (!*is_utf8)
 700         return start;
 701
 702     /* ensure valid UTF8 and chars < 256 before converting string */
 703     for (send = s + *len; s < send;) {
 704         U8 c = *s++;
 705         if (!UTF8_IS_INVARIANT(c)) {
 706             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 707                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 708                 count++;
 709             else
 710                 return start;
 711         }
 712     }
 713
 714     *is_utf8 = 0;
 715
 716 #ifndef EBCDIC
 717     /* Can use as-is if no high chars */
 718     if (!count)
 719         return start;
 720 #endif
 721
 722     Newz(801, d, (*len) - count + 1, U8);
 723     s = start; start = d;
 724     while (s < send) {
 725         U8 c = *s++;
 726         if (!UTF8_IS_INVARIANT(c))
 727             c = UTF8_ACCUMULATE(c, *s++);
 728         *d++ = ASCII_TO_NATIVE(c);
 729     }
 730     *d = '\0';
 731     *len = d - start;
 732     return start;
 733 }
 734
 735 /*
 736 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 737
 738 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 739 Returns a pointer to the newly-created string, and sets C<len> to
 740 reflect the new length.
 741
 742 =cut
 743 */
 744
 745 U8*
 746 Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
 747 {
 748     U8 *send;
 749     U8 *d;
 750     U8 *dst;
 751     send = s + (*len);
 752
 753     Newz(801, d, (*len) * 2 + 1, U8);
 754     dst = d;
 755
 756     while (s < send) {
 757         UV uv = NATIVE_TO_ASCII(*s++);
 758         if (UTF8_IS_INVARIANT(uv))
 759             *d++ = uv;
 760         else {
 761             *d++ = UTF8_EIGHT_BIT_HI(uv);
 762             *d++ = UTF8_EIGHT_BIT_LO(uv);
 763         }
 764     }
 765     *d = '\0';
 766     *len = d-dst;
 767     return dst;
 768 }
 769
 770 /*
 771  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 772  *
 773  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 774  * We optimize for native, for obvious reasons. */
 775
 776 U8*
 777 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 778 {
 779     U8* pend;
 780     U8* dstart = d;
 781
 782     if (bytelen & 1)
 783         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 784
 785     pend = p + bytelen;
 786
 787     while (p < pend) {
 788         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 789         p += 2;
 790         if (uv < 0x80) {
 791             *d++ = uv;
 792             continue;
 793         }
 794         if (uv < 0x800) {
 795             *d++ = (( uv >>  6)         | 0xc0);
 796             *d++ = (( uv        & 0x3f) | 0x80);
 797             continue;
 798         }
 799         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 800             UV low = *p++;
 801             if (low < 0xdc00 || low >= 0xdfff)
 802                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 803             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 804         }
 805         if (uv < 0x10000) {
 806             *d++ = (( uv >> 12)         | 0xe0);
 807             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 808             *d++ = (( uv        & 0x3f) | 0x80);
 809             continue;
 810         }
 811         else {
 812             *d++ = (( uv >> 18)         | 0xf0);
 813             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 814             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 815             *d++ = (( uv        & 0x3f) | 0x80);
 816             continue;
 817         }
 818     }
 819     *newlen = d - dstart;
 820     return d;
 821 }
 822
 823 /* Note: this one is slightly destructive of the source. */
 824
 825 U8*
 826 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 827 {
 828     U8* s = (U8*)p;
 829     U8* send = s + bytelen;
 830     while (s < send) {
 831         U8 tmp = s[0];
 832         s[0] = s[1];
 833         s[1] = tmp;
 834         s += 2;
 835     }
 836     return utf16_to_utf8(p, d, bytelen, newlen);
 837 }
 838
 839 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 840
 841 bool
 842 Perl_is_uni_alnum(pTHX_ U32 c)
 843 {
 844     U8 tmpbuf[UTF8_MAXLEN+1];
 845     uvuni_to_utf8(tmpbuf, (UV)c);
 846     return is_utf8_alnum(tmpbuf);
 847 }
 848
 849 bool
 850 Perl_is_uni_alnumc(pTHX_ U32 c)
 851 {
 852     U8 tmpbuf[UTF8_MAXLEN+1];
 853     uvuni_to_utf8(tmpbuf, (UV)c);
 854     return is_utf8_alnumc(tmpbuf);
 855 }
 856
 857 bool
 858 Perl_is_uni_idfirst(pTHX_ U32 c)
 859 {
 860     U8 tmpbuf[UTF8_MAXLEN+1];
 861     uvuni_to_utf8(tmpbuf, (UV)c);
 862     return is_utf8_idfirst(tmpbuf);
 863 }
 864
 865 bool
 866 Perl_is_uni_alpha(pTHX_ U32 c)
 867 {
 868     U8 tmpbuf[UTF8_MAXLEN+1];
 869     uvuni_to_utf8(tmpbuf, (UV)c);
 870     return is_utf8_alpha(tmpbuf);
 871 }
 872
 873 bool
 874 Perl_is_uni_ascii(pTHX_ U32 c)
 875 {
 876     U8 tmpbuf[UTF8_MAXLEN+1];
 877     uvuni_to_utf8(tmpbuf, (UV)c);
 878     return is_utf8_ascii(tmpbuf);
 879 }
 880
 881 bool
 882 Perl_is_uni_space(pTHX_ U32 c)
 883 {
 884     U8 tmpbuf[UTF8_MAXLEN+1];
 885     uvuni_to_utf8(tmpbuf, (UV)c);
 886     return is_utf8_space(tmpbuf);
 887 }
 888
 889 bool
 890 Perl_is_uni_digit(pTHX_ U32 c)
 891 {
 892     U8 tmpbuf[UTF8_MAXLEN+1];
 893     uvuni_to_utf8(tmpbuf, (UV)c);
 894     return is_utf8_digit(tmpbuf);
 895 }
 896
 897 bool
 898 Perl_is_uni_upper(pTHX_ U32 c)
 899 {
 900     U8 tmpbuf[UTF8_MAXLEN+1];
 901     uvuni_to_utf8(tmpbuf, (UV)c);
 902     return is_utf8_upper(tmpbuf);
 903 }
 904
 905 bool
 906 Perl_is_uni_lower(pTHX_ U32 c)
 907 {
 908     U8 tmpbuf[UTF8_MAXLEN+1];
 909     uvuni_to_utf8(tmpbuf, (UV)c);
 910     return is_utf8_lower(tmpbuf);
 911 }
 912
 913 bool
 914 Perl_is_uni_cntrl(pTHX_ U32 c)
 915 {
 916     U8 tmpbuf[UTF8_MAXLEN+1];
 917     uvuni_to_utf8(tmpbuf, (UV)c);
 918     return is_utf8_cntrl(tmpbuf);
 919 }
 920
 921 bool
 922 Perl_is_uni_graph(pTHX_ U32 c)
 923 {
 924     U8 tmpbuf[UTF8_MAXLEN+1];
 925     uvuni_to_utf8(tmpbuf, (UV)c);
 926     return is_utf8_graph(tmpbuf);
 927 }
 928
 929 bool
 930 Perl_is_uni_print(pTHX_ U32 c)
 931 {
 932     U8 tmpbuf[UTF8_MAXLEN+1];
 933     uvuni_to_utf8(tmpbuf, (UV)c);
 934     return is_utf8_print(tmpbuf);
 935 }
 936
 937 bool
 938 Perl_is_uni_punct(pTHX_ U32 c)
 939 {
 940     U8 tmpbuf[UTF8_MAXLEN+1];
 941     uvuni_to_utf8(tmpbuf, (UV)c);
 942     return is_utf8_punct(tmpbuf);
 943 }
 944
 945 bool
 946 Perl_is_uni_xdigit(pTHX_ U32 c)
 947 {
 948     U8 tmpbuf[UTF8_MAXLEN+1];
 949     uvuni_to_utf8(tmpbuf, (UV)c);
 950     return is_utf8_xdigit(tmpbuf);
 951 }
 952
 953 U32
 954 Perl_to_uni_upper(pTHX_ U32 c)
 955 {
 956     U8 tmpbuf[UTF8_MAXLEN+1];
 957     uvuni_to_utf8(tmpbuf, (UV)c);
 958     return to_utf8_upper(tmpbuf);
 959 }
 960
 961 U32
 962 Perl_to_uni_title(pTHX_ U32 c)
 963 {
 964     U8 tmpbuf[UTF8_MAXLEN+1];
 965     uvuni_to_utf8(tmpbuf, (UV)c);
 966     return to_utf8_title(tmpbuf);
 967 }
 968
 969 U32
 970 Perl_to_uni_lower(pTHX_ U32 c)
 971 {
 972     U8 tmpbuf[UTF8_MAXLEN+1];
 973     uvuni_to_utf8(tmpbuf, (UV)c);
 974     return to_utf8_lower(tmpbuf);
 975 }
 976
 977 /* for now these all assume no locale info available for Unicode > 255 */
 978
 979 bool
 980 Perl_is_uni_alnum_lc(pTHX_ U32 c)
 981 {
 982     return is_uni_alnum(c);     /* XXX no locale support yet */
 983 }
 984
 985 bool
 986 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
 987 {
 988     return is_uni_alnumc(c);    /* XXX no locale support yet */
 989 }
 990
 991 bool
 992 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
 993 {
 994     return is_uni_idfirst(c);   /* XXX no locale support yet */
 995 }
 996
 997 bool
 998 Perl_is_uni_alpha_lc(pTHX_ U32 c)
 999 {
1000     return is_uni_alpha(c);     /* XXX no locale support yet */
1001 }
1002
1003 bool
1004 Perl_is_uni_ascii_lc(pTHX_ U32 c)
1005 {
1006     return is_uni_ascii(c);     /* XXX no locale support yet */
1007 }
1008
1009 bool
1010 Perl_is_uni_space_lc(pTHX_ U32 c)
1011 {
1012     return is_uni_space(c);     /* XXX no locale support yet */
1013 }
1014
1015 bool
1016 Perl_is_uni_digit_lc(pTHX_ U32 c)
1017 {
1018     return is_uni_digit(c);     /* XXX no locale support yet */
1019 }
1020
1021 bool
1022 Perl_is_uni_upper_lc(pTHX_ U32 c)
1023 {
1024     return is_uni_upper(c);     /* XXX no locale support yet */
1025 }
1026
1027 bool
1028 Perl_is_uni_lower_lc(pTHX_ U32 c)
1029 {
1030     return is_uni_lower(c);     /* XXX no locale support yet */
1031 }
1032
1033 bool
1034 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1035 {
1036     return is_uni_cntrl(c);     /* XXX no locale support yet */
1037 }
1038
1039 bool
1040 Perl_is_uni_graph_lc(pTHX_ U32 c)
1041 {
1042     return is_uni_graph(c);     /* XXX no locale support yet */
1043 }
1044
1045 bool
1046 Perl_is_uni_print_lc(pTHX_ U32 c)
1047 {
1048     return is_uni_print(c);     /* XXX no locale support yet */
1049 }
1050
1051 bool
1052 Perl_is_uni_punct_lc(pTHX_ U32 c)
1053 {
1054     return is_uni_punct(c);     /* XXX no locale support yet */
1055 }
1056
1057 bool
1058 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1059 {
1060     return is_uni_xdigit(c);    /* XXX no locale support yet */
1061 }
1062
1063 U32
1064 Perl_to_uni_upper_lc(pTHX_ U32 c)
1065 {
1066     return to_uni_upper(c);     /* XXX no locale support yet */
1067 }
1068
1069 U32
1070 Perl_to_uni_title_lc(pTHX_ U32 c)
1071 {
1072     return to_uni_title(c);     /* XXX no locale support yet */
1073 }
1074
1075 U32
1076 Perl_to_uni_lower_lc(pTHX_ U32 c)
1077 {
1078     return to_uni_lower(c);     /* XXX no locale support yet */
1079 }
1080
1081 bool
1082 Perl_is_utf8_alnum(pTHX_ U8 *p)
1083 {
1084     if (!is_utf8_char(p))
1085         return FALSE;
1086     if (!PL_utf8_alnum)
1087         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1088          * descendant of isalnum(3), in other words, it doesn't
1089          * contain the '_'. --jhi */
1090         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1091     return swash_fetch(PL_utf8_alnum, p);
1092 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1093 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1094     if (!PL_utf8_alnum)
1095         PL_utf8_alnum = swash_init("utf8", "",
1096             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1097     return swash_fetch(PL_utf8_alnum, p);
1098 #endif
1099 }
1100
1101 bool
1102 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1103 {
1104     if (!is_utf8_char(p))
1105         return FALSE;
1106     if (!PL_utf8_alnum)
1107         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1108     return swash_fetch(PL_utf8_alnum, p);
1109 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1110 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1111     if (!PL_utf8_alnum)
1112         PL_utf8_alnum = swash_init("utf8", "",
1113             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1114     return swash_fetch(PL_utf8_alnum, p);
1115 #endif
1116 }
1117
1118 bool
1119 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1120 {
1121     return *p == '_' || is_utf8_alpha(p);
1122 }
1123
1124 bool
1125 Perl_is_utf8_alpha(pTHX_ U8 *p)
1126 {
1127     if (!is_utf8_char(p))
1128         return FALSE;
1129     if (!PL_utf8_alpha)
1130         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1131     return swash_fetch(PL_utf8_alpha, p);
1132 }
1133
1134 bool
1135 Perl_is_utf8_ascii(pTHX_ U8 *p)
1136 {
1137     if (!is_utf8_char(p))
1138         return FALSE;
1139     if (!PL_utf8_ascii)
1140         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1141     return swash_fetch(PL_utf8_ascii, p);
1142 }
1143
1144 bool
1145 Perl_is_utf8_space(pTHX_ U8 *p)
1146 {
1147     if (!is_utf8_char(p))
1148         return FALSE;
1149     if (!PL_utf8_space)
1150         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1151     return swash_fetch(PL_utf8_space, p);
1152 }
1153
1154 bool
1155 Perl_is_utf8_digit(pTHX_ U8 *p)
1156 {
1157     if (!is_utf8_char(p))
1158         return FALSE;
1159     if (!PL_utf8_digit)
1160         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1161     return swash_fetch(PL_utf8_digit, p);
1162 }
1163
1164 bool
1165 Perl_is_utf8_upper(pTHX_ U8 *p)
1166 {
1167     if (!is_utf8_char(p))
1168         return FALSE;
1169     if (!PL_utf8_upper)
1170         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1171     return swash_fetch(PL_utf8_upper, p);
1172 }
1173
1174 bool
1175 Perl_is_utf8_lower(pTHX_ U8 *p)
1176 {
1177     if (!is_utf8_char(p))
1178         return FALSE;
1179     if (!PL_utf8_lower)
1180         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1181     return swash_fetch(PL_utf8_lower, p);
1182 }
1183
1184 bool
1185 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1186 {
1187     if (!is_utf8_char(p))
1188         return FALSE;
1189     if (!PL_utf8_cntrl)
1190         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1191     return swash_fetch(PL_utf8_cntrl, p);
1192 }
1193
1194 bool
1195 Perl_is_utf8_graph(pTHX_ U8 *p)
1196 {
1197     if (!is_utf8_char(p))
1198         return FALSE;
1199     if (!PL_utf8_graph)
1200         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1201     return swash_fetch(PL_utf8_graph, p);
1202 }
1203
1204 bool
1205 Perl_is_utf8_print(pTHX_ U8 *p)
1206 {
1207     if (!is_utf8_char(p))
1208         return FALSE;
1209     if (!PL_utf8_print)
1210         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1211     return swash_fetch(PL_utf8_print, p);
1212 }
1213
1214 bool
1215 Perl_is_utf8_punct(pTHX_ U8 *p)
1216 {
1217     if (!is_utf8_char(p))
1218         return FALSE;
1219     if (!PL_utf8_punct)
1220         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1221     return swash_fetch(PL_utf8_punct, p);
1222 }
1223
1224 bool
1225 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1226 {
1227     if (!is_utf8_char(p))
1228         return FALSE;
1229     if (!PL_utf8_xdigit)
1230         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1231     return swash_fetch(PL_utf8_xdigit, p);
1232 }
1233
1234 bool
1235 Perl_is_utf8_mark(pTHX_ U8 *p)
1236 {
1237     if (!is_utf8_char(p))
1238         return FALSE;
1239     if (!PL_utf8_mark)
1240         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1241     return swash_fetch(PL_utf8_mark, p);
1242 }
1243
1244 UV
1245 Perl_to_utf8_upper(pTHX_ U8 *p)
1246 {
1247     UV uv;
1248
1249     if (!PL_utf8_toupper)
1250         PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1251     uv = swash_fetch(PL_utf8_toupper, p);
1252     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1253 }
1254
1255 UV
1256 Perl_to_utf8_title(pTHX_ U8 *p)
1257 {
1258     UV uv;
1259
1260     if (!PL_utf8_totitle)
1261         PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1262     uv = swash_fetch(PL_utf8_totitle, p);
1263     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1264 }
1265
1266 UV
1267 Perl_to_utf8_lower(pTHX_ U8 *p)
1268 {
1269     UV uv;
1270
1271     if (!PL_utf8_tolower)
1272         PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1273     uv = swash_fetch(PL_utf8_tolower, p);
1274     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1275 }
1276
1277 /* a "swash" is a swatch hash */
1278
1279 SV*
1280 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1281 {
1282     SV* retval;
1283     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1284     dSP;
1285     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1286
1287     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1288         ENTER;
1289         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1290         LEAVE;
1291     }
1292     SPAGAIN;
1293     PUSHSTACKi(PERLSI_MAGIC);
1294     PUSHMARK(SP);
1295     EXTEND(SP,5);
1296     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1297     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1298     PUSHs(listsv);
1299     PUSHs(sv_2mortal(newSViv(minbits)));
1300     PUSHs(sv_2mortal(newSViv(none)));
1301     PUTBACK;
1302     ENTER;
1303     SAVEI32(PL_hints);
1304     PL_hints = 0;
1305     save_re_context();
1306     if (PL_curcop == &PL_compiling)
1307         /* XXX ought to be handled by lex_start */
1308         sv_setpv(tokenbufsv, PL_tokenbuf);
1309     if (call_method("SWASHNEW", G_SCALAR))
1310         retval = newSVsv(*PL_stack_sp--);
1311     else
1312         retval = &PL_sv_undef;
1313     LEAVE;
1314     POPSTACK;
1315     if (PL_curcop == &PL_compiling) {
1316         STRLEN len;
1317         char* pv = SvPV(tokenbufsv, len);
1318
1319         Copy(pv, PL_tokenbuf, len+1, char);
1320         PL_curcop->op_private = PL_hints;
1321     }
1322     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1323         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1324     return retval;
1325 }
1326
1327 UV
1328 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1329 {
1330     HV* hv = (HV*)SvRV(sv);
1331     U32 klen = UTF8SKIP(ptr) - 1;
1332     U32 off = ptr[klen] & 127;  /* NB: 64 bit always 0 when len > 1 */
1333     STRLEN slen;
1334     STRLEN needents = (klen ? 64 : 128);
1335     U8 *tmps;
1336     U32 bit;
1337     SV *retval;
1338
1339     /*
1340      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1341      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1342      * it's nothing to sniff at.)  Pity we usually come through at least
1343      * two function calls to get here...
1344      *
1345      * NB: this code assumes that swatches are never modified, once generated!
1346      */
1347
1348     if (hv == PL_last_swash_hv &&
1349         klen == PL_last_swash_klen &&
1350         (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
1351     {
1352         tmps = PL_last_swash_tmps;
1353         slen = PL_last_swash_slen;
1354     }
1355     else {
1356         /* Try our second-level swatch cache, kept in a hash. */
1357         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1358
1359         /* If not cached, generate it via utf8::SWASHGET */
1360         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1361             dSP;
1362             ENTER;
1363             SAVETMPS;
1364             save_re_context();
1365             PUSHSTACKi(PERLSI_MAGIC);
1366             PUSHMARK(SP);
1367             EXTEND(SP,3);
1368             PUSHs((SV*)sv);
1369             /* We call utf8_to_uni as we want and index into Unicode tables,
1370                not a native character number.
1371              */
1372             PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
1373             PUSHs(sv_2mortal(newSViv(needents)));
1374             PUTBACK;
1375             if (call_method("SWASHGET", G_SCALAR))
1376                 retval = newSVsv(*PL_stack_sp--);
1377             else
1378                 retval = &PL_sv_undef;
1379             POPSTACK;
1380             FREETMPS;
1381             LEAVE;
1382             if (PL_curcop == &PL_compiling)
1383                 PL_curcop->op_private = PL_hints;
1384
1385             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1386
1387             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
1388                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1389         }
1390
1391         PL_last_swash_hv = hv;
1392         PL_last_swash_klen = klen;
1393         PL_last_swash_tmps = tmps;
1394         PL_last_swash_slen = slen;
1395         if (klen)
1396             Copy(ptr, PL_last_swash_key, klen, U8);
1397     }
1398
1399     switch ((int)((slen << 3) / needents)) {
1400     case 1:
1401         bit = 1 << (off & 7);
1402         off >>= 3;
1403         return (tmps[off] & bit) != 0;
1404     case 8:
1405         return tmps[off];
1406     case 16:
1407         off <<= 1;
1408         return (tmps[off] << 8) + tmps[off + 1] ;
1409     case 32:
1410         off <<= 2;
1411         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1412     }
1413     Perl_croak(aTHX_ "panic: swash_fetch");
1414     return 0;
1415 }