utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (UNI_IS_INVARIANT(uv)) {
  50         *d++ = UTF_TO_NATIVE(uv);
  51         return d;
  52     }
  53 #if defined(EBCDIC) || 1 /* always for testing */
  54     else {
  55         STRLEN len  = UNISKIP(uv);
  56         U8 *p = d+len-1;
  57         while (p > d) {
  58             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  59             uv >>= UTF_ACCUMULATION_SHIFT;
  60         }
  61         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  62         return d+len;
  63     }
  64 #else /* Non loop style */
  65     if (uv < 0x800) {
  66         *d++ = (( uv >>  6)         | 0xc0);
  67         *d++ = (( uv        & 0x3f) | 0x80);
  68         return d;
  69     }
  70     if (uv < 0x10000) {
  71         *d++ = (( uv >> 12)         | 0xe0);
  72         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  73         *d++ = (( uv        & 0x3f) | 0x80);
  74         return d;
  75     }
  76     if (uv < 0x200000) {
  77         *d++ = (( uv >> 18)         | 0xf0);
  78         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  79         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  80         *d++ = (( uv        & 0x3f) | 0x80);
  81         return d;
  82     }
  83     if (uv < 0x4000000) {
  84         *d++ = (( uv >> 24)         | 0xf8);
  85         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  86         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  87         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  88         *d++ = (( uv        & 0x3f) | 0x80);
  89         return d;
  90     }
  91     if (uv < 0x80000000) {
  92         *d++ = (( uv >> 30)         | 0xfc);
  93         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  95         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  96         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  97         *d++ = (( uv        & 0x3f) | 0x80);
  98         return d;
  99     }
 100 #ifdef HAS_QUAD
 101     if (uv < UTF8_QUAD_MAX)
 102 #endif
 103     {
 104         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 105         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 106         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 107         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 108         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 109         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 110         *d++ = (( uv        & 0x3f) | 0x80);
 111         return d;
 112     }
 113 #ifdef HAS_QUAD
 114     {
 115         *d++ =                        0xff;     /* Can't match U+FFFE! */
 116         *d++ =                        0x80;     /* 6 Reserved bits */
 117         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 118         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 119         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 120         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 121         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 122         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 123         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 124         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 125         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 126         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 127         *d++ = (( uv        & 0x3f) | 0x80);
 128         return d;
 129     }
 130 #endif
 131 #endif /* Loop style */
 132 }
 133
 134
 135
 136 /*
 137 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 138
 139 Tests if some arbitrary number of bytes begins in a valid UTF-8
 140 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 141 The actual number of bytes in the UTF-8 character will be returned if
 142 it is valid, otherwise 0.
 143
 144 =cut
 145 */
 146 STRLEN
 147 Perl_is_utf8_char(pTHX_ U8 *s)
 148 {
 149     U8 u = *s;
 150     STRLEN slen, len;
 151     UV uv, ouv;
 152
 153     if (UTF8_IS_INVARIANT(u))
 154         return 1;
 155
 156     if (!UTF8_IS_START(u))
 157         return 0;
 158
 159     len = UTF8SKIP(s);
 160
 161     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 162         return 0;
 163
 164     slen = len - 1;
 165     s++;
 166     /* The initial value is dubious */
 167     uv  = u;
 168     ouv = uv;
 169     while (slen--) {
 170         if (!UTF8_IS_CONTINUATION(*s))
 171             return 0;
 172         uv = UTF8_ACCUMULATE(uv, *s);
 173 #if 0
 174         /* Depending on the compiler the wrap of the value takig pladve
 175          * between 5 and 6 bytes of UTF-8 encoding either works or not.
 176          * See similar spot in utf8_to_uvuni(). --jhi */
 177         if (uv < ouv)
 178             return 0;
 179 #endif
 180         ouv = uv;
 181         s++;
 182     }
 183
 184     if (UNISKIP(uv) < len)
 185         return 0;
 186
 187     return len;
 188 }
 189
 190 /*
 191 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 192
 193 Returns true if first C<len> bytes of the given string form a valid UTF8
 194 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 195 'a string that contains UTF8' because a valid ASCII string is a valid
 196 UTF8 string.
 197
 198 =cut
 199 */
 200
 201 bool
 202 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 203 {
 204     U8* x = s;
 205     U8* send;
 206     STRLEN c;
 207
 208     if (!len)
 209         len = strlen((char *)s);
 210     send = s + len;
 211
 212     while (x < send) {
 213         c = is_utf8_char(x);
 214         if (!c)
 215             return FALSE;
 216         x += c;
 217     }
 218     if (x != send)
 219         return FALSE;
 220
 221     return TRUE;
 222 }
 223
 224 /*
 225 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 226
 227 Bottom level UTF-8 decode routine.
 228 Returns the unicode code point value of the first character in the string C<s>
 229 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 230 C<retlen> will be set to the length, in bytes, of that character.
 231
 232 If C<s> does not point to a well-formed UTF8 character, the behaviour
 233 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 234 it is assumed that the caller will raise a warning, and this function
 235 will silently just set C<retlen> to C<-1> and return zero.  If the
 236 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 237 malformations will be given, C<retlen> will be set to the expected
 238 length of the UTF-8 character in bytes, and zero will be returned.
 239
 240 The C<flags> can also contain various flags to allow deviations from
 241 the strict UTF-8 encoding (see F<utf8.h>).
 242
 243 Most code should use utf8_to_uvchr() rather than call this directly.
 244
 245 =cut
 246 */
 247
 248 UV
 249 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 250 {
 251     UV uv = *s, ouv;
 252     STRLEN len = 1;
 253     bool dowarn = ckWARN_d(WARN_UTF8);
 254     STRLEN expectlen = 0;
 255     U32 warning = 0;
 256
 257 /* This list is a superset of the UTF8_ALLOW_XXX. */
 258
 259 #define UTF8_WARN_EMPTY                          1
 260 #define UTF8_WARN_CONTINUATION                   2
 261 #define UTF8_WARN_NON_CONTINUATION               3
 262 #define UTF8_WARN_FE_FF                          4
 263 #define UTF8_WARN_SHORT                          5
 264 #define UTF8_WARN_OVERFLOW                       6
 265 #define UTF8_WARN_SURROGATE                      7
 266 #define UTF8_WARN_BOM                            8
 267 #define UTF8_WARN_LONG                           9
 268 #define UTF8_WARN_FFFF                          10
 269
 270     if (curlen == 0 &&
 271         !(flags & UTF8_ALLOW_EMPTY)) {
 272         warning = UTF8_WARN_EMPTY;
 273         goto malformed;
 274     }
 275
 276     if (UTF8_IS_INVARIANT(uv)) {
 277         if (retlen)
 278             *retlen = 1;
 279         return (UV) (NATIVE_TO_UTF(*s));
 280     }
 281
 282     if (UTF8_IS_CONTINUATION(uv) &&
 283         !(flags & UTF8_ALLOW_CONTINUATION)) {
 284         warning = UTF8_WARN_CONTINUATION;
 285         goto malformed;
 286     }
 287
 288     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 289         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 290         warning = UTF8_WARN_NON_CONTINUATION;
 291         goto malformed;
 292     }
 293
 294 #ifdef EBCDIC
 295     uv = NATIVE_TO_UTF(uv);
 296 #else
 297     if ((uv == 0xfe || uv == 0xff) &&
 298         !(flags & UTF8_ALLOW_FE_FF)) {
 299         warning = UTF8_WARN_FE_FF;
 300         goto malformed;
 301     }
 302 #endif
 303
 304     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 305     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 306     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 307     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 308 #ifdef EBCDIC
 309     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 310     else                        { len =  7; uv &= 0x01; }
 311 #else
 312     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 313     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 314     else                        { len = 13; uv = 0; } /* whoa! */
 315 #endif
 316
 317     if (retlen)
 318         *retlen = len;
 319
 320     expectlen = len;
 321
 322     if ((curlen < expectlen) &&
 323         !(flags & UTF8_ALLOW_SHORT)) {
 324         warning = UTF8_WARN_SHORT;
 325         goto malformed;
 326     }
 327
 328     len--;
 329     s++;
 330     ouv = uv;
 331
 332     while (len--) {
 333         if (!UTF8_IS_CONTINUATION(*s) &&
 334             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 335             s--;
 336             warning = UTF8_WARN_NON_CONTINUATION;
 337             goto malformed;
 338         }
 339         else
 340             uv = UTF8_ACCUMULATE(uv, *s);
 341         if (!(uv > ouv)) {
 342             /* These cannot be allowed. */
 343             if (uv == ouv) {
 344                 if (!(flags & UTF8_ALLOW_LONG)) {
 345                     warning = UTF8_WARN_LONG;
 346                     goto malformed;
 347                 }
 348             }
 349             else { /* uv < ouv */
 350 #if 0
 351         /* Depending on the compiler the wrap of the value takig pladve
 352          * between 5 and 6 bytes of UTF-8 encoding either works or not.
 353          * See similar spot in is_utf8_char(). --jhi */
 354                 /* This cannot be allowed. */
 355                 warning = UTF8_WARN_OVERFLOW;
 356                 goto malformed;
 357 #endif
 358             }
 359         }
 360         s++;
 361         ouv = uv;
 362     }
 363
 364     if (UNICODE_IS_SURROGATE(uv) &&
 365         !(flags & UTF8_ALLOW_SURROGATE)) {
 366         warning = UTF8_WARN_SURROGATE;
 367         goto malformed;
 368     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 369                !(flags & UTF8_ALLOW_BOM)) {
 370         warning = UTF8_WARN_BOM;
 371         goto malformed;
 372     } else if ((expectlen > UNISKIP(uv)) &&
 373                !(flags & UTF8_ALLOW_LONG)) {
 374         warning = UTF8_WARN_LONG;
 375         goto malformed;
 376     } else if (UNICODE_IS_ILLEGAL(uv) &&
 377                !(flags & UTF8_ALLOW_FFFF)) {
 378         warning = UTF8_WARN_FFFF;
 379         goto malformed;
 380     }
 381
 382     return uv;
 383
 384 malformed:
 385
 386     if (flags & UTF8_CHECK_ONLY) {
 387         if (retlen)
 388             *retlen = -1;
 389         return 0;
 390     }
 391
 392     if (dowarn) {
 393         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 394
 395         switch (warning) {
 396         case 0: /* Intentionally empty. */ break;
 397         case UTF8_WARN_EMPTY:
 398             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 399             break;
 400         case UTF8_WARN_CONTINUATION:
 401             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 402             break;
 403         case UTF8_WARN_NON_CONTINUATION:
 404             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 405                            (UV)s[1], uv);
 406             break;
 407         case UTF8_WARN_FE_FF:
 408             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 409             break;
 410         case UTF8_WARN_SHORT:
 411             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 412                            curlen, curlen == 1 ? "" : "s", expectlen);
 413             break;
 414         case UTF8_WARN_OVERFLOW:
 415             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 416                            ouv, *s);
 417             break;
 418         case UTF8_WARN_SURROGATE:
 419             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 420             break;
 421         case UTF8_WARN_BOM:
 422             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 423             break;
 424         case UTF8_WARN_LONG:
 425             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 426                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 427             break;
 428         case UTF8_WARN_FFFF:
 429             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 430             break;
 431         default:
 432             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 433             break;
 434         }
 435
 436         if (warning) {
 437             char *s = SvPVX(sv);
 438
 439             if (PL_op)
 440                 Perl_warner(aTHX_ WARN_UTF8,
 441                             "%s in %s", s,  PL_op_desc[PL_op->op_type]);
 442             else
 443                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 444         }
 445     }
 446
 447     if (retlen)
 448         *retlen = expectlen ? expectlen : len;
 449
 450     return 0;
 451 }
 452
 453 /*
 454 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 455
 456 Returns the native character value of the first character in the string C<s>
 457 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 458 length, in bytes, of that character.
 459
 460 If C<s> does not point to a well-formed UTF8 character, zero is
 461 returned and retlen is set, if possible, to -1.
 462
 463 =cut
 464 */
 465
 466 UV
 467 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 468 {
 469     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 470 }
 471
 472 /*
 473 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 474
 475 Returns the Unicode code point of the first character in the string C<s>
 476 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 477 length, in bytes, of that character.
 478
 479 This function should only be used when returned UV is considered
 480 an index into the Unicode semantic tables (e.g. swashes).
 481
 482 If C<s> does not point to a well-formed UTF8 character, zero is
 483 returned and retlen is set, if possible, to -1.
 484
 485 =cut
 486 */
 487
 488 UV
 489 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 490 {
 491     /* Call the low level routine asking for checks */
 492     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 493 }
 494
 495 /*
 496 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 497
 498 Return the length of the UTF-8 char encoded string C<s> in characters.
 499 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 500 up past C<e>, croaks.
 501
 502 =cut
 503 */
 504
 505 STRLEN
 506 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 507 {
 508     STRLEN len = 0;
 509
 510     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 511      * the bitops (especially ~) can create illegal UTF-8.
 512      * In other words: in Perl UTF-8 is not just for Unicode. */
 513
 514     if (e < s)
 515         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 516     while (s < e) {
 517         U8 t = UTF8SKIP(s);
 518
 519         if (e - s < t)
 520             Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
 521         s += t;
 522         len++;
 523     }
 524
 525     return len;
 526 }
 527
 528 /*
 529 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 530
 531 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 532 and C<b>.
 533
 534 WARNING: use only if you *know* that the pointers point inside the
 535 same UTF-8 buffer.
 536
 537 =cut
 538 */
 539
 540 IV
 541 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 542 {
 543     IV off = 0;
 544
 545     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 546      * the bitops (especially ~) can create illegal UTF-8.
 547      * In other words: in Perl UTF-8 is not just for Unicode. */
 548
 549     if (a < b) {
 550         while (a < b) {
 551             U8 c = UTF8SKIP(a);
 552
 553             if (b - a < c)
 554                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 555             a += c;
 556             off--;
 557         }
 558     }
 559     else {
 560         while (b < a) {
 561             U8 c = UTF8SKIP(b);
 562
 563             if (a - b < c)
 564                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 565             b += c;
 566             off++;
 567         }
 568     }
 569
 570     return off;
 571 }
 572
 573 /*
 574 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 575
 576 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 577 forward or backward.
 578
 579 WARNING: do not use the following unless you *know* C<off> is within
 580 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 581 on the first byte of character or just after the last byte of a character.
 582
 583 =cut
 584 */
 585
 586 U8 *
 587 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 588 {
 589     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 590      * the bitops (especially ~) can create illegal UTF-8.
 591      * In other words: in Perl UTF-8 is not just for Unicode. */
 592
 593     if (off >= 0) {
 594         while (off--)
 595             s += UTF8SKIP(s);
 596     }
 597     else {
 598         while (off++) {
 599             s--;
 600             while (UTF8_IS_CONTINUATION(*s))
 601                 s--;
 602         }
 603     }
 604     return s;
 605 }
 606
 607 /*
 608 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 609
 610 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 611 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 612 updates len to contain the new length.
 613 Returns zero on failure, setting C<len> to -1.
 614
 615 =cut
 616 */
 617
 618 U8 *
 619 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 620 {
 621     U8 *send;
 622     U8 *d;
 623     U8 *save = s;
 624
 625     /* ensure valid UTF8 and chars < 256 before updating string */
 626     for (send = s + *len; s < send; ) {
 627         U8 c = *s++;
 628
 629         if (!UTF8_IS_INVARIANT(c) &&
 630             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 631              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 632             *len = -1;
 633             return 0;
 634         }
 635     }
 636
 637     d = s = save;
 638     while (s < send) {
 639         STRLEN ulen;
 640         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 641         s += ulen;
 642     }
 643     *d = '\0';
 644     *len = d - save;
 645     return save;
 646 }
 647
 648 /*
 649 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 650
 651 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 652 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 653 the newly-created string, and updates C<len> to contain the new
 654 length.  Returns the original string if no conversion occurs, C<len>
 655 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 656 0 if C<s> is converted or contains all 7bit characters.
 657
 658 =cut
 659 */
 660
 661 U8 *
 662 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 663 {
 664     U8 *d;
 665     U8 *start = s;
 666     U8 *send;
 667     I32 count = 0;
 668
 669     if (!*is_utf8)
 670         return start;
 671
 672     /* ensure valid UTF8 and chars < 256 before converting string */
 673     for (send = s + *len; s < send;) {
 674         U8 c = *s++;
 675         if (!UTF8_IS_INVARIANT(c)) {
 676             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 677                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 678                 count++;
 679             else
 680                 return start;
 681         }
 682     }
 683
 684     *is_utf8 = 0;
 685
 686     Newz(801, d, (*len) - count + 1, U8);
 687     s = start; start = d;
 688     while (s < send) {
 689         U8 c = *s++;
 690         if (!UTF8_IS_INVARIANT(c)) {
 691             /* Then it is two-byte encoded */
 692             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 693             c = ASCII_TO_NATIVE(c);
 694         }
 695         *d++ = c;
 696     }
 697     *d = '\0';
 698     *len = d - start;
 699     return start;
 700 }
 701
 702 /*
 703 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 704
 705 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 706 Returns a pointer to the newly-created string, and sets C<len> to
 707 reflect the new length.
 708
 709 =cut
 710 */
 711
 712 U8*
 713 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 714 {
 715     U8 *send;
 716     U8 *d;
 717     U8 *dst;
 718     send = s + (*len);
 719
 720     Newz(801, d, (*len) * 2 + 1, U8);
 721     dst = d;
 722
 723     while (s < send) {
 724         UV uv = NATIVE_TO_ASCII(*s++);
 725         if (UNI_IS_INVARIANT(uv))
 726             *d++ = UTF_TO_NATIVE(uv);
 727         else {
 728             *d++ = UTF8_EIGHT_BIT_HI(uv);
 729             *d++ = UTF8_EIGHT_BIT_LO(uv);
 730         }
 731     }
 732     *d = '\0';
 733     *len = d-dst;
 734     return dst;
 735 }
 736
 737 /*
 738  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 739  *
 740  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 741  * We optimize for native, for obvious reasons. */
 742
 743 U8*
 744 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 745 {
 746     U8* pend;
 747     U8* dstart = d;
 748
 749     if (bytelen & 1)
 750         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 751
 752     pend = p + bytelen;
 753
 754     while (p < pend) {
 755         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 756         p += 2;
 757         if (uv < 0x80) {
 758             *d++ = uv;
 759             continue;
 760         }
 761         if (uv < 0x800) {
 762             *d++ = (( uv >>  6)         | 0xc0);
 763             *d++ = (( uv        & 0x3f) | 0x80);
 764             continue;
 765         }
 766         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 767             UV low = *p++;
 768             if (low < 0xdc00 || low >= 0xdfff)
 769                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 770             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 771         }
 772         if (uv < 0x10000) {
 773             *d++ = (( uv >> 12)         | 0xe0);
 774             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 775             *d++ = (( uv        & 0x3f) | 0x80);
 776             continue;
 777         }
 778         else {
 779             *d++ = (( uv >> 18)         | 0xf0);
 780             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 781             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 782             *d++ = (( uv        & 0x3f) | 0x80);
 783             continue;
 784         }
 785     }
 786     *newlen = d - dstart;
 787     return d;
 788 }
 789
 790 /* Note: this one is slightly destructive of the source. */
 791
 792 U8*
 793 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 794 {
 795     U8* s = (U8*)p;
 796     U8* send = s + bytelen;
 797     while (s < send) {
 798         U8 tmp = s[0];
 799         s[0] = s[1];
 800         s[1] = tmp;
 801         s += 2;
 802     }
 803     return utf16_to_utf8(p, d, bytelen, newlen);
 804 }
 805
 806 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 807
 808 bool
 809 Perl_is_uni_alnum(pTHX_ U32 c)
 810 {
 811     U8 tmpbuf[UTF8_MAXLEN+1];
 812     uvchr_to_utf8(tmpbuf, (UV)c);
 813     return is_utf8_alnum(tmpbuf);
 814 }
 815
 816 bool
 817 Perl_is_uni_alnumc(pTHX_ U32 c)
 818 {
 819     U8 tmpbuf[UTF8_MAXLEN+1];
 820     uvchr_to_utf8(tmpbuf, (UV)c);
 821     return is_utf8_alnumc(tmpbuf);
 822 }
 823
 824 bool
 825 Perl_is_uni_idfirst(pTHX_ U32 c)
 826 {
 827     U8 tmpbuf[UTF8_MAXLEN+1];
 828     uvchr_to_utf8(tmpbuf, (UV)c);
 829     return is_utf8_idfirst(tmpbuf);
 830 }
 831
 832 bool
 833 Perl_is_uni_alpha(pTHX_ U32 c)
 834 {
 835     U8 tmpbuf[UTF8_MAXLEN+1];
 836     uvchr_to_utf8(tmpbuf, (UV)c);
 837     return is_utf8_alpha(tmpbuf);
 838 }
 839
 840 bool
 841 Perl_is_uni_ascii(pTHX_ U32 c)
 842 {
 843     U8 tmpbuf[UTF8_MAXLEN+1];
 844     uvchr_to_utf8(tmpbuf, (UV)c);
 845     return is_utf8_ascii(tmpbuf);
 846 }
 847
 848 bool
 849 Perl_is_uni_space(pTHX_ U32 c)
 850 {
 851     U8 tmpbuf[UTF8_MAXLEN+1];
 852     uvchr_to_utf8(tmpbuf, (UV)c);
 853     return is_utf8_space(tmpbuf);
 854 }
 855
 856 bool
 857 Perl_is_uni_digit(pTHX_ U32 c)
 858 {
 859     U8 tmpbuf[UTF8_MAXLEN+1];
 860     uvchr_to_utf8(tmpbuf, (UV)c);
 861     return is_utf8_digit(tmpbuf);
 862 }
 863
 864 bool
 865 Perl_is_uni_upper(pTHX_ U32 c)
 866 {
 867     U8 tmpbuf[UTF8_MAXLEN+1];
 868     uvchr_to_utf8(tmpbuf, (UV)c);
 869     return is_utf8_upper(tmpbuf);
 870 }
 871
 872 bool
 873 Perl_is_uni_lower(pTHX_ U32 c)
 874 {
 875     U8 tmpbuf[UTF8_MAXLEN+1];
 876     uvchr_to_utf8(tmpbuf, (UV)c);
 877     return is_utf8_lower(tmpbuf);
 878 }
 879
 880 bool
 881 Perl_is_uni_cntrl(pTHX_ U32 c)
 882 {
 883     U8 tmpbuf[UTF8_MAXLEN+1];
 884     uvchr_to_utf8(tmpbuf, (UV)c);
 885     return is_utf8_cntrl(tmpbuf);
 886 }
 887
 888 bool
 889 Perl_is_uni_graph(pTHX_ U32 c)
 890 {
 891     U8 tmpbuf[UTF8_MAXLEN+1];
 892     uvchr_to_utf8(tmpbuf, (UV)c);
 893     return is_utf8_graph(tmpbuf);
 894 }
 895
 896 bool
 897 Perl_is_uni_print(pTHX_ U32 c)
 898 {
 899     U8 tmpbuf[UTF8_MAXLEN+1];
 900     uvchr_to_utf8(tmpbuf, (UV)c);
 901     return is_utf8_print(tmpbuf);
 902 }
 903
 904 bool
 905 Perl_is_uni_punct(pTHX_ U32 c)
 906 {
 907     U8 tmpbuf[UTF8_MAXLEN+1];
 908     uvchr_to_utf8(tmpbuf, (UV)c);
 909     return is_utf8_punct(tmpbuf);
 910 }
 911
 912 bool
 913 Perl_is_uni_xdigit(pTHX_ U32 c)
 914 {
 915     U8 tmpbuf[UTF8_MAXLEN+1];
 916     uvchr_to_utf8(tmpbuf, (UV)c);
 917     return is_utf8_xdigit(tmpbuf);
 918 }
 919
 920 U32
 921 Perl_to_uni_upper(pTHX_ U32 c)
 922 {
 923     U8 tmpbuf[UTF8_MAXLEN+1];
 924     uvchr_to_utf8(tmpbuf, (UV)c);
 925     return to_utf8_upper(tmpbuf);
 926 }
 927
 928 U32
 929 Perl_to_uni_title(pTHX_ U32 c)
 930 {
 931     U8 tmpbuf[UTF8_MAXLEN+1];
 932     uvchr_to_utf8(tmpbuf, (UV)c);
 933     return to_utf8_title(tmpbuf);
 934 }
 935
 936 U32
 937 Perl_to_uni_lower(pTHX_ U32 c)
 938 {
 939     U8 tmpbuf[UTF8_MAXLEN+1];
 940     uvchr_to_utf8(tmpbuf, (UV)c);
 941     return to_utf8_lower(tmpbuf);
 942 }
 943
 944 /* for now these all assume no locale info available for Unicode > 255 */
 945
 946 bool
 947 Perl_is_uni_alnum_lc(pTHX_ U32 c)
 948 {
 949     return is_uni_alnum(c);     /* XXX no locale support yet */
 950 }
 951
 952 bool
 953 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
 954 {
 955     return is_uni_alnumc(c);    /* XXX no locale support yet */
 956 }
 957
 958 bool
 959 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
 960 {
 961     return is_uni_idfirst(c);   /* XXX no locale support yet */
 962 }
 963
 964 bool
 965 Perl_is_uni_alpha_lc(pTHX_ U32 c)
 966 {
 967     return is_uni_alpha(c);     /* XXX no locale support yet */
 968 }
 969
 970 bool
 971 Perl_is_uni_ascii_lc(pTHX_ U32 c)
 972 {
 973     return is_uni_ascii(c);     /* XXX no locale support yet */
 974 }
 975
 976 bool
 977 Perl_is_uni_space_lc(pTHX_ U32 c)
 978 {
 979     return is_uni_space(c);     /* XXX no locale support yet */
 980 }
 981
 982 bool
 983 Perl_is_uni_digit_lc(pTHX_ U32 c)
 984 {
 985     return is_uni_digit(c);     /* XXX no locale support yet */
 986 }
 987
 988 bool
 989 Perl_is_uni_upper_lc(pTHX_ U32 c)
 990 {
 991     return is_uni_upper(c);     /* XXX no locale support yet */
 992 }
 993
 994 bool
 995 Perl_is_uni_lower_lc(pTHX_ U32 c)
 996 {
 997     return is_uni_lower(c);     /* XXX no locale support yet */
 998 }
 999
1000 bool
1001 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1002 {
1003     return is_uni_cntrl(c);     /* XXX no locale support yet */
1004 }
1005
1006 bool
1007 Perl_is_uni_graph_lc(pTHX_ U32 c)
1008 {
1009     return is_uni_graph(c);     /* XXX no locale support yet */
1010 }
1011
1012 bool
1013 Perl_is_uni_print_lc(pTHX_ U32 c)
1014 {
1015     return is_uni_print(c);     /* XXX no locale support yet */
1016 }
1017
1018 bool
1019 Perl_is_uni_punct_lc(pTHX_ U32 c)
1020 {
1021     return is_uni_punct(c);     /* XXX no locale support yet */
1022 }
1023
1024 bool
1025 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1026 {
1027     return is_uni_xdigit(c);    /* XXX no locale support yet */
1028 }
1029
1030 U32
1031 Perl_to_uni_upper_lc(pTHX_ U32 c)
1032 {
1033     return to_uni_upper(c);     /* XXX no locale support yet */
1034 }
1035
1036 U32
1037 Perl_to_uni_title_lc(pTHX_ U32 c)
1038 {
1039     return to_uni_title(c);     /* XXX no locale support yet */
1040 }
1041
1042 U32
1043 Perl_to_uni_lower_lc(pTHX_ U32 c)
1044 {
1045     return to_uni_lower(c);     /* XXX no locale support yet */
1046 }
1047
1048 bool
1049 Perl_is_utf8_alnum(pTHX_ U8 *p)
1050 {
1051     if (!is_utf8_char(p))
1052         return FALSE;
1053     if (!PL_utf8_alnum)
1054         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1055          * descendant of isalnum(3), in other words, it doesn't
1056          * contain the '_'. --jhi */
1057         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1058     return swash_fetch(PL_utf8_alnum, p);
1059 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1060 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1061     if (!PL_utf8_alnum)
1062         PL_utf8_alnum = swash_init("utf8", "",
1063             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1064     return swash_fetch(PL_utf8_alnum, p);
1065 #endif
1066 }
1067
1068 bool
1069 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1070 {
1071     if (!is_utf8_char(p))
1072         return FALSE;
1073     if (!PL_utf8_alnum)
1074         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1075     return swash_fetch(PL_utf8_alnum, p);
1076 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1077 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1078     if (!PL_utf8_alnum)
1079         PL_utf8_alnum = swash_init("utf8", "",
1080             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1081     return swash_fetch(PL_utf8_alnum, p);
1082 #endif
1083 }
1084
1085 bool
1086 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1087 {
1088     return *p == '_' || is_utf8_alpha(p);
1089 }
1090
1091 bool
1092 Perl_is_utf8_alpha(pTHX_ U8 *p)
1093 {
1094     if (!is_utf8_char(p))
1095         return FALSE;
1096     if (!PL_utf8_alpha)
1097         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1098     return swash_fetch(PL_utf8_alpha, p);
1099 }
1100
1101 bool
1102 Perl_is_utf8_ascii(pTHX_ U8 *p)
1103 {
1104     if (!is_utf8_char(p))
1105         return FALSE;
1106     if (!PL_utf8_ascii)
1107         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1108     return swash_fetch(PL_utf8_ascii, p);
1109 }
1110
1111 bool
1112 Perl_is_utf8_space(pTHX_ U8 *p)
1113 {
1114     if (!is_utf8_char(p))
1115         return FALSE;
1116     if (!PL_utf8_space)
1117         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1118     return swash_fetch(PL_utf8_space, p);
1119 }
1120
1121 bool
1122 Perl_is_utf8_digit(pTHX_ U8 *p)
1123 {
1124     if (!is_utf8_char(p))
1125         return FALSE;
1126     if (!PL_utf8_digit)
1127         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1128     return swash_fetch(PL_utf8_digit, p);
1129 }
1130
1131 bool
1132 Perl_is_utf8_upper(pTHX_ U8 *p)
1133 {
1134     if (!is_utf8_char(p))
1135         return FALSE;
1136     if (!PL_utf8_upper)
1137         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1138     return swash_fetch(PL_utf8_upper, p);
1139 }
1140
1141 bool
1142 Perl_is_utf8_lower(pTHX_ U8 *p)
1143 {
1144     if (!is_utf8_char(p))
1145         return FALSE;
1146     if (!PL_utf8_lower)
1147         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1148     return swash_fetch(PL_utf8_lower, p);
1149 }
1150
1151 bool
1152 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1153 {
1154     if (!is_utf8_char(p))
1155         return FALSE;
1156     if (!PL_utf8_cntrl)
1157         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1158     return swash_fetch(PL_utf8_cntrl, p);
1159 }
1160
1161 bool
1162 Perl_is_utf8_graph(pTHX_ U8 *p)
1163 {
1164     if (!is_utf8_char(p))
1165         return FALSE;
1166     if (!PL_utf8_graph)
1167         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1168     return swash_fetch(PL_utf8_graph, p);
1169 }
1170
1171 bool
1172 Perl_is_utf8_print(pTHX_ U8 *p)
1173 {
1174     if (!is_utf8_char(p))
1175         return FALSE;
1176     if (!PL_utf8_print)
1177         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1178     return swash_fetch(PL_utf8_print, p);
1179 }
1180
1181 bool
1182 Perl_is_utf8_punct(pTHX_ U8 *p)
1183 {
1184     if (!is_utf8_char(p))
1185         return FALSE;
1186     if (!PL_utf8_punct)
1187         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1188     return swash_fetch(PL_utf8_punct, p);
1189 }
1190
1191 bool
1192 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1193 {
1194     if (!is_utf8_char(p))
1195         return FALSE;
1196     if (!PL_utf8_xdigit)
1197         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1198     return swash_fetch(PL_utf8_xdigit, p);
1199 }
1200
1201 bool
1202 Perl_is_utf8_mark(pTHX_ U8 *p)
1203 {
1204     if (!is_utf8_char(p))
1205         return FALSE;
1206     if (!PL_utf8_mark)
1207         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1208     return swash_fetch(PL_utf8_mark, p);
1209 }
1210
1211 UV
1212 Perl_to_utf8_upper(pTHX_ U8 *p)
1213 {
1214     UV uv;
1215
1216     if (!PL_utf8_toupper)
1217         PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1218     uv = swash_fetch(PL_utf8_toupper, p);
1219     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1220 }
1221
1222 UV
1223 Perl_to_utf8_title(pTHX_ U8 *p)
1224 {
1225     UV uv;
1226
1227     if (!PL_utf8_totitle)
1228         PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1229     uv = swash_fetch(PL_utf8_totitle, p);
1230     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1231 }
1232
1233 UV
1234 Perl_to_utf8_lower(pTHX_ U8 *p)
1235 {
1236     UV uv;
1237
1238     if (!PL_utf8_tolower)
1239         PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1240     uv = swash_fetch(PL_utf8_tolower, p);
1241     return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1242 }
1243
1244 /* a "swash" is a swatch hash */
1245
1246 SV*
1247 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1248 {
1249     SV* retval;
1250     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1251     dSP;
1252     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1253
1254     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1255         ENTER;
1256         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1257         LEAVE;
1258     }
1259     SPAGAIN;
1260     PUSHSTACKi(PERLSI_MAGIC);
1261     PUSHMARK(SP);
1262     EXTEND(SP,5);
1263     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1264     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1265     PUSHs(listsv);
1266     PUSHs(sv_2mortal(newSViv(minbits)));
1267     PUSHs(sv_2mortal(newSViv(none)));
1268     PUTBACK;
1269     ENTER;
1270     SAVEI32(PL_hints);
1271     PL_hints = 0;
1272     save_re_context();
1273     if (PL_curcop == &PL_compiling)
1274         /* XXX ought to be handled by lex_start */
1275         sv_setpv(tokenbufsv, PL_tokenbuf);
1276     if (call_method("SWASHNEW", G_SCALAR))
1277         retval = newSVsv(*PL_stack_sp--);
1278     else
1279         retval = &PL_sv_undef;
1280     LEAVE;
1281     POPSTACK;
1282     if (PL_curcop == &PL_compiling) {
1283         STRLEN len;
1284         char* pv = SvPV(tokenbufsv, len);
1285
1286         Copy(pv, PL_tokenbuf, len+1, char);
1287         PL_curcop->op_private = PL_hints;
1288     }
1289     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1290         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1291     return retval;
1292 }
1293
1294 UV
1295 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1296 {
1297     HV* hv = (HV*)SvRV(sv);
1298     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1299        then the "swatch" is a vec() for al the chars which start
1300        with 0xAA..0xYY
1301        So the key in the hash is length of encoded char -1
1302      */
1303     U32 klen = UTF8SKIP(ptr) - 1;
1304     U32 off = ptr[klen];
1305     STRLEN slen;
1306     STRLEN needents;
1307     U8 *tmps;
1308     U32 bit;
1309     SV *retval;
1310
1311     if (klen == 0)
1312      {
1313       /* If char in invariant then swatch is for all the invariant chars
1314        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1315        */
1316       needents = UTF_CONTINUATION_MARK;
1317       off      = NATIVE_TO_UTF(ptr[klen]);
1318      }
1319     else
1320      {
1321       /* If char is encoded then swatch is for the prefix */
1322       needents = (1 << UTF_ACCUMULATION_SHIFT);
1323       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1324      }
1325
1326     /*
1327      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1328      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1329      * it's nothing to sniff at.)  Pity we usually come through at least
1330      * two function calls to get here...
1331      *
1332      * NB: this code assumes that swatches are never modified, once generated!
1333      */
1334
1335     if (hv == PL_last_swash_hv &&
1336         klen == PL_last_swash_klen &&
1337         (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
1338     {
1339         tmps = PL_last_swash_tmps;
1340         slen = PL_last_swash_slen;
1341     }
1342     else {
1343         /* Try our second-level swatch cache, kept in a hash. */
1344         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1345
1346         /* If not cached, generate it via utf8::SWASHGET */
1347         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1348             dSP;
1349             /* We use utf8n_to_uvuni() as we want an index into
1350                Unicode tables, not a native character number.
1351              */
1352             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1353             ENTER;
1354             SAVETMPS;
1355             save_re_context();
1356             PUSHSTACKi(PERLSI_MAGIC);
1357             PUSHMARK(SP);
1358             EXTEND(SP,3);
1359             PUSHs((SV*)sv);
1360             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1361             PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
1362             PUSHs(sv_2mortal(newSViv(needents)));
1363             PUTBACK;
1364             if (call_method("SWASHGET", G_SCALAR))
1365                 retval = newSVsv(*PL_stack_sp--);
1366             else
1367                 retval = &PL_sv_undef;
1368             POPSTACK;
1369             FREETMPS;
1370             LEAVE;
1371             if (PL_curcop == &PL_compiling)
1372                 PL_curcop->op_private = PL_hints;
1373
1374             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1375
1376             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1377                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1378         }
1379
1380         PL_last_swash_hv = hv;
1381         PL_last_swash_klen = klen;
1382         PL_last_swash_tmps = tmps;
1383         PL_last_swash_slen = slen;
1384         if (klen)
1385             Copy(ptr, PL_last_swash_key, klen, U8);
1386     }
1387
1388     switch ((int)((slen << 3) / needents)) {
1389     case 1:
1390         bit = 1 << (off & 7);
1391         off >>= 3;
1392         return (tmps[off] & bit) != 0;
1393     case 8:
1394         return tmps[off];
1395     case 16:
1396         off <<= 1;
1397         return (tmps[off] << 8) + tmps[off + 1] ;
1398     case 32:
1399         off <<= 2;
1400         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1401     }
1402     Perl_croak(aTHX_ "panic: swash_fetch");
1403     return 0;
1404 }
1405
1406
1407 /*
1408 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1409
1410 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1411 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1412 bytes available. The return value is the pointer to the byte after the
1413 end of the new character. In other words,
1414
1415     d = uvchr_to_utf8(d, uv);
1416
1417 is the recommended wide native character-aware way of saying
1418
1419     *(d++) = uv;
1420
1421 =cut
1422 */
1423
1424 /* On ASCII machines this is normally a macro but we want a
1425    real function in case XS code wants it
1426 */
1427 #undef Perl_uvchr_to_utf8
1428 U8 *
1429 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1430 {
1431     return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1432 }
1433
1434
1435 /*
1436 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1437
1438 Returns the native character value of the first character in the string C<s>
1439 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1440 length, in bytes, of that character.
1441
1442 Allows length and flags to be passed to low level routine.
1443
1444 =cut
1445 */
1446 /* On ASCII machines this is normally a macro but we want a
1447    real function in case XS code wants it
1448 */
1449 #undef Perl_utf8n_to_uvchr
1450 UV
1451 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1452 {
1453     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1454     return UNI_TO_NATIVE(uv);
1455 }
1456
1457