utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (ckWARN_d(WARN_UTF8)) {
  50          if (UNICODE_IS_SURROGATE(uv))
  51               Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
  52          else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
  53                   (uv == 0xFFFE || uv == 0xFFFF))
  54               Perl_warner(aTHX_ WARN_UTF8,
  55                          "Unicode character 0x%04"UVxf" is illegal", uv);
  56     }
  57     if (UNI_IS_INVARIANT(uv)) {
  58         *d++ = UTF_TO_NATIVE(uv);
  59         return d;
  60     }
  61 #if defined(EBCDIC)
  62     else {
  63         STRLEN len  = UNISKIP(uv);
  64         U8 *p = d+len-1;
  65         while (p > d) {
  66             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  67             uv >>= UTF_ACCUMULATION_SHIFT;
  68         }
  69         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  70         return d+len;
  71     }
  72 #else /* Non loop style */
  73     if (uv < 0x800) {
  74         *d++ = (( uv >>  6)         | 0xc0);
  75         *d++ = (( uv        & 0x3f) | 0x80);
  76         return d;
  77     }
  78     if (uv < 0x10000) {
  79         *d++ = (( uv >> 12)         | 0xe0);
  80         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  81         *d++ = (( uv        & 0x3f) | 0x80);
  82         return d;
  83     }
  84     if (uv < 0x200000) {
  85         *d++ = (( uv >> 18)         | 0xf0);
  86         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  87         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  88         *d++ = (( uv        & 0x3f) | 0x80);
  89         return d;
  90     }
  91     if (uv < 0x4000000) {
  92         *d++ = (( uv >> 24)         | 0xf8);
  93         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  95         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  96         *d++ = (( uv        & 0x3f) | 0x80);
  97         return d;
  98     }
  99     if (uv < 0x80000000) {
 100         *d++ = (( uv >> 30)         | 0xfc);
 101         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 102         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 103         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 104         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 105         *d++ = (( uv        & 0x3f) | 0x80);
 106         return d;
 107     }
 108 #ifdef HAS_QUAD
 109     if (uv < UTF8_QUAD_MAX)
 110 #endif
 111     {
 112         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 113         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 114         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 115         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 116         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 117         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 118         *d++ = (( uv        & 0x3f) | 0x80);
 119         return d;
 120     }
 121 #ifdef HAS_QUAD
 122     {
 123         *d++ =                        0xff;     /* Can't match U+FFFE! */
 124         *d++ =                        0x80;     /* 6 Reserved bits */
 125         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 126         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 127         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 128         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 129         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 130         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 131         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 132         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 133         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 134         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 135         *d++ = (( uv        & 0x3f) | 0x80);
 136         return d;
 137     }
 138 #endif
 139 #endif /* Loop style */
 140 }
 141
 142
 143
 144 /*
 145 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 146
 147 Tests if some arbitrary number of bytes begins in a valid UTF-8
 148 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 149 The actual number of bytes in the UTF-8 character will be returned if
 150 it is valid, otherwise 0.
 151
 152 =cut
 153 */
 154 STRLEN
 155 Perl_is_utf8_char(pTHX_ U8 *s)
 156 {
 157     U8 u = *s;
 158     STRLEN slen, len;
 159     UV uv, ouv;
 160
 161     if (UTF8_IS_INVARIANT(u))
 162         return 1;
 163
 164     if (!UTF8_IS_START(u))
 165         return 0;
 166
 167     len = UTF8SKIP(s);
 168
 169     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 170         return 0;
 171
 172     slen = len - 1;
 173     s++;
 174     u &= UTF_START_MASK(len);
 175     uv  = u;
 176     ouv = uv;
 177     while (slen--) {
 178         if (!UTF8_IS_CONTINUATION(*s))
 179             return 0;
 180         uv = UTF8_ACCUMULATE(uv, *s);
 181         if (uv < ouv)
 182             return 0;
 183         ouv = uv;
 184         s++;
 185     }
 186
 187     if (UNISKIP(uv) < len)
 188         return 0;
 189
 190     return len;
 191 }
 192
 193 /*
 194 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 195
 196 Returns true if first C<len> bytes of the given string form a valid UTF8
 197 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 198 'a string that contains UTF8' because a valid ASCII string is a valid
 199 UTF8 string.
 200
 201 =cut
 202 */
 203
 204 bool
 205 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 206 {
 207     U8* x = s;
 208     U8* send;
 209     STRLEN c;
 210
 211     if (!len)
 212         len = strlen((char *)s);
 213     send = s + len;
 214
 215     while (x < send) {
 216         c = is_utf8_char(x);
 217         if (!c)
 218             return FALSE;
 219         x += c;
 220     }
 221     if (x != send)
 222         return FALSE;
 223
 224     return TRUE;
 225 }
 226
 227 /*
 228 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 229
 230 Bottom level UTF-8 decode routine.
 231 Returns the unicode code point value of the first character in the string C<s>
 232 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 233 C<retlen> will be set to the length, in bytes, of that character.
 234
 235 If C<s> does not point to a well-formed UTF8 character, the behaviour
 236 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 237 it is assumed that the caller will raise a warning, and this function
 238 will silently just set C<retlen> to C<-1> and return zero.  If the
 239 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 240 malformations will be given, C<retlen> will be set to the expected
 241 length of the UTF-8 character in bytes, and zero will be returned.
 242
 243 The C<flags> can also contain various flags to allow deviations from
 244 the strict UTF-8 encoding (see F<utf8.h>).
 245
 246 Most code should use utf8_to_uvchr() rather than call this directly.
 247
 248 =cut
 249 */
 250
 251 UV
 252 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 253 {
 254     UV uv = *s, ouv = 0;
 255     STRLEN len = 1;
 256     bool dowarn = ckWARN_d(WARN_UTF8);
 257     STRLEN expectlen = 0;
 258     U32 warning = 0;
 259
 260 /* This list is a superset of the UTF8_ALLOW_XXX. */
 261
 262 #define UTF8_WARN_EMPTY                          1
 263 #define UTF8_WARN_CONTINUATION                   2
 264 #define UTF8_WARN_NON_CONTINUATION               3
 265 #define UTF8_WARN_FE_FF                          4
 266 #define UTF8_WARN_SHORT                          5
 267 #define UTF8_WARN_OVERFLOW                       6
 268 #define UTF8_WARN_SURROGATE                      7
 269 #define UTF8_WARN_BOM                            8
 270 #define UTF8_WARN_LONG                           9
 271 #define UTF8_WARN_FFFF                          10
 272
 273     if (curlen == 0 &&
 274         !(flags & UTF8_ALLOW_EMPTY)) {
 275         warning = UTF8_WARN_EMPTY;
 276         goto malformed;
 277     }
 278
 279     if (UTF8_IS_INVARIANT(uv)) {
 280         if (retlen)
 281             *retlen = 1;
 282         return (UV) (NATIVE_TO_UTF(*s));
 283     }
 284
 285     if (UTF8_IS_CONTINUATION(uv) &&
 286         !(flags & UTF8_ALLOW_CONTINUATION)) {
 287         warning = UTF8_WARN_CONTINUATION;
 288         goto malformed;
 289     }
 290
 291     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 292         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 293         warning = UTF8_WARN_NON_CONTINUATION;
 294         goto malformed;
 295     }
 296
 297 #ifdef EBCDIC
 298     uv = NATIVE_TO_UTF(uv);
 299 #else
 300     if ((uv == 0xfe || uv == 0xff) &&
 301         !(flags & UTF8_ALLOW_FE_FF)) {
 302         warning = UTF8_WARN_FE_FF;
 303         goto malformed;
 304     }
 305 #endif
 306
 307     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 308     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 309     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 310     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 311 #ifdef EBCDIC
 312     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 313     else                        { len =  7; uv &= 0x01; }
 314 #else
 315     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 316     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 317     else                        { len = 13; uv = 0; } /* whoa! */
 318 #endif
 319
 320     if (retlen)
 321         *retlen = len;
 322
 323     expectlen = len;
 324
 325     if ((curlen < expectlen) &&
 326         !(flags & UTF8_ALLOW_SHORT)) {
 327         warning = UTF8_WARN_SHORT;
 328         goto malformed;
 329     }
 330
 331     len--;
 332     s++;
 333     ouv = uv;
 334
 335     while (len--) {
 336         if (!UTF8_IS_CONTINUATION(*s) &&
 337             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 338             s--;
 339             warning = UTF8_WARN_NON_CONTINUATION;
 340             goto malformed;
 341         }
 342         else
 343             uv = UTF8_ACCUMULATE(uv, *s);
 344         if (!(uv > ouv)) {
 345             /* These cannot be allowed. */
 346             if (uv == ouv) {
 347                 if (!(flags & UTF8_ALLOW_LONG)) {
 348                     warning = UTF8_WARN_LONG;
 349                     goto malformed;
 350                 }
 351             }
 352             else { /* uv < ouv */
 353                 /* This cannot be allowed. */
 354                 warning = UTF8_WARN_OVERFLOW;
 355                 goto malformed;
 356             }
 357         }
 358         s++;
 359         ouv = uv;
 360     }
 361
 362     if (UNICODE_IS_SURROGATE(uv) &&
 363         !(flags & UTF8_ALLOW_SURROGATE)) {
 364         warning = UTF8_WARN_SURROGATE;
 365         goto malformed;
 366     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 367                !(flags & UTF8_ALLOW_BOM)) {
 368         warning = UTF8_WARN_BOM;
 369         goto malformed;
 370     } else if ((expectlen > UNISKIP(uv)) &&
 371                !(flags & UTF8_ALLOW_LONG)) {
 372         warning = UTF8_WARN_LONG;
 373         goto malformed;
 374     } else if (UNICODE_IS_ILLEGAL(uv) &&
 375                !(flags & UTF8_ALLOW_FFFF)) {
 376         warning = UTF8_WARN_FFFF;
 377         goto malformed;
 378     }
 379
 380     return uv;
 381
 382 malformed:
 383
 384     if (flags & UTF8_CHECK_ONLY) {
 385         if (retlen)
 386             *retlen = -1;
 387         return 0;
 388     }
 389
 390     if (dowarn) {
 391         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 392
 393         switch (warning) {
 394         case 0: /* Intentionally empty. */ break;
 395         case UTF8_WARN_EMPTY:
 396             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 397             break;
 398         case UTF8_WARN_CONTINUATION:
 399             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 400             break;
 401         case UTF8_WARN_NON_CONTINUATION:
 402             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 403                            (UV)s[1], uv);
 404             break;
 405         case UTF8_WARN_FE_FF:
 406             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 407             break;
 408         case UTF8_WARN_SHORT:
 409             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 410                            curlen, curlen == 1 ? "" : "s", expectlen);
 411             expectlen = curlen;         /* distance for caller to skip */
 412             break;
 413         case UTF8_WARN_OVERFLOW:
 414             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 415                            ouv, *s);
 416             break;
 417         case UTF8_WARN_SURROGATE:
 418             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 419             break;
 420         case UTF8_WARN_BOM:
 421             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 422             break;
 423         case UTF8_WARN_LONG:
 424             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 425                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 426             break;
 427         case UTF8_WARN_FFFF:
 428             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 429             break;
 430         default:
 431             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 432             break;
 433         }
 434
 435         if (warning) {
 436             char *s = SvPVX(sv);
 437
 438             if (PL_op)
 439                 Perl_warner(aTHX_ WARN_UTF8,
 440                             "%s in %s", s,  OP_DESC(PL_op));
 441             else
 442                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 443         }
 444     }
 445
 446     if (retlen)
 447         *retlen = expectlen ? expectlen : len;
 448
 449     return 0;
 450 }
 451
 452 /*
 453 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 454
 455 Returns the native character value of the first character in the string C<s>
 456 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 457 length, in bytes, of that character.
 458
 459 If C<s> does not point to a well-formed UTF8 character, zero is
 460 returned and retlen is set, if possible, to -1.
 461
 462 =cut
 463 */
 464
 465 UV
 466 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 467 {
 468     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 469 }
 470
 471 /*
 472 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 473
 474 Returns the Unicode code point of the first character in the string C<s>
 475 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 476 length, in bytes, of that character.
 477
 478 This function should only be used when returned UV is considered
 479 an index into the Unicode semantic tables (e.g. swashes).
 480
 481 If C<s> does not point to a well-formed UTF8 character, zero is
 482 returned and retlen is set, if possible, to -1.
 483
 484 =cut
 485 */
 486
 487 UV
 488 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 489 {
 490     /* Call the low level routine asking for checks */
 491     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 492 }
 493
 494 /*
 495 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 496
 497 Return the length of the UTF-8 char encoded string C<s> in characters.
 498 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 499 up past C<e>, croaks.
 500
 501 =cut
 502 */
 503
 504 STRLEN
 505 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 506 {
 507     STRLEN len = 0;
 508
 509     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 510      * the bitops (especially ~) can create illegal UTF-8.
 511      * In other words: in Perl UTF-8 is not just for Unicode. */
 512
 513     if (e < s)
 514         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 515     while (s < e) {
 516         U8 t = UTF8SKIP(s);
 517
 518         if (e - s < t)
 519             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 520         s += t;
 521         len++;
 522     }
 523
 524     return len;
 525 }
 526
 527 /*
 528 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 529
 530 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 531 and C<b>.
 532
 533 WARNING: use only if you *know* that the pointers point inside the
 534 same UTF-8 buffer.
 535
 536 =cut
 537 */
 538
 539 IV
 540 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 541 {
 542     IV off = 0;
 543
 544     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 545      * the bitops (especially ~) can create illegal UTF-8.
 546      * In other words: in Perl UTF-8 is not just for Unicode. */
 547
 548     if (a < b) {
 549         while (a < b) {
 550             U8 c = UTF8SKIP(a);
 551
 552             if (b - a < c)
 553                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 554             a += c;
 555             off--;
 556         }
 557     }
 558     else {
 559         while (b < a) {
 560             U8 c = UTF8SKIP(b);
 561
 562             if (a - b < c)
 563                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 564             b += c;
 565             off++;
 566         }
 567     }
 568
 569     return off;
 570 }
 571
 572 /*
 573 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 574
 575 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 576 forward or backward.
 577
 578 WARNING: do not use the following unless you *know* C<off> is within
 579 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 580 on the first byte of character or just after the last byte of a character.
 581
 582 =cut
 583 */
 584
 585 U8 *
 586 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 587 {
 588     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 589      * the bitops (especially ~) can create illegal UTF-8.
 590      * In other words: in Perl UTF-8 is not just for Unicode. */
 591
 592     if (off >= 0) {
 593         while (off--)
 594             s += UTF8SKIP(s);
 595     }
 596     else {
 597         while (off++) {
 598             s--;
 599             while (UTF8_IS_CONTINUATION(*s))
 600                 s--;
 601         }
 602     }
 603     return s;
 604 }
 605
 606 /*
 607 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 608
 609 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 610 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 611 updates len to contain the new length.
 612 Returns zero on failure, setting C<len> to -1.
 613
 614 =cut
 615 */
 616
 617 U8 *
 618 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 619 {
 620     U8 *send;
 621     U8 *d;
 622     U8 *save = s;
 623
 624     /* ensure valid UTF8 and chars < 256 before updating string */
 625     for (send = s + *len; s < send; ) {
 626         U8 c = *s++;
 627
 628         if (!UTF8_IS_INVARIANT(c) &&
 629             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 630              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 631             *len = -1;
 632             return 0;
 633         }
 634     }
 635
 636     d = s = save;
 637     while (s < send) {
 638         STRLEN ulen;
 639         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 640         s += ulen;
 641     }
 642     *d = '\0';
 643     *len = d - save;
 644     return save;
 645 }
 646
 647 /*
 648 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 649
 650 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 651 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 652 the newly-created string, and updates C<len> to contain the new
 653 length.  Returns the original string if no conversion occurs, C<len>
 654 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 655 0 if C<s> is converted or contains all 7bit characters.
 656
 657 =cut
 658 */
 659
 660 U8 *
 661 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 662 {
 663     U8 *d;
 664     U8 *start = s;
 665     U8 *send;
 666     I32 count = 0;
 667
 668     if (!*is_utf8)
 669         return start;
 670
 671     /* ensure valid UTF8 and chars < 256 before converting string */
 672     for (send = s + *len; s < send;) {
 673         U8 c = *s++;
 674         if (!UTF8_IS_INVARIANT(c)) {
 675             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 676                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 677                 count++;
 678             else
 679                 return start;
 680         }
 681     }
 682
 683     *is_utf8 = 0;
 684
 685     Newz(801, d, (*len) - count + 1, U8);
 686     s = start; start = d;
 687     while (s < send) {
 688         U8 c = *s++;
 689         if (!UTF8_IS_INVARIANT(c)) {
 690             /* Then it is two-byte encoded */
 691             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 692             c = ASCII_TO_NATIVE(c);
 693         }
 694         *d++ = c;
 695     }
 696     *d = '\0';
 697     *len = d - start;
 698     return start;
 699 }
 700
 701 /*
 702 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 703
 704 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 705 Returns a pointer to the newly-created string, and sets C<len> to
 706 reflect the new length.
 707
 708 =cut
 709 */
 710
 711 U8*
 712 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 713 {
 714     U8 *send;
 715     U8 *d;
 716     U8 *dst;
 717     send = s + (*len);
 718
 719     Newz(801, d, (*len) * 2 + 1, U8);
 720     dst = d;
 721
 722     while (s < send) {
 723         UV uv = NATIVE_TO_ASCII(*s++);
 724         if (UNI_IS_INVARIANT(uv))
 725             *d++ = UTF_TO_NATIVE(uv);
 726         else {
 727             *d++ = UTF8_EIGHT_BIT_HI(uv);
 728             *d++ = UTF8_EIGHT_BIT_LO(uv);
 729         }
 730     }
 731     *d = '\0';
 732     *len = d-dst;
 733     return dst;
 734 }
 735
 736 /*
 737  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 738  *
 739  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 740  * We optimize for native, for obvious reasons. */
 741
 742 U8*
 743 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 744 {
 745     U8* pend;
 746     U8* dstart = d;
 747
 748     if (bytelen & 1)
 749         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 750
 751     pend = p + bytelen;
 752
 753     while (p < pend) {
 754         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 755         p += 2;
 756         if (uv < 0x80) {
 757             *d++ = uv;
 758             continue;
 759         }
 760         if (uv < 0x800) {
 761             *d++ = (( uv >>  6)         | 0xc0);
 762             *d++ = (( uv        & 0x3f) | 0x80);
 763             continue;
 764         }
 765         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 766             UV low = *p++;
 767             if (low < 0xdc00 || low >= 0xdfff)
 768                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 769             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 770         }
 771         if (uv < 0x10000) {
 772             *d++ = (( uv >> 12)         | 0xe0);
 773             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 774             *d++ = (( uv        & 0x3f) | 0x80);
 775             continue;
 776         }
 777         else {
 778             *d++ = (( uv >> 18)         | 0xf0);
 779             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 780             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 781             *d++ = (( uv        & 0x3f) | 0x80);
 782             continue;
 783         }
 784     }
 785     *newlen = d - dstart;
 786     return d;
 787 }
 788
 789 /* Note: this one is slightly destructive of the source. */
 790
 791 U8*
 792 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 793 {
 794     U8* s = (U8*)p;
 795     U8* send = s + bytelen;
 796     while (s < send) {
 797         U8 tmp = s[0];
 798         s[0] = s[1];
 799         s[1] = tmp;
 800         s += 2;
 801     }
 802     return utf16_to_utf8(p, d, bytelen, newlen);
 803 }
 804
 805 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 806
 807 bool
 808 Perl_is_uni_alnum(pTHX_ UV c)
 809 {
 810     U8 tmpbuf[UTF8_MAXLEN+1];
 811     uvchr_to_utf8(tmpbuf, (UV)c);
 812     return is_utf8_alnum(tmpbuf);
 813 }
 814
 815 bool
 816 Perl_is_uni_alnumc(pTHX_ UV c)
 817 {
 818     U8 tmpbuf[UTF8_MAXLEN+1];
 819     uvchr_to_utf8(tmpbuf, (UV)c);
 820     return is_utf8_alnumc(tmpbuf);
 821 }
 822
 823 bool
 824 Perl_is_uni_idfirst(pTHX_ UV c)
 825 {
 826     U8 tmpbuf[UTF8_MAXLEN+1];
 827     uvchr_to_utf8(tmpbuf, (UV)c);
 828     return is_utf8_idfirst(tmpbuf);
 829 }
 830
 831 bool
 832 Perl_is_uni_alpha(pTHX_ UV c)
 833 {
 834     U8 tmpbuf[UTF8_MAXLEN+1];
 835     uvchr_to_utf8(tmpbuf, (UV)c);
 836     return is_utf8_alpha(tmpbuf);
 837 }
 838
 839 bool
 840 Perl_is_uni_ascii(pTHX_ UV c)
 841 {
 842     U8 tmpbuf[UTF8_MAXLEN+1];
 843     uvchr_to_utf8(tmpbuf, (UV)c);
 844     return is_utf8_ascii(tmpbuf);
 845 }
 846
 847 bool
 848 Perl_is_uni_space(pTHX_ UV c)
 849 {
 850     U8 tmpbuf[UTF8_MAXLEN+1];
 851     uvchr_to_utf8(tmpbuf, (UV)c);
 852     return is_utf8_space(tmpbuf);
 853 }
 854
 855 bool
 856 Perl_is_uni_digit(pTHX_ UV c)
 857 {
 858     U8 tmpbuf[UTF8_MAXLEN+1];
 859     uvchr_to_utf8(tmpbuf, (UV)c);
 860     return is_utf8_digit(tmpbuf);
 861 }
 862
 863 bool
 864 Perl_is_uni_upper(pTHX_ UV c)
 865 {
 866     U8 tmpbuf[UTF8_MAXLEN+1];
 867     uvchr_to_utf8(tmpbuf, (UV)c);
 868     return is_utf8_upper(tmpbuf);
 869 }
 870
 871 bool
 872 Perl_is_uni_lower(pTHX_ UV c)
 873 {
 874     U8 tmpbuf[UTF8_MAXLEN+1];
 875     uvchr_to_utf8(tmpbuf, (UV)c);
 876     return is_utf8_lower(tmpbuf);
 877 }
 878
 879 bool
 880 Perl_is_uni_cntrl(pTHX_ UV c)
 881 {
 882     U8 tmpbuf[UTF8_MAXLEN+1];
 883     uvchr_to_utf8(tmpbuf, (UV)c);
 884     return is_utf8_cntrl(tmpbuf);
 885 }
 886
 887 bool
 888 Perl_is_uni_graph(pTHX_ UV c)
 889 {
 890     U8 tmpbuf[UTF8_MAXLEN+1];
 891     uvchr_to_utf8(tmpbuf, (UV)c);
 892     return is_utf8_graph(tmpbuf);
 893 }
 894
 895 bool
 896 Perl_is_uni_print(pTHX_ UV c)
 897 {
 898     U8 tmpbuf[UTF8_MAXLEN+1];
 899     uvchr_to_utf8(tmpbuf, (UV)c);
 900     return is_utf8_print(tmpbuf);
 901 }
 902
 903 bool
 904 Perl_is_uni_punct(pTHX_ UV c)
 905 {
 906     U8 tmpbuf[UTF8_MAXLEN+1];
 907     uvchr_to_utf8(tmpbuf, (UV)c);
 908     return is_utf8_punct(tmpbuf);
 909 }
 910
 911 bool
 912 Perl_is_uni_xdigit(pTHX_ UV c)
 913 {
 914     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 915     uvchr_to_utf8(tmpbuf, (UV)c);
 916     return is_utf8_xdigit(tmpbuf);
 917 }
 918
 919 UV
 920 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 921 {
 922     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 923     uvchr_to_utf8(tmpbuf, (UV)c);
 924     return to_utf8_upper(tmpbuf, p, lenp);
 925 }
 926
 927 UV
 928 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 929 {
 930     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 931     uvchr_to_utf8(tmpbuf, (UV)c);
 932     return to_utf8_title(tmpbuf, p, lenp);
 933 }
 934
 935 UV
 936 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 937 {
 938     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 939     uvchr_to_utf8(tmpbuf, (UV)c);
 940     return to_utf8_lower(tmpbuf, p, lenp);
 941 }
 942
 943 UV
 944 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
 945 {
 946     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
 947     uvchr_to_utf8(tmpbuf, (UV)c);
 948     return to_utf8_fold(tmpbuf, p, lenp);
 949 }
 950
 951 /* for now these all assume no locale info available for Unicode > 255 */
 952
 953 bool
 954 Perl_is_uni_alnum_lc(pTHX_ UV c)
 955 {
 956     return is_uni_alnum(c);     /* XXX no locale support yet */
 957 }
 958
 959 bool
 960 Perl_is_uni_alnumc_lc(pTHX_ UV c)
 961 {
 962     return is_uni_alnumc(c);    /* XXX no locale support yet */
 963 }
 964
 965 bool
 966 Perl_is_uni_idfirst_lc(pTHX_ UV c)
 967 {
 968     return is_uni_idfirst(c);   /* XXX no locale support yet */
 969 }
 970
 971 bool
 972 Perl_is_uni_alpha_lc(pTHX_ UV c)
 973 {
 974     return is_uni_alpha(c);     /* XXX no locale support yet */
 975 }
 976
 977 bool
 978 Perl_is_uni_ascii_lc(pTHX_ UV c)
 979 {
 980     return is_uni_ascii(c);     /* XXX no locale support yet */
 981 }
 982
 983 bool
 984 Perl_is_uni_space_lc(pTHX_ UV c)
 985 {
 986     return is_uni_space(c);     /* XXX no locale support yet */
 987 }
 988
 989 bool
 990 Perl_is_uni_digit_lc(pTHX_ UV c)
 991 {
 992     return is_uni_digit(c);     /* XXX no locale support yet */
 993 }
 994
 995 bool
 996 Perl_is_uni_upper_lc(pTHX_ UV c)
 997 {
 998     return is_uni_upper(c);     /* XXX no locale support yet */
 999 }
1000
1001 bool
1002 Perl_is_uni_lower_lc(pTHX_ UV c)
1003 {
1004     return is_uni_lower(c);     /* XXX no locale support yet */
1005 }
1006
1007 bool
1008 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1009 {
1010     return is_uni_cntrl(c);     /* XXX no locale support yet */
1011 }
1012
1013 bool
1014 Perl_is_uni_graph_lc(pTHX_ UV c)
1015 {
1016     return is_uni_graph(c);     /* XXX no locale support yet */
1017 }
1018
1019 bool
1020 Perl_is_uni_print_lc(pTHX_ UV c)
1021 {
1022     return is_uni_print(c);     /* XXX no locale support yet */
1023 }
1024
1025 bool
1026 Perl_is_uni_punct_lc(pTHX_ UV c)
1027 {
1028     return is_uni_punct(c);     /* XXX no locale support yet */
1029 }
1030
1031 bool
1032 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1033 {
1034     return is_uni_xdigit(c);    /* XXX no locale support yet */
1035 }
1036
1037 bool
1038 Perl_is_utf8_alnum(pTHX_ U8 *p)
1039 {
1040     if (!is_utf8_char(p))
1041         return FALSE;
1042     if (!PL_utf8_alnum)
1043         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1044          * descendant of isalnum(3), in other words, it doesn't
1045          * contain the '_'. --jhi */
1046         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1047     return swash_fetch(PL_utf8_alnum, p, TRUE);
1048 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1049 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1050     if (!PL_utf8_alnum)
1051         PL_utf8_alnum = swash_init("utf8", "",
1052             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1053     return swash_fetch(PL_utf8_alnum, p, TRUE);
1054 #endif
1055 }
1056
1057 bool
1058 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1059 {
1060     if (!is_utf8_char(p))
1061         return FALSE;
1062     if (!PL_utf8_alnum)
1063         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1064     return swash_fetch(PL_utf8_alnum, p, TRUE);
1065 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1066 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1067     if (!PL_utf8_alnum)
1068         PL_utf8_alnum = swash_init("utf8", "",
1069             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1070     return swash_fetch(PL_utf8_alnum, p, TRUE);
1071 #endif
1072 }
1073
1074 bool
1075 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1076 {
1077     return *p == '_' || is_utf8_alpha(p);
1078 }
1079
1080 bool
1081 Perl_is_utf8_alpha(pTHX_ U8 *p)
1082 {
1083     if (!is_utf8_char(p))
1084         return FALSE;
1085     if (!PL_utf8_alpha)
1086         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1087     return swash_fetch(PL_utf8_alpha, p, TRUE);
1088 }
1089
1090 bool
1091 Perl_is_utf8_ascii(pTHX_ U8 *p)
1092 {
1093     if (!is_utf8_char(p))
1094         return FALSE;
1095     if (!PL_utf8_ascii)
1096         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1097     return swash_fetch(PL_utf8_ascii, p, TRUE);
1098 }
1099
1100 bool
1101 Perl_is_utf8_space(pTHX_ U8 *p)
1102 {
1103     if (!is_utf8_char(p))
1104         return FALSE;
1105     if (!PL_utf8_space)
1106         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1107     return swash_fetch(PL_utf8_space, p, TRUE);
1108 }
1109
1110 bool
1111 Perl_is_utf8_digit(pTHX_ U8 *p)
1112 {
1113     if (!is_utf8_char(p))
1114         return FALSE;
1115     if (!PL_utf8_digit)
1116         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1117     return swash_fetch(PL_utf8_digit, p, TRUE);
1118 }
1119
1120 bool
1121 Perl_is_utf8_upper(pTHX_ U8 *p)
1122 {
1123     if (!is_utf8_char(p))
1124         return FALSE;
1125     if (!PL_utf8_upper)
1126         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1127     return swash_fetch(PL_utf8_upper, p, TRUE);
1128 }
1129
1130 bool
1131 Perl_is_utf8_lower(pTHX_ U8 *p)
1132 {
1133     if (!is_utf8_char(p))
1134         return FALSE;
1135     if (!PL_utf8_lower)
1136         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1137     return swash_fetch(PL_utf8_lower, p, TRUE);
1138 }
1139
1140 bool
1141 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1142 {
1143     if (!is_utf8_char(p))
1144         return FALSE;
1145     if (!PL_utf8_cntrl)
1146         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1147     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1148 }
1149
1150 bool
1151 Perl_is_utf8_graph(pTHX_ U8 *p)
1152 {
1153     if (!is_utf8_char(p))
1154         return FALSE;
1155     if (!PL_utf8_graph)
1156         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1157     return swash_fetch(PL_utf8_graph, p, TRUE);
1158 }
1159
1160 bool
1161 Perl_is_utf8_print(pTHX_ U8 *p)
1162 {
1163     if (!is_utf8_char(p))
1164         return FALSE;
1165     if (!PL_utf8_print)
1166         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1167     return swash_fetch(PL_utf8_print, p, TRUE);
1168 }
1169
1170 bool
1171 Perl_is_utf8_punct(pTHX_ U8 *p)
1172 {
1173     if (!is_utf8_char(p))
1174         return FALSE;
1175     if (!PL_utf8_punct)
1176         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1177     return swash_fetch(PL_utf8_punct, p, TRUE);
1178 }
1179
1180 bool
1181 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1182 {
1183     if (!is_utf8_char(p))
1184         return FALSE;
1185     if (!PL_utf8_xdigit)
1186         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1187     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1188 }
1189
1190 bool
1191 Perl_is_utf8_mark(pTHX_ U8 *p)
1192 {
1193     if (!is_utf8_char(p))
1194         return FALSE;
1195     if (!PL_utf8_mark)
1196         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1197     return swash_fetch(PL_utf8_mark, p, TRUE);
1198 }
1199
1200 /*
1201 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1202
1203 The "p" contains the pointer to the UTF-8 string encoding
1204 the character that is being converted.
1205
1206 The "ustrp" is a pointer to the character buffer to put the
1207 conversion result to.  The "lenp" is a pointer to the length
1208 of the result.
1209
1210 The "swash" is a pointer to the swash to use.
1211
1212 The "normal" is a string like "ToLower" which means the swash
1213 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1214 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1215
1216 The "special" is a string like "utf8::ToSpecLower", which means
1217 the hash %utf8::ToSpecLower, which is stored in the same file,
1218 lib/unicore/To/Lower.pl, and also loaded by SWASHGET.  The access
1219 to the hash is by Perl_to_utf8_case().
1220
1221 =cut
1222  */
1223
1224 UV
1225 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
1226 {
1227     UV uv;
1228
1229     if (!*swashp)
1230         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1231     uv = swash_fetch(*swashp, p, TRUE);
1232     if (uv)
1233          uv = UNI_TO_NATIVE(uv);
1234     else {
1235          HV *hv;
1236          SV *keysv;
1237          HE *he;
1238
1239          uv = utf8_to_uvchr(p, 0);
1240
1241          if ((hv    = get_hv(special, FALSE)) &&
1242              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1243              (he    = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1244               SV *val = HeVAL(he);
1245               char *s = SvPV(val, *lenp);
1246               U8 c = *(U8*)s;
1247               if (*lenp > 1 || UNI_IS_INVARIANT(c))
1248                    Copy(s, ustrp, *lenp, U8);
1249               else {
1250                    /* something in the 0x80..0xFF range */
1251                    ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1252                    ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1253                    *lenp = 2;
1254               }
1255               return 0;
1256          }
1257     }
1258     if (lenp)
1259        *lenp = UNISKIP(uv);
1260     uvuni_to_utf8(ustrp, uv);
1261     return uv;
1262 }
1263
1264 UV
1265 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1266 {
1267     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1268                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1269 }
1270
1271 UV
1272 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1273 {
1274     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1275                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1276 }
1277
1278 UV
1279 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1280 {
1281     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1282                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1283 }
1284
1285 UV
1286 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1287 {
1288     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1289                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1290 }
1291
1292 /* a "swash" is a swatch hash */
1293
1294 SV*
1295 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1296 {
1297     SV* retval;
1298     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1299     dSP;
1300     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1301     SV* errsv_save;
1302
1303     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1304         ENTER;
1305         errsv_save = newSVsv(ERRSV);
1306         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1307         if (!SvTRUE(ERRSV))
1308             sv_setsv(ERRSV, errsv_save);
1309         SvREFCNT_dec(errsv_save);
1310         LEAVE;
1311     }
1312     SPAGAIN;
1313     PUSHSTACKi(PERLSI_MAGIC);
1314     PUSHMARK(SP);
1315     EXTEND(SP,5);
1316     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1317     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1318     PUSHs(listsv);
1319     PUSHs(sv_2mortal(newSViv(minbits)));
1320     PUSHs(sv_2mortal(newSViv(none)));
1321     PUTBACK;
1322     ENTER;
1323     SAVEI32(PL_hints);
1324     PL_hints = 0;
1325     save_re_context();
1326     if (PL_curcop == &PL_compiling)
1327         /* XXX ought to be handled by lex_start */
1328         sv_setpv(tokenbufsv, PL_tokenbuf);
1329     errsv_save = newSVsv(ERRSV);
1330     if (call_method("SWASHNEW", G_SCALAR))
1331         retval = newSVsv(*PL_stack_sp--);
1332     else
1333         retval = &PL_sv_undef;
1334     if (!SvTRUE(ERRSV))
1335         sv_setsv(ERRSV, errsv_save);
1336     SvREFCNT_dec(errsv_save);
1337     LEAVE;
1338     POPSTACK;
1339     if (PL_curcop == &PL_compiling) {
1340         STRLEN len;
1341         char* pv = SvPV(tokenbufsv, len);
1342
1343         Copy(pv, PL_tokenbuf, len+1, char);
1344         PL_curcop->op_private = PL_hints;
1345     }
1346     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1347         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1348     return retval;
1349 }
1350
1351
1352 /* This API is wrong for special case conversions since we may need to
1353  * return several Unicode characters for a single Unicode character
1354  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1355  * the lower-level routine, and it is similarly broken for returning
1356  * multiple values.  --jhi */
1357 UV
1358 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1359 {
1360     HV* hv = (HV*)SvRV(sv);
1361     U32 klen;
1362     U32 off;
1363     STRLEN slen;
1364     STRLEN needents;
1365     U8 *tmps = NULL;
1366     U32 bit;
1367     SV *retval;
1368     U8 tmputf8[2];
1369     UV c = NATIVE_TO_ASCII(*ptr);
1370
1371     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1372         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1373         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1374         ptr = tmputf8;
1375     }
1376     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1377      * then the "swatch" is a vec() for al the chars which start
1378      * with 0xAA..0xYY
1379      * So the key in the hash (klen) is length of encoded char -1
1380      */
1381     klen = UTF8SKIP(ptr) - 1;
1382     off  = ptr[klen];
1383
1384     if (klen == 0)
1385      {
1386       /* If char in invariant then swatch is for all the invariant chars
1387        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1388        */
1389       needents = UTF_CONTINUATION_MARK;
1390       off      = NATIVE_TO_UTF(ptr[klen]);
1391      }
1392     else
1393      {
1394       /* If char is encoded then swatch is for the prefix */
1395       needents = (1 << UTF_ACCUMULATION_SHIFT);
1396       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1397      }
1398
1399     /*
1400      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1401      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1402      * it's nothing to sniff at.)  Pity we usually come through at least
1403      * two function calls to get here...
1404      *
1405      * NB: this code assumes that swatches are never modified, once generated!
1406      */
1407
1408     if (hv   == PL_last_swash_hv &&
1409         klen == PL_last_swash_klen &&
1410         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1411     {
1412         tmps = PL_last_swash_tmps;
1413         slen = PL_last_swash_slen;
1414     }
1415     else {
1416         /* Try our second-level swatch cache, kept in a hash. */
1417         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1418
1419         /* If not cached, generate it via utf8::SWASHGET */
1420         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1421             dSP;
1422             /* We use utf8n_to_uvuni() as we want an index into
1423                Unicode tables, not a native character number.
1424              */
1425             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1426             SV *errsv_save;
1427             ENTER;
1428             SAVETMPS;
1429             save_re_context();
1430             PUSHSTACKi(PERLSI_MAGIC);
1431             PUSHMARK(SP);
1432             EXTEND(SP,3);
1433             PUSHs((SV*)sv);
1434             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1435             PUSHs(sv_2mortal(newSViv((klen) ?
1436                                      (code_point & ~(needents - 1)) : 0)));
1437             PUSHs(sv_2mortal(newSViv(needents)));
1438             PUTBACK;
1439             errsv_save = newSVsv(ERRSV);
1440             if (call_method("SWASHGET", G_SCALAR))
1441                 retval = newSVsv(*PL_stack_sp--);
1442             else
1443                 retval = &PL_sv_undef;
1444             if (!SvTRUE(ERRSV))
1445                 sv_setsv(ERRSV, errsv_save);
1446             SvREFCNT_dec(errsv_save);
1447             POPSTACK;
1448             FREETMPS;
1449             LEAVE;
1450             if (PL_curcop == &PL_compiling)
1451                 PL_curcop->op_private = PL_hints;
1452
1453             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1454
1455             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1456                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1457         }
1458
1459         PL_last_swash_hv = hv;
1460         PL_last_swash_klen = klen;
1461         PL_last_swash_tmps = tmps;
1462         PL_last_swash_slen = slen;
1463         if (klen)
1464             Copy(ptr, PL_last_swash_key, klen, U8);
1465     }
1466
1467     switch ((int)((slen << 3) / needents)) {
1468     case 1:
1469         bit = 1 << (off & 7);
1470         off >>= 3;
1471         return (tmps[off] & bit) != 0;
1472     case 8:
1473         return tmps[off];
1474     case 16:
1475         off <<= 1;
1476         return (tmps[off] << 8) + tmps[off + 1] ;
1477     case 32:
1478         off <<= 2;
1479         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1480     }
1481     Perl_croak(aTHX_ "panic: swash_fetch");
1482     return 0;
1483 }
1484
1485
1486 /*
1487 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1488
1489 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1490 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1491 bytes available. The return value is the pointer to the byte after the
1492 end of the new character. In other words,
1493
1494     d = uvchr_to_utf8(d, uv);
1495
1496 is the recommended wide native character-aware way of saying
1497
1498     *(d++) = uv;
1499
1500 =cut
1501 */
1502
1503 /* On ASCII machines this is normally a macro but we want a
1504    real function in case XS code wants it
1505 */
1506 #undef Perl_uvchr_to_utf8
1507 U8 *
1508 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1509 {
1510     return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1511 }
1512
1513
1514 /*
1515 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1516
1517 Returns the native character value of the first character in the string C<s>
1518 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1519 length, in bytes, of that character.
1520
1521 Allows length and flags to be passed to low level routine.
1522
1523 =cut
1524 */
1525 /* On ASCII machines this is normally a macro but we want a
1526    real function in case XS code wants it
1527 */
1528 #undef Perl_utf8n_to_uvchr
1529 UV
1530 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1531 {
1532     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1533     return UNI_TO_NATIVE(uv);
1534 }
1535
1536 /*
1537 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1538
1539 Build to the scalar dsv a displayable version of the string spv,
1540 length len, the displayable version being at most pvlim bytes long
1541 (if longer, the rest is truncated and "..." will be appended).
1542 The flags argument is currently unused but available for future extensions.
1543 The pointer to the PV of the dsv is returned.
1544
1545 =cut */
1546 char *
1547 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1548 {
1549     int truncated = 0;
1550     char *s, *e;
1551
1552     sv_setpvn(dsv, "", 0);
1553     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1554          UV u;
1555          if (pvlim && SvCUR(dsv) >= pvlim) {
1556               truncated++;
1557               break;
1558          }
1559          u = utf8_to_uvchr((U8*)s, 0);
1560          Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1561     }
1562     if (truncated)
1563          sv_catpvn(dsv, "...", 3);
1564
1565     return SvPVX(dsv);
1566 }
1567
1568 /*
1569 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1570
1571 Build to the scalar dsv a displayable version of the scalar sv,
1572 he displayable version being at most pvlim bytes long
1573 (if longer, the rest is truncated and "..." will be appended).
1574 The flags argument is currently unused but available for future extensions.
1575 The pointer to the PV of the dsv is returned.
1576
1577 =cut */
1578 char *
1579 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1580 {
1581      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1582                                 pvlim, flags);
1583 }
1584
1585 /*
1586 =for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1587
1588 Return true if the strings s1 and s2 differ case-insensitively, false
1589 if not (if they are equal case-insensitively).  If u1 is true, the
1590 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1591 the string s2 is assumed to be in UTF-8-encoded Unicode.
1592
1593 For case-insensitiveness, the "casefolding" of Unicode is used
1594 instead of upper/lowercasing both the characters, see
1595 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1596
1597 =cut */
1598 I32
1599 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1600 {
1601      register U8 *a  = (U8*)s1;
1602      register U8 *b  = (U8*)s2;
1603      register U8 *ae = b + len1;
1604      register U8 *be = b + len2;
1605      STRLEN la, lb;
1606      UV ca, cb;
1607      STRLEN ulen1, ulen2;
1608      U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1609      U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
1610
1611      while (a < ae && b < be) {
1612           if (u1) {
1613                if (a + UTF8SKIP(a) > ae)
1614                     break;
1615                ca = utf8_to_uvchr((U8*)a, &la);
1616           } else {
1617                ca = *a;
1618                la = 1;
1619           }
1620           if (u2) {
1621                if (b + UTF8SKIP(b) > be)
1622                     break;
1623                cb = utf8_to_uvchr((U8*)b, &lb);
1624           } else {
1625                cb = *b;
1626                lb = 1;
1627           }
1628           if (ca != cb) {
1629                if (u1)
1630                     to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1631                else
1632                     ulen1 = 1;
1633                if (u2)
1634                     to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1635                else
1636                     ulen2 = 1;
1637                if (ulen1 != ulen2
1638                    || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1639                    || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1640                     return 1; /* mismatch */
1641           }
1642           a += la;
1643           b += lb;
1644      }
1645      return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */
1646 }
1647