utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (ckWARN_d(WARN_UTF8)) {
  50          if (UNICODE_IS_SURROGATE(uv))
  51               Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
  52          else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
  53                   (uv == 0xFFFE || uv == 0xFFFF))
  54               Perl_warner(aTHX_ WARN_UTF8,
  55                          "Unicode character 0x%04"UVxf" is illegal", uv);
  56     }
  57     if (UNI_IS_INVARIANT(uv)) {
  58         *d++ = UTF_TO_NATIVE(uv);
  59         return d;
  60     }
  61 #if defined(EBCDIC)
  62     else {
  63         STRLEN len  = UNISKIP(uv);
  64         U8 *p = d+len-1;
  65         while (p > d) {
  66             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  67             uv >>= UTF_ACCUMULATION_SHIFT;
  68         }
  69         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  70         return d+len;
  71     }
  72 #else /* Non loop style */
  73     if (uv < 0x800) {
  74         *d++ = (( uv >>  6)         | 0xc0);
  75         *d++ = (( uv        & 0x3f) | 0x80);
  76         return d;
  77     }
  78     if (uv < 0x10000) {
  79         *d++ = (( uv >> 12)         | 0xe0);
  80         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  81         *d++ = (( uv        & 0x3f) | 0x80);
  82         return d;
  83     }
  84     if (uv < 0x200000) {
  85         *d++ = (( uv >> 18)         | 0xf0);
  86         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  87         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  88         *d++ = (( uv        & 0x3f) | 0x80);
  89         return d;
  90     }
  91     if (uv < 0x4000000) {
  92         *d++ = (( uv >> 24)         | 0xf8);
  93         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  95         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  96         *d++ = (( uv        & 0x3f) | 0x80);
  97         return d;
  98     }
  99     if (uv < 0x80000000) {
 100         *d++ = (( uv >> 30)         | 0xfc);
 101         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 102         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 103         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 104         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 105         *d++ = (( uv        & 0x3f) | 0x80);
 106         return d;
 107     }
 108 #ifdef HAS_QUAD
 109     if (uv < UTF8_QUAD_MAX)
 110 #endif
 111     {
 112         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 113         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 114         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 115         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 116         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 117         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 118         *d++ = (( uv        & 0x3f) | 0x80);
 119         return d;
 120     }
 121 #ifdef HAS_QUAD
 122     {
 123         *d++ =                        0xff;     /* Can't match U+FFFE! */
 124         *d++ =                        0x80;     /* 6 Reserved bits */
 125         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 126         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 127         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 128         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 129         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 130         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 131         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 132         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 133         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 134         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 135         *d++ = (( uv        & 0x3f) | 0x80);
 136         return d;
 137     }
 138 #endif
 139 #endif /* Loop style */
 140 }
 141
 142
 143
 144 /*
 145 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 146
 147 Tests if some arbitrary number of bytes begins in a valid UTF-8
 148 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 149 The actual number of bytes in the UTF-8 character will be returned if
 150 it is valid, otherwise 0.
 151
 152 =cut
 153 */
 154 STRLEN
 155 Perl_is_utf8_char(pTHX_ U8 *s)
 156 {
 157     U8 u = *s;
 158     STRLEN slen, len;
 159     UV uv, ouv;
 160
 161     if (UTF8_IS_INVARIANT(u))
 162         return 1;
 163
 164     if (!UTF8_IS_START(u))
 165         return 0;
 166
 167     len = UTF8SKIP(s);
 168
 169     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 170         return 0;
 171
 172     slen = len - 1;
 173     s++;
 174     u &= UTF_START_MASK(len);
 175     uv  = u;
 176     ouv = uv;
 177     while (slen--) {
 178         if (!UTF8_IS_CONTINUATION(*s))
 179             return 0;
 180         uv = UTF8_ACCUMULATE(uv, *s);
 181         if (uv < ouv)
 182             return 0;
 183         ouv = uv;
 184         s++;
 185     }
 186
 187     if (UNISKIP(uv) < len)
 188         return 0;
 189
 190     return len;
 191 }
 192
 193 /*
 194 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 195
 196 Returns true if first C<len> bytes of the given string form a valid UTF8
 197 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 198 'a string that contains UTF8' because a valid ASCII string is a valid
 199 UTF8 string.
 200
 201 =cut
 202 */
 203
 204 bool
 205 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 206 {
 207     U8* x = s;
 208     U8* send;
 209     STRLEN c;
 210
 211     if (!len)
 212         len = strlen((char *)s);
 213     send = s + len;
 214
 215     while (x < send) {
 216         c = is_utf8_char(x);
 217         if (!c)
 218             return FALSE;
 219         x += c;
 220     }
 221     if (x != send)
 222         return FALSE;
 223
 224     return TRUE;
 225 }
 226
 227 /*
 228 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 229
 230 Bottom level UTF-8 decode routine.
 231 Returns the unicode code point value of the first character in the string C<s>
 232 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 233 C<retlen> will be set to the length, in bytes, of that character.
 234
 235 If C<s> does not point to a well-formed UTF8 character, the behaviour
 236 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 237 it is assumed that the caller will raise a warning, and this function
 238 will silently just set C<retlen> to C<-1> and return zero.  If the
 239 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 240 malformations will be given, C<retlen> will be set to the expected
 241 length of the UTF-8 character in bytes, and zero will be returned.
 242
 243 The C<flags> can also contain various flags to allow deviations from
 244 the strict UTF-8 encoding (see F<utf8.h>).
 245
 246 Most code should use utf8_to_uvchr() rather than call this directly.
 247
 248 =cut
 249 */
 250
 251 UV
 252 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 253 {
 254     U8 *s0 = s;
 255     UV uv = *s, ouv = 0;
 256     STRLEN len = 1;
 257     bool dowarn = ckWARN_d(WARN_UTF8);
 258     U8 startbyte = *s;
 259     STRLEN expectlen = 0;
 260     U32 warning = 0;
 261
 262 /* This list is a superset of the UTF8_ALLOW_XXX. */
 263
 264 #define UTF8_WARN_EMPTY                          1
 265 #define UTF8_WARN_CONTINUATION                   2
 266 #define UTF8_WARN_NON_CONTINUATION               3
 267 #define UTF8_WARN_FE_FF                          4
 268 #define UTF8_WARN_SHORT                          5
 269 #define UTF8_WARN_OVERFLOW                       6
 270 #define UTF8_WARN_SURROGATE                      7
 271 #define UTF8_WARN_BOM                            8
 272 #define UTF8_WARN_LONG                           9
 273 #define UTF8_WARN_FFFF                          10
 274
 275     if (curlen == 0 &&
 276         !(flags & UTF8_ALLOW_EMPTY)) {
 277         warning = UTF8_WARN_EMPTY;
 278         goto malformed;
 279     }
 280
 281     if (UTF8_IS_INVARIANT(uv)) {
 282         if (retlen)
 283             *retlen = 1;
 284         return (UV) (NATIVE_TO_UTF(*s));
 285     }
 286
 287     if (UTF8_IS_CONTINUATION(uv) &&
 288         !(flags & UTF8_ALLOW_CONTINUATION)) {
 289         warning = UTF8_WARN_CONTINUATION;
 290         goto malformed;
 291     }
 292
 293     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 294         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 295         warning = UTF8_WARN_NON_CONTINUATION;
 296         goto malformed;
 297     }
 298
 299 #ifdef EBCDIC
 300     uv = NATIVE_TO_UTF(uv);
 301 #else
 302     if ((uv == 0xfe || uv == 0xff) &&
 303         !(flags & UTF8_ALLOW_FE_FF)) {
 304         warning = UTF8_WARN_FE_FF;
 305         goto malformed;
 306     }
 307 #endif
 308
 309     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 310     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 311     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 312     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 313 #ifdef EBCDIC
 314     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 315     else                        { len =  7; uv &= 0x01; }
 316 #else
 317     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 318     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 319     else                        { len = 13; uv = 0; } /* whoa! */
 320 #endif
 321
 322     if (retlen)
 323         *retlen = len;
 324
 325     expectlen = len;
 326
 327     if ((curlen < expectlen) &&
 328         !(flags & UTF8_ALLOW_SHORT)) {
 329         warning = UTF8_WARN_SHORT;
 330         goto malformed;
 331     }
 332
 333     len--;
 334     s++;
 335     ouv = uv;
 336
 337     while (len--) {
 338         if (!UTF8_IS_CONTINUATION(*s) &&
 339             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 340             s--;
 341             warning = UTF8_WARN_NON_CONTINUATION;
 342             goto malformed;
 343         }
 344         else
 345             uv = UTF8_ACCUMULATE(uv, *s);
 346         if (!(uv > ouv)) {
 347             /* These cannot be allowed. */
 348             if (uv == ouv) {
 349                 if (!(flags & UTF8_ALLOW_LONG)) {
 350                     warning = UTF8_WARN_LONG;
 351                     goto malformed;
 352                 }
 353             }
 354             else { /* uv < ouv */
 355                 /* This cannot be allowed. */
 356                 warning = UTF8_WARN_OVERFLOW;
 357                 goto malformed;
 358             }
 359         }
 360         s++;
 361         ouv = uv;
 362     }
 363
 364     if (UNICODE_IS_SURROGATE(uv) &&
 365         !(flags & UTF8_ALLOW_SURROGATE)) {
 366         warning = UTF8_WARN_SURROGATE;
 367         goto malformed;
 368     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 369                !(flags & UTF8_ALLOW_BOM)) {
 370         warning = UTF8_WARN_BOM;
 371         goto malformed;
 372     } else if ((expectlen > UNISKIP(uv)) &&
 373                !(flags & UTF8_ALLOW_LONG)) {
 374         warning = UTF8_WARN_LONG;
 375         goto malformed;
 376     } else if (UNICODE_IS_ILLEGAL(uv) &&
 377                !(flags & UTF8_ALLOW_FFFF)) {
 378         warning = UTF8_WARN_FFFF;
 379         goto malformed;
 380     }
 381
 382     return uv;
 383
 384 malformed:
 385
 386     if (flags & UTF8_CHECK_ONLY) {
 387         if (retlen)
 388             *retlen = -1;
 389         return 0;
 390     }
 391
 392     if (dowarn) {
 393         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 394
 395         switch (warning) {
 396         case 0: /* Intentionally empty. */ break;
 397         case UTF8_WARN_EMPTY:
 398             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 399             break;
 400         case UTF8_WARN_CONTINUATION:
 401             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 402             break;
 403         case UTF8_WARN_NON_CONTINUATION:
 404             if (s == s0)
 405                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 406                            (UV)s[1], startbyte);
 407             else
 408                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 409                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 410
 411             break;
 412         case UTF8_WARN_FE_FF:
 413             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 414             break;
 415         case UTF8_WARN_SHORT:
 416             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 417                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 418             expectlen = curlen;         /* distance for caller to skip */
 419             break;
 420         case UTF8_WARN_OVERFLOW:
 421             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 422                            ouv, *s, startbyte);
 423             break;
 424         case UTF8_WARN_SURROGATE:
 425             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 426             break;
 427         case UTF8_WARN_BOM:
 428             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 429             break;
 430         case UTF8_WARN_LONG:
 431             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 432                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 433             break;
 434         case UTF8_WARN_FFFF:
 435             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 436             break;
 437         default:
 438             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 439             break;
 440         }
 441
 442         if (warning) {
 443             char *s = SvPVX(sv);
 444
 445             if (PL_op)
 446                 Perl_warner(aTHX_ WARN_UTF8,
 447                             "%s in %s", s,  OP_DESC(PL_op));
 448             else
 449                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 450         }
 451     }
 452
 453     if (retlen)
 454         *retlen = expectlen ? expectlen : len;
 455
 456     return 0;
 457 }
 458
 459 /*
 460 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 461
 462 Returns the native character value of the first character in the string C<s>
 463 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 464 length, in bytes, of that character.
 465
 466 If C<s> does not point to a well-formed UTF8 character, zero is
 467 returned and retlen is set, if possible, to -1.
 468
 469 =cut
 470 */
 471
 472 UV
 473 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 474 {
 475     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 476 }
 477
 478 /*
 479 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 480
 481 Returns the Unicode code point of the first character in the string C<s>
 482 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 483 length, in bytes, of that character.
 484
 485 This function should only be used when returned UV is considered
 486 an index into the Unicode semantic tables (e.g. swashes).
 487
 488 If C<s> does not point to a well-formed UTF8 character, zero is
 489 returned and retlen is set, if possible, to -1.
 490
 491 =cut
 492 */
 493
 494 UV
 495 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 496 {
 497     /* Call the low level routine asking for checks */
 498     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 499 }
 500
 501 /*
 502 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 503
 504 Return the length of the UTF-8 char encoded string C<s> in characters.
 505 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 506 up past C<e>, croaks.
 507
 508 =cut
 509 */
 510
 511 STRLEN
 512 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 513 {
 514     STRLEN len = 0;
 515
 516     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 517      * the bitops (especially ~) can create illegal UTF-8.
 518      * In other words: in Perl UTF-8 is not just for Unicode. */
 519
 520     if (e < s)
 521         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 522     while (s < e) {
 523         U8 t = UTF8SKIP(s);
 524
 525         if (e - s < t)
 526             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 527         s += t;
 528         len++;
 529     }
 530
 531     return len;
 532 }
 533
 534 /*
 535 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 536
 537 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 538 and C<b>.
 539
 540 WARNING: use only if you *know* that the pointers point inside the
 541 same UTF-8 buffer.
 542
 543 =cut
 544 */
 545
 546 IV
 547 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 548 {
 549     IV off = 0;
 550
 551     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 552      * the bitops (especially ~) can create illegal UTF-8.
 553      * In other words: in Perl UTF-8 is not just for Unicode. */
 554
 555     if (a < b) {
 556         while (a < b) {
 557             U8 c = UTF8SKIP(a);
 558
 559             if (b - a < c)
 560                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 561             a += c;
 562             off--;
 563         }
 564     }
 565     else {
 566         while (b < a) {
 567             U8 c = UTF8SKIP(b);
 568
 569             if (a - b < c)
 570                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 571             b += c;
 572             off++;
 573         }
 574     }
 575
 576     return off;
 577 }
 578
 579 /*
 580 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 581
 582 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 583 forward or backward.
 584
 585 WARNING: do not use the following unless you *know* C<off> is within
 586 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 587 on the first byte of character or just after the last byte of a character.
 588
 589 =cut
 590 */
 591
 592 U8 *
 593 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 594 {
 595     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 596      * the bitops (especially ~) can create illegal UTF-8.
 597      * In other words: in Perl UTF-8 is not just for Unicode. */
 598
 599     if (off >= 0) {
 600         while (off--)
 601             s += UTF8SKIP(s);
 602     }
 603     else {
 604         while (off++) {
 605             s--;
 606             while (UTF8_IS_CONTINUATION(*s))
 607                 s--;
 608         }
 609     }
 610     return s;
 611 }
 612
 613 /*
 614 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 615
 616 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 617 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 618 updates len to contain the new length.
 619 Returns zero on failure, setting C<len> to -1.
 620
 621 =cut
 622 */
 623
 624 U8 *
 625 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 626 {
 627     U8 *send;
 628     U8 *d;
 629     U8 *save = s;
 630
 631     /* ensure valid UTF8 and chars < 256 before updating string */
 632     for (send = s + *len; s < send; ) {
 633         U8 c = *s++;
 634
 635         if (!UTF8_IS_INVARIANT(c) &&
 636             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 637              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 638             *len = -1;
 639             return 0;
 640         }
 641     }
 642
 643     d = s = save;
 644     while (s < send) {
 645         STRLEN ulen;
 646         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 647         s += ulen;
 648     }
 649     *d = '\0';
 650     *len = d - save;
 651     return save;
 652 }
 653
 654 /*
 655 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 656
 657 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 658 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 659 the newly-created string, and updates C<len> to contain the new
 660 length.  Returns the original string if no conversion occurs, C<len>
 661 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 662 0 if C<s> is converted or contains all 7bit characters.
 663
 664 =cut
 665 */
 666
 667 U8 *
 668 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 669 {
 670     U8 *d;
 671     U8 *start = s;
 672     U8 *send;
 673     I32 count = 0;
 674
 675     if (!*is_utf8)
 676         return start;
 677
 678     /* ensure valid UTF8 and chars < 256 before converting string */
 679     for (send = s + *len; s < send;) {
 680         U8 c = *s++;
 681         if (!UTF8_IS_INVARIANT(c)) {
 682             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 683                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 684                 count++;
 685             else
 686                 return start;
 687         }
 688     }
 689
 690     *is_utf8 = 0;
 691
 692     Newz(801, d, (*len) - count + 1, U8);
 693     s = start; start = d;
 694     while (s < send) {
 695         U8 c = *s++;
 696         if (!UTF8_IS_INVARIANT(c)) {
 697             /* Then it is two-byte encoded */
 698             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 699             c = ASCII_TO_NATIVE(c);
 700         }
 701         *d++ = c;
 702     }
 703     *d = '\0';
 704     *len = d - start;
 705     return start;
 706 }
 707
 708 /*
 709 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 710
 711 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 712 Returns a pointer to the newly-created string, and sets C<len> to
 713 reflect the new length.
 714
 715 =cut
 716 */
 717
 718 U8*
 719 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 720 {
 721     U8 *send;
 722     U8 *d;
 723     U8 *dst;
 724     send = s + (*len);
 725
 726     Newz(801, d, (*len) * 2 + 1, U8);
 727     dst = d;
 728
 729     while (s < send) {
 730         UV uv = NATIVE_TO_ASCII(*s++);
 731         if (UNI_IS_INVARIANT(uv))
 732             *d++ = UTF_TO_NATIVE(uv);
 733         else {
 734             *d++ = UTF8_EIGHT_BIT_HI(uv);
 735             *d++ = UTF8_EIGHT_BIT_LO(uv);
 736         }
 737     }
 738     *d = '\0';
 739     *len = d-dst;
 740     return dst;
 741 }
 742
 743 /*
 744  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 745  *
 746  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 747  * We optimize for native, for obvious reasons. */
 748
 749 U8*
 750 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 751 {
 752     U8* pend;
 753     U8* dstart = d;
 754
 755     if (bytelen & 1)
 756         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 757
 758     pend = p + bytelen;
 759
 760     while (p < pend) {
 761         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 762         p += 2;
 763         if (uv < 0x80) {
 764             *d++ = uv;
 765             continue;
 766         }
 767         if (uv < 0x800) {
 768             *d++ = (( uv >>  6)         | 0xc0);
 769             *d++ = (( uv        & 0x3f) | 0x80);
 770             continue;
 771         }
 772         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 773             UV low = *p++;
 774             if (low < 0xdc00 || low >= 0xdfff)
 775                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 776             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 777         }
 778         if (uv < 0x10000) {
 779             *d++ = (( uv >> 12)         | 0xe0);
 780             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 781             *d++ = (( uv        & 0x3f) | 0x80);
 782             continue;
 783         }
 784         else {
 785             *d++ = (( uv >> 18)         | 0xf0);
 786             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 787             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 788             *d++ = (( uv        & 0x3f) | 0x80);
 789             continue;
 790         }
 791     }
 792     *newlen = d - dstart;
 793     return d;
 794 }
 795
 796 /* Note: this one is slightly destructive of the source. */
 797
 798 U8*
 799 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 800 {
 801     U8* s = (U8*)p;
 802     U8* send = s + bytelen;
 803     while (s < send) {
 804         U8 tmp = s[0];
 805         s[0] = s[1];
 806         s[1] = tmp;
 807         s += 2;
 808     }
 809     return utf16_to_utf8(p, d, bytelen, newlen);
 810 }
 811
 812 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 813
 814 bool
 815 Perl_is_uni_alnum(pTHX_ UV c)
 816 {
 817     U8 tmpbuf[UTF8_MAXLEN+1];
 818     uvchr_to_utf8(tmpbuf, (UV)c);
 819     return is_utf8_alnum(tmpbuf);
 820 }
 821
 822 bool
 823 Perl_is_uni_alnumc(pTHX_ UV c)
 824 {
 825     U8 tmpbuf[UTF8_MAXLEN+1];
 826     uvchr_to_utf8(tmpbuf, (UV)c);
 827     return is_utf8_alnumc(tmpbuf);
 828 }
 829
 830 bool
 831 Perl_is_uni_idfirst(pTHX_ UV c)
 832 {
 833     U8 tmpbuf[UTF8_MAXLEN+1];
 834     uvchr_to_utf8(tmpbuf, (UV)c);
 835     return is_utf8_idfirst(tmpbuf);
 836 }
 837
 838 bool
 839 Perl_is_uni_alpha(pTHX_ UV c)
 840 {
 841     U8 tmpbuf[UTF8_MAXLEN+1];
 842     uvchr_to_utf8(tmpbuf, (UV)c);
 843     return is_utf8_alpha(tmpbuf);
 844 }
 845
 846 bool
 847 Perl_is_uni_ascii(pTHX_ UV c)
 848 {
 849     U8 tmpbuf[UTF8_MAXLEN+1];
 850     uvchr_to_utf8(tmpbuf, (UV)c);
 851     return is_utf8_ascii(tmpbuf);
 852 }
 853
 854 bool
 855 Perl_is_uni_space(pTHX_ UV c)
 856 {
 857     U8 tmpbuf[UTF8_MAXLEN+1];
 858     uvchr_to_utf8(tmpbuf, (UV)c);
 859     return is_utf8_space(tmpbuf);
 860 }
 861
 862 bool
 863 Perl_is_uni_digit(pTHX_ UV c)
 864 {
 865     U8 tmpbuf[UTF8_MAXLEN+1];
 866     uvchr_to_utf8(tmpbuf, (UV)c);
 867     return is_utf8_digit(tmpbuf);
 868 }
 869
 870 bool
 871 Perl_is_uni_upper(pTHX_ UV c)
 872 {
 873     U8 tmpbuf[UTF8_MAXLEN+1];
 874     uvchr_to_utf8(tmpbuf, (UV)c);
 875     return is_utf8_upper(tmpbuf);
 876 }
 877
 878 bool
 879 Perl_is_uni_lower(pTHX_ UV c)
 880 {
 881     U8 tmpbuf[UTF8_MAXLEN+1];
 882     uvchr_to_utf8(tmpbuf, (UV)c);
 883     return is_utf8_lower(tmpbuf);
 884 }
 885
 886 bool
 887 Perl_is_uni_cntrl(pTHX_ UV c)
 888 {
 889     U8 tmpbuf[UTF8_MAXLEN+1];
 890     uvchr_to_utf8(tmpbuf, (UV)c);
 891     return is_utf8_cntrl(tmpbuf);
 892 }
 893
 894 bool
 895 Perl_is_uni_graph(pTHX_ UV c)
 896 {
 897     U8 tmpbuf[UTF8_MAXLEN+1];
 898     uvchr_to_utf8(tmpbuf, (UV)c);
 899     return is_utf8_graph(tmpbuf);
 900 }
 901
 902 bool
 903 Perl_is_uni_print(pTHX_ UV c)
 904 {
 905     U8 tmpbuf[UTF8_MAXLEN+1];
 906     uvchr_to_utf8(tmpbuf, (UV)c);
 907     return is_utf8_print(tmpbuf);
 908 }
 909
 910 bool
 911 Perl_is_uni_punct(pTHX_ UV c)
 912 {
 913     U8 tmpbuf[UTF8_MAXLEN+1];
 914     uvchr_to_utf8(tmpbuf, (UV)c);
 915     return is_utf8_punct(tmpbuf);
 916 }
 917
 918 bool
 919 Perl_is_uni_xdigit(pTHX_ UV c)
 920 {
 921     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 922     uvchr_to_utf8(tmpbuf, (UV)c);
 923     return is_utf8_xdigit(tmpbuf);
 924 }
 925
 926 UV
 927 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 928 {
 929     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 930     uvchr_to_utf8(tmpbuf, (UV)c);
 931     return to_utf8_upper(tmpbuf, p, lenp);
 932 }
 933
 934 UV
 935 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 936 {
 937     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 938     uvchr_to_utf8(tmpbuf, (UV)c);
 939     return to_utf8_title(tmpbuf, p, lenp);
 940 }
 941
 942 UV
 943 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 944 {
 945     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 946     uvchr_to_utf8(tmpbuf, (UV)c);
 947     return to_utf8_lower(tmpbuf, p, lenp);
 948 }
 949
 950 UV
 951 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
 952 {
 953     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
 954     uvchr_to_utf8(tmpbuf, (UV)c);
 955     return to_utf8_fold(tmpbuf, p, lenp);
 956 }
 957
 958 /* for now these all assume no locale info available for Unicode > 255 */
 959
 960 bool
 961 Perl_is_uni_alnum_lc(pTHX_ UV c)
 962 {
 963     return is_uni_alnum(c);     /* XXX no locale support yet */
 964 }
 965
 966 bool
 967 Perl_is_uni_alnumc_lc(pTHX_ UV c)
 968 {
 969     return is_uni_alnumc(c);    /* XXX no locale support yet */
 970 }
 971
 972 bool
 973 Perl_is_uni_idfirst_lc(pTHX_ UV c)
 974 {
 975     return is_uni_idfirst(c);   /* XXX no locale support yet */
 976 }
 977
 978 bool
 979 Perl_is_uni_alpha_lc(pTHX_ UV c)
 980 {
 981     return is_uni_alpha(c);     /* XXX no locale support yet */
 982 }
 983
 984 bool
 985 Perl_is_uni_ascii_lc(pTHX_ UV c)
 986 {
 987     return is_uni_ascii(c);     /* XXX no locale support yet */
 988 }
 989
 990 bool
 991 Perl_is_uni_space_lc(pTHX_ UV c)
 992 {
 993     return is_uni_space(c);     /* XXX no locale support yet */
 994 }
 995
 996 bool
 997 Perl_is_uni_digit_lc(pTHX_ UV c)
 998 {
 999     return is_uni_digit(c);     /* XXX no locale support yet */
1000 }
1001
1002 bool
1003 Perl_is_uni_upper_lc(pTHX_ UV c)
1004 {
1005     return is_uni_upper(c);     /* XXX no locale support yet */
1006 }
1007
1008 bool
1009 Perl_is_uni_lower_lc(pTHX_ UV c)
1010 {
1011     return is_uni_lower(c);     /* XXX no locale support yet */
1012 }
1013
1014 bool
1015 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1016 {
1017     return is_uni_cntrl(c);     /* XXX no locale support yet */
1018 }
1019
1020 bool
1021 Perl_is_uni_graph_lc(pTHX_ UV c)
1022 {
1023     return is_uni_graph(c);     /* XXX no locale support yet */
1024 }
1025
1026 bool
1027 Perl_is_uni_print_lc(pTHX_ UV c)
1028 {
1029     return is_uni_print(c);     /* XXX no locale support yet */
1030 }
1031
1032 bool
1033 Perl_is_uni_punct_lc(pTHX_ UV c)
1034 {
1035     return is_uni_punct(c);     /* XXX no locale support yet */
1036 }
1037
1038 bool
1039 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1040 {
1041     return is_uni_xdigit(c);    /* XXX no locale support yet */
1042 }
1043
1044 bool
1045 Perl_is_utf8_alnum(pTHX_ U8 *p)
1046 {
1047     if (!is_utf8_char(p))
1048         return FALSE;
1049     if (!PL_utf8_alnum)
1050         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1051          * descendant of isalnum(3), in other words, it doesn't
1052          * contain the '_'. --jhi */
1053         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1054     return swash_fetch(PL_utf8_alnum, p, TRUE);
1055 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1056 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1057     if (!PL_utf8_alnum)
1058         PL_utf8_alnum = swash_init("utf8", "",
1059             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1060     return swash_fetch(PL_utf8_alnum, p, TRUE);
1061 #endif
1062 }
1063
1064 bool
1065 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1066 {
1067     if (!is_utf8_char(p))
1068         return FALSE;
1069     if (!PL_utf8_alnum)
1070         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1071     return swash_fetch(PL_utf8_alnum, p, TRUE);
1072 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1073 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1074     if (!PL_utf8_alnum)
1075         PL_utf8_alnum = swash_init("utf8", "",
1076             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1077     return swash_fetch(PL_utf8_alnum, p, TRUE);
1078 #endif
1079 }
1080
1081 bool
1082 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1083 {
1084     return *p == '_' || is_utf8_alpha(p);
1085 }
1086
1087 bool
1088 Perl_is_utf8_alpha(pTHX_ U8 *p)
1089 {
1090     if (!is_utf8_char(p))
1091         return FALSE;
1092     if (!PL_utf8_alpha)
1093         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1094     return swash_fetch(PL_utf8_alpha, p, TRUE);
1095 }
1096
1097 bool
1098 Perl_is_utf8_ascii(pTHX_ U8 *p)
1099 {
1100     if (!is_utf8_char(p))
1101         return FALSE;
1102     if (!PL_utf8_ascii)
1103         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1104     return swash_fetch(PL_utf8_ascii, p, TRUE);
1105 }
1106
1107 bool
1108 Perl_is_utf8_space(pTHX_ U8 *p)
1109 {
1110     if (!is_utf8_char(p))
1111         return FALSE;
1112     if (!PL_utf8_space)
1113         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1114     return swash_fetch(PL_utf8_space, p, TRUE);
1115 }
1116
1117 bool
1118 Perl_is_utf8_digit(pTHX_ U8 *p)
1119 {
1120     if (!is_utf8_char(p))
1121         return FALSE;
1122     if (!PL_utf8_digit)
1123         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1124     return swash_fetch(PL_utf8_digit, p, TRUE);
1125 }
1126
1127 bool
1128 Perl_is_utf8_upper(pTHX_ U8 *p)
1129 {
1130     if (!is_utf8_char(p))
1131         return FALSE;
1132     if (!PL_utf8_upper)
1133         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1134     return swash_fetch(PL_utf8_upper, p, TRUE);
1135 }
1136
1137 bool
1138 Perl_is_utf8_lower(pTHX_ U8 *p)
1139 {
1140     if (!is_utf8_char(p))
1141         return FALSE;
1142     if (!PL_utf8_lower)
1143         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1144     return swash_fetch(PL_utf8_lower, p, TRUE);
1145 }
1146
1147 bool
1148 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1149 {
1150     if (!is_utf8_char(p))
1151         return FALSE;
1152     if (!PL_utf8_cntrl)
1153         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1154     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1155 }
1156
1157 bool
1158 Perl_is_utf8_graph(pTHX_ U8 *p)
1159 {
1160     if (!is_utf8_char(p))
1161         return FALSE;
1162     if (!PL_utf8_graph)
1163         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1164     return swash_fetch(PL_utf8_graph, p, TRUE);
1165 }
1166
1167 bool
1168 Perl_is_utf8_print(pTHX_ U8 *p)
1169 {
1170     if (!is_utf8_char(p))
1171         return FALSE;
1172     if (!PL_utf8_print)
1173         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1174     return swash_fetch(PL_utf8_print, p, TRUE);
1175 }
1176
1177 bool
1178 Perl_is_utf8_punct(pTHX_ U8 *p)
1179 {
1180     if (!is_utf8_char(p))
1181         return FALSE;
1182     if (!PL_utf8_punct)
1183         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1184     return swash_fetch(PL_utf8_punct, p, TRUE);
1185 }
1186
1187 bool
1188 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1189 {
1190     if (!is_utf8_char(p))
1191         return FALSE;
1192     if (!PL_utf8_xdigit)
1193         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1194     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1195 }
1196
1197 bool
1198 Perl_is_utf8_mark(pTHX_ U8 *p)
1199 {
1200     if (!is_utf8_char(p))
1201         return FALSE;
1202     if (!PL_utf8_mark)
1203         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1204     return swash_fetch(PL_utf8_mark, p, TRUE);
1205 }
1206
1207 /*
1208 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1209
1210 The "p" contains the pointer to the UTF-8 string encoding
1211 the character that is being converted.
1212
1213 The "ustrp" is a pointer to the character buffer to put the
1214 conversion result to.  The "lenp" is a pointer to the length
1215 of the result.
1216
1217 The "swash" is a pointer to the swash to use.
1218
1219 The "normal" is a string like "ToLower" which means the swash
1220 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1221 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1222
1223 The "special" is a string like "utf8::ToSpecLower", which means
1224 the hash %utf8::ToSpecLower, which is stored in the same file,
1225 lib/unicore/To/Lower.pl, and also loaded by SWASHGET.  The access
1226 to the hash is by Perl_to_utf8_case().
1227
1228 =cut
1229  */
1230
1231 UV
1232 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
1233 {
1234     UV uv;
1235
1236     if (!*swashp)
1237         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1238     uv = swash_fetch(*swashp, p, TRUE);
1239     if (uv)
1240          uv = UNI_TO_NATIVE(uv);
1241     else {
1242          HV *hv;
1243          SV *keysv;
1244          HE *he;
1245
1246          uv = utf8_to_uvchr(p, 0);
1247
1248          if ((hv    = get_hv(special, FALSE)) &&
1249              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1250              (he    = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1251               SV *val = HeVAL(he);
1252               char *s = SvPV(val, *lenp);
1253               U8 c = *(U8*)s;
1254               if (*lenp > 1 || UNI_IS_INVARIANT(c))
1255                    Copy(s, ustrp, *lenp, U8);
1256               else {
1257                    /* something in the 0x80..0xFF range */
1258                    ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1259                    ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1260                    *lenp = 2;
1261               }
1262               return 0;
1263          }
1264     }
1265     if (lenp)
1266        *lenp = UNISKIP(uv);
1267     uvuni_to_utf8(ustrp, uv);
1268     return uv;
1269 }
1270
1271 UV
1272 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1273 {
1274     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1275                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1276 }
1277
1278 UV
1279 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1280 {
1281     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1282                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1283 }
1284
1285 UV
1286 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1287 {
1288     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1289                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1290 }
1291
1292 UV
1293 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1294 {
1295     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1296                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1297 }
1298
1299 /* a "swash" is a swatch hash */
1300
1301 SV*
1302 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1303 {
1304     SV* retval;
1305     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1306     dSP;
1307     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1308     SV* errsv_save;
1309
1310     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1311         ENTER;
1312         errsv_save = newSVsv(ERRSV);
1313         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1314         if (!SvTRUE(ERRSV))
1315             sv_setsv(ERRSV, errsv_save);
1316         SvREFCNT_dec(errsv_save);
1317         LEAVE;
1318     }
1319     SPAGAIN;
1320     PUSHSTACKi(PERLSI_MAGIC);
1321     PUSHMARK(SP);
1322     EXTEND(SP,5);
1323     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1324     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1325     PUSHs(listsv);
1326     PUSHs(sv_2mortal(newSViv(minbits)));
1327     PUSHs(sv_2mortal(newSViv(none)));
1328     PUTBACK;
1329     ENTER;
1330     SAVEI32(PL_hints);
1331     PL_hints = 0;
1332     save_re_context();
1333     if (PL_curcop == &PL_compiling)
1334         /* XXX ought to be handled by lex_start */
1335         sv_setpv(tokenbufsv, PL_tokenbuf);
1336     errsv_save = newSVsv(ERRSV);
1337     if (call_method("SWASHNEW", G_SCALAR))
1338         retval = newSVsv(*PL_stack_sp--);
1339     else
1340         retval = &PL_sv_undef;
1341     if (!SvTRUE(ERRSV))
1342         sv_setsv(ERRSV, errsv_save);
1343     SvREFCNT_dec(errsv_save);
1344     LEAVE;
1345     POPSTACK;
1346     if (PL_curcop == &PL_compiling) {
1347         STRLEN len;
1348         char* pv = SvPV(tokenbufsv, len);
1349
1350         Copy(pv, PL_tokenbuf, len+1, char);
1351         PL_curcop->op_private = PL_hints;
1352     }
1353     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1354         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1355     return retval;
1356 }
1357
1358
1359 /* This API is wrong for special case conversions since we may need to
1360  * return several Unicode characters for a single Unicode character
1361  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1362  * the lower-level routine, and it is similarly broken for returning
1363  * multiple values.  --jhi */
1364 UV
1365 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1366 {
1367     HV* hv = (HV*)SvRV(sv);
1368     U32 klen;
1369     U32 off;
1370     STRLEN slen;
1371     STRLEN needents;
1372     U8 *tmps = NULL;
1373     U32 bit;
1374     SV *retval;
1375     U8 tmputf8[2];
1376     UV c = NATIVE_TO_ASCII(*ptr);
1377
1378     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1379         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1380         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1381         ptr = tmputf8;
1382     }
1383     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1384      * then the "swatch" is a vec() for al the chars which start
1385      * with 0xAA..0xYY
1386      * So the key in the hash (klen) is length of encoded char -1
1387      */
1388     klen = UTF8SKIP(ptr) - 1;
1389     off  = ptr[klen];
1390
1391     if (klen == 0)
1392      {
1393       /* If char in invariant then swatch is for all the invariant chars
1394        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1395        */
1396       needents = UTF_CONTINUATION_MARK;
1397       off      = NATIVE_TO_UTF(ptr[klen]);
1398      }
1399     else
1400      {
1401       /* If char is encoded then swatch is for the prefix */
1402       needents = (1 << UTF_ACCUMULATION_SHIFT);
1403       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1404      }
1405
1406     /*
1407      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1408      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1409      * it's nothing to sniff at.)  Pity we usually come through at least
1410      * two function calls to get here...
1411      *
1412      * NB: this code assumes that swatches are never modified, once generated!
1413      */
1414
1415     if (hv   == PL_last_swash_hv &&
1416         klen == PL_last_swash_klen &&
1417         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1418     {
1419         tmps = PL_last_swash_tmps;
1420         slen = PL_last_swash_slen;
1421     }
1422     else {
1423         /* Try our second-level swatch cache, kept in a hash. */
1424         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1425
1426         /* If not cached, generate it via utf8::SWASHGET */
1427         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1428             dSP;
1429             /* We use utf8n_to_uvuni() as we want an index into
1430                Unicode tables, not a native character number.
1431              */
1432             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1433             SV *errsv_save;
1434             ENTER;
1435             SAVETMPS;
1436             save_re_context();
1437             PUSHSTACKi(PERLSI_MAGIC);
1438             PUSHMARK(SP);
1439             EXTEND(SP,3);
1440             PUSHs((SV*)sv);
1441             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1442             PUSHs(sv_2mortal(newSViv((klen) ?
1443                                      (code_point & ~(needents - 1)) : 0)));
1444             PUSHs(sv_2mortal(newSViv(needents)));
1445             PUTBACK;
1446             errsv_save = newSVsv(ERRSV);
1447             if (call_method("SWASHGET", G_SCALAR))
1448                 retval = newSVsv(*PL_stack_sp--);
1449             else
1450                 retval = &PL_sv_undef;
1451             if (!SvTRUE(ERRSV))
1452                 sv_setsv(ERRSV, errsv_save);
1453             SvREFCNT_dec(errsv_save);
1454             POPSTACK;
1455             FREETMPS;
1456             LEAVE;
1457             if (PL_curcop == &PL_compiling)
1458                 PL_curcop->op_private = PL_hints;
1459
1460             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1461
1462             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1463                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1464         }
1465
1466         PL_last_swash_hv = hv;
1467         PL_last_swash_klen = klen;
1468         PL_last_swash_tmps = tmps;
1469         PL_last_swash_slen = slen;
1470         if (klen)
1471             Copy(ptr, PL_last_swash_key, klen, U8);
1472     }
1473
1474     switch ((int)((slen << 3) / needents)) {
1475     case 1:
1476         bit = 1 << (off & 7);
1477         off >>= 3;
1478         return (tmps[off] & bit) != 0;
1479     case 8:
1480         return tmps[off];
1481     case 16:
1482         off <<= 1;
1483         return (tmps[off] << 8) + tmps[off + 1] ;
1484     case 32:
1485         off <<= 2;
1486         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1487     }
1488     Perl_croak(aTHX_ "panic: swash_fetch");
1489     return 0;
1490 }
1491
1492
1493 /*
1494 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1495
1496 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1497 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1498 bytes available. The return value is the pointer to the byte after the
1499 end of the new character. In other words,
1500
1501     d = uvchr_to_utf8(d, uv);
1502
1503 is the recommended wide native character-aware way of saying
1504
1505     *(d++) = uv;
1506
1507 =cut
1508 */
1509
1510 /* On ASCII machines this is normally a macro but we want a
1511    real function in case XS code wants it
1512 */
1513 #undef Perl_uvchr_to_utf8
1514 U8 *
1515 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1516 {
1517     return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1518 }
1519
1520
1521 /*
1522 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1523
1524 Returns the native character value of the first character in the string C<s>
1525 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1526 length, in bytes, of that character.
1527
1528 Allows length and flags to be passed to low level routine.
1529
1530 =cut
1531 */
1532 /* On ASCII machines this is normally a macro but we want a
1533    real function in case XS code wants it
1534 */
1535 #undef Perl_utf8n_to_uvchr
1536 UV
1537 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1538 {
1539     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1540     return UNI_TO_NATIVE(uv);
1541 }
1542
1543 /*
1544 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1545
1546 Build to the scalar dsv a displayable version of the string spv,
1547 length len, the displayable version being at most pvlim bytes long
1548 (if longer, the rest is truncated and "..." will be appended).
1549 The flags argument is currently unused but available for future extensions.
1550 The pointer to the PV of the dsv is returned.
1551
1552 =cut */
1553 char *
1554 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1555 {
1556     int truncated = 0;
1557     char *s, *e;
1558
1559     sv_setpvn(dsv, "", 0);
1560     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1561          UV u;
1562          if (pvlim && SvCUR(dsv) >= pvlim) {
1563               truncated++;
1564               break;
1565          }
1566          u = utf8_to_uvchr((U8*)s, 0);
1567          Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1568     }
1569     if (truncated)
1570          sv_catpvn(dsv, "...", 3);
1571
1572     return SvPVX(dsv);
1573 }
1574
1575 /*
1576 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1577
1578 Build to the scalar dsv a displayable version of the scalar sv,
1579 he displayable version being at most pvlim bytes long
1580 (if longer, the rest is truncated and "..." will be appended).
1581 The flags argument is currently unused but available for future extensions.
1582 The pointer to the PV of the dsv is returned.
1583
1584 =cut */
1585 char *
1586 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1587 {
1588      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1589                                 pvlim, flags);
1590 }
1591
1592 /*
1593 =for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1594
1595 Return true if the strings s1 and s2 differ case-insensitively, false
1596 if not (if they are equal case-insensitively).  If u1 is true, the
1597 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1598 the string s2 is assumed to be in UTF-8-encoded Unicode.
1599
1600 For case-insensitiveness, the "casefolding" of Unicode is used
1601 instead of upper/lowercasing both the characters, see
1602 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1603
1604 =cut */
1605 I32
1606 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1607 {
1608      register U8 *a  = (U8*)s1;
1609      register U8 *b  = (U8*)s2;
1610      register U8 *ae = b + len1;
1611      register U8 *be = b + len2;
1612      STRLEN la, lb;
1613      UV ca, cb;
1614      STRLEN ulen1, ulen2;
1615      U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1616      U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
1617
1618      while (a < ae && b < be) {
1619           if (u1) {
1620                if (a + UTF8SKIP(a) > ae)
1621                     break;
1622                ca = utf8_to_uvchr((U8*)a, &la);
1623           } else {
1624                ca = *a;
1625                la = 1;
1626           }
1627           if (u2) {
1628                if (b + UTF8SKIP(b) > be)
1629                     break;
1630                cb = utf8_to_uvchr((U8*)b, &lb);
1631           } else {
1632                cb = *b;
1633                lb = 1;
1634           }
1635           if (ca != cb) {
1636                if (u1)
1637                     to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1638                else
1639                     ulen1 = 1;
1640                if (u2)
1641                     to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1642                else
1643                     ulen2 = 1;
1644                if (ulen1 != ulen2
1645                    || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1646                    || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1647                     return 1; /* mismatch */
1648           }
1649           a += la;
1650           b += lb;
1651      }
1652      return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */
1653 }
1654