utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (ckWARN_d(WARN_UTF8)) {
  50          if (UNICODE_IS_SURROGATE(uv))
  51               Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
  52          else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
  53                   (uv == 0xFFFE || uv == 0xFFFF))
  54               Perl_warner(aTHX_ WARN_UTF8,
  55                          "Unicode character 0x%04"UVxf" is illegal", uv);
  56     }
  57     if (UNI_IS_INVARIANT(uv)) {
  58         *d++ = UTF_TO_NATIVE(uv);
  59         return d;
  60     }
  61 #if defined(EBCDIC)
  62     else {
  63         STRLEN len  = UNISKIP(uv);
  64         U8 *p = d+len-1;
  65         while (p > d) {
  66             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  67             uv >>= UTF_ACCUMULATION_SHIFT;
  68         }
  69         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  70         return d+len;
  71     }
  72 #else /* Non loop style */
  73     if (uv < 0x800) {
  74         *d++ = (( uv >>  6)         | 0xc0);
  75         *d++ = (( uv        & 0x3f) | 0x80);
  76         return d;
  77     }
  78     if (uv < 0x10000) {
  79         *d++ = (( uv >> 12)         | 0xe0);
  80         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  81         *d++ = (( uv        & 0x3f) | 0x80);
  82         return d;
  83     }
  84     if (uv < 0x200000) {
  85         *d++ = (( uv >> 18)         | 0xf0);
  86         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  87         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  88         *d++ = (( uv        & 0x3f) | 0x80);
  89         return d;
  90     }
  91     if (uv < 0x4000000) {
  92         *d++ = (( uv >> 24)         | 0xf8);
  93         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  95         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  96         *d++ = (( uv        & 0x3f) | 0x80);
  97         return d;
  98     }
  99     if (uv < 0x80000000) {
 100         *d++ = (( uv >> 30)         | 0xfc);
 101         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 102         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 103         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 104         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 105         *d++ = (( uv        & 0x3f) | 0x80);
 106         return d;
 107     }
 108 #ifdef HAS_QUAD
 109     if (uv < UTF8_QUAD_MAX)
 110 #endif
 111     {
 112         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 113         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 114         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 115         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 116         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 117         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 118         *d++ = (( uv        & 0x3f) | 0x80);
 119         return d;
 120     }
 121 #ifdef HAS_QUAD
 122     {
 123         *d++ =                        0xff;     /* Can't match U+FFFE! */
 124         *d++ =                        0x80;     /* 6 Reserved bits */
 125         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 126         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 127         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 128         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 129         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 130         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 131         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 132         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 133         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 134         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 135         *d++ = (( uv        & 0x3f) | 0x80);
 136         return d;
 137     }
 138 #endif
 139 #endif /* Loop style */
 140 }
 141
 142
 143
 144 /*
 145 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 146
 147 Tests if some arbitrary number of bytes begins in a valid UTF-8
 148 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 149 The actual number of bytes in the UTF-8 character will be returned if
 150 it is valid, otherwise 0.
 151
 152 =cut
 153 */
 154 STRLEN
 155 Perl_is_utf8_char(pTHX_ U8 *s)
 156 {
 157     U8 u = *s;
 158     STRLEN slen, len;
 159     UV uv, ouv;
 160
 161     if (UTF8_IS_INVARIANT(u))
 162         return 1;
 163
 164     if (!UTF8_IS_START(u))
 165         return 0;
 166
 167     len = UTF8SKIP(s);
 168
 169     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 170         return 0;
 171
 172     slen = len - 1;
 173     s++;
 174     u &= UTF_START_MASK(len);
 175     uv  = u;
 176     ouv = uv;
 177     while (slen--) {
 178         if (!UTF8_IS_CONTINUATION(*s))
 179             return 0;
 180         uv = UTF8_ACCUMULATE(uv, *s);
 181         if (uv < ouv)
 182             return 0;
 183         ouv = uv;
 184         s++;
 185     }
 186
 187     if (UNISKIP(uv) < len)
 188         return 0;
 189
 190     return len;
 191 }
 192
 193 /*
 194 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 195
 196 Returns true if first C<len> bytes of the given string form a valid UTF8
 197 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 198 'a string that contains UTF8' because a valid ASCII string is a valid
 199 UTF8 string.
 200
 201 =cut
 202 */
 203
 204 bool
 205 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 206 {
 207     U8* x = s;
 208     U8* send;
 209     STRLEN c;
 210
 211     if (!len)
 212         len = strlen((char *)s);
 213     send = s + len;
 214
 215     while (x < send) {
 216         c = is_utf8_char(x);
 217         if (!c)
 218             return FALSE;
 219         x += c;
 220     }
 221     if (x != send)
 222         return FALSE;
 223
 224     return TRUE;
 225 }
 226
 227 /*
 228 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 229
 230 Bottom level UTF-8 decode routine.
 231 Returns the unicode code point value of the first character in the string C<s>
 232 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 233 C<retlen> will be set to the length, in bytes, of that character.
 234
 235 If C<s> does not point to a well-formed UTF8 character, the behaviour
 236 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 237 it is assumed that the caller will raise a warning, and this function
 238 will silently just set C<retlen> to C<-1> and return zero.  If the
 239 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 240 malformations will be given, C<retlen> will be set to the expected
 241 length of the UTF-8 character in bytes, and zero will be returned.
 242
 243 The C<flags> can also contain various flags to allow deviations from
 244 the strict UTF-8 encoding (see F<utf8.h>).
 245
 246 Most code should use utf8_to_uvchr() rather than call this directly.
 247
 248 =cut
 249 */
 250
 251 UV
 252 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 253 {
 254     U8 *s0 = s;
 255     UV uv = *s, ouv = 0;
 256     STRLEN len = 1;
 257     bool dowarn = ckWARN_d(WARN_UTF8);
 258     UV startbyte = *s;
 259     STRLEN expectlen = 0;
 260     U32 warning = 0;
 261
 262 /* This list is a superset of the UTF8_ALLOW_XXX. */
 263
 264 #define UTF8_WARN_EMPTY                          1
 265 #define UTF8_WARN_CONTINUATION                   2
 266 #define UTF8_WARN_NON_CONTINUATION               3
 267 #define UTF8_WARN_FE_FF                          4
 268 #define UTF8_WARN_SHORT                          5
 269 #define UTF8_WARN_OVERFLOW                       6
 270 #define UTF8_WARN_SURROGATE                      7
 271 #define UTF8_WARN_BOM                            8
 272 #define UTF8_WARN_LONG                           9
 273 #define UTF8_WARN_FFFF                          10
 274
 275     if (curlen == 0 &&
 276         !(flags & UTF8_ALLOW_EMPTY)) {
 277         warning = UTF8_WARN_EMPTY;
 278         goto malformed;
 279     }
 280
 281     if (UTF8_IS_INVARIANT(uv)) {
 282         if (retlen)
 283             *retlen = 1;
 284         return (UV) (NATIVE_TO_UTF(*s));
 285     }
 286
 287     if (UTF8_IS_CONTINUATION(uv) &&
 288         !(flags & UTF8_ALLOW_CONTINUATION)) {
 289         warning = UTF8_WARN_CONTINUATION;
 290         goto malformed;
 291     }
 292
 293     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 294         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 295         warning = UTF8_WARN_NON_CONTINUATION;
 296         goto malformed;
 297     }
 298
 299 #ifdef EBCDIC
 300     uv = NATIVE_TO_UTF(uv);
 301 #else
 302     if ((uv == 0xfe || uv == 0xff) &&
 303         !(flags & UTF8_ALLOW_FE_FF)) {
 304         warning = UTF8_WARN_FE_FF;
 305         goto malformed;
 306     }
 307 #endif
 308
 309     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 310     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 311     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 312     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 313 #ifdef EBCDIC
 314     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 315     else                        { len =  7; uv &= 0x01; }
 316 #else
 317     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 318     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 319     else                        { len = 13; uv = 0; } /* whoa! */
 320 #endif
 321
 322     if (retlen)
 323         *retlen = len;
 324
 325     expectlen = len;
 326
 327     if ((curlen < expectlen) &&
 328         !(flags & UTF8_ALLOW_SHORT)) {
 329         warning = UTF8_WARN_SHORT;
 330         goto malformed;
 331     }
 332
 333     len--;
 334     s++;
 335     ouv = uv;
 336
 337     while (len--) {
 338         if (!UTF8_IS_CONTINUATION(*s) &&
 339             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 340             s--;
 341             warning = UTF8_WARN_NON_CONTINUATION;
 342             goto malformed;
 343         }
 344         else
 345             uv = UTF8_ACCUMULATE(uv, *s);
 346         if (!(uv > ouv)) {
 347             /* These cannot be allowed. */
 348             if (uv == ouv) {
 349                 if (!(flags & UTF8_ALLOW_LONG)) {
 350                     warning = UTF8_WARN_LONG;
 351                     goto malformed;
 352                 }
 353             }
 354             else { /* uv < ouv */
 355                 /* This cannot be allowed. */
 356                 warning = UTF8_WARN_OVERFLOW;
 357                 goto malformed;
 358             }
 359         }
 360         s++;
 361         ouv = uv;
 362     }
 363
 364     if (UNICODE_IS_SURROGATE(uv) &&
 365         !(flags & UTF8_ALLOW_SURROGATE)) {
 366         warning = UTF8_WARN_SURROGATE;
 367         goto malformed;
 368     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 369                !(flags & UTF8_ALLOW_BOM)) {
 370         warning = UTF8_WARN_BOM;
 371         goto malformed;
 372     } else if ((expectlen > UNISKIP(uv)) &&
 373                !(flags & UTF8_ALLOW_LONG)) {
 374         warning = UTF8_WARN_LONG;
 375         goto malformed;
 376     } else if (UNICODE_IS_ILLEGAL(uv) &&
 377                !(flags & UTF8_ALLOW_FFFF)) {
 378         warning = UTF8_WARN_FFFF;
 379         goto malformed;
 380     }
 381
 382     return uv;
 383
 384 malformed:
 385
 386     if (flags & UTF8_CHECK_ONLY) {
 387         if (retlen)
 388             *retlen = -1;
 389         return 0;
 390     }
 391
 392     if (dowarn) {
 393         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 394
 395         switch (warning) {
 396         case 0: /* Intentionally empty. */ break;
 397         case UTF8_WARN_EMPTY:
 398             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 399             break;
 400         case UTF8_WARN_CONTINUATION:
 401             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 402             break;
 403         case UTF8_WARN_NON_CONTINUATION:
 404             if (s == s0)
 405                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 406                            (UV)s[1], startbyte);
 407             else
 408                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 409                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 410
 411             break;
 412         case UTF8_WARN_FE_FF:
 413             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 414             break;
 415         case UTF8_WARN_SHORT:
 416             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 417                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 418             expectlen = curlen;         /* distance for caller to skip */
 419             break;
 420         case UTF8_WARN_OVERFLOW:
 421             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 422                            ouv, *s, startbyte);
 423             break;
 424         case UTF8_WARN_SURROGATE:
 425             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 426             break;
 427         case UTF8_WARN_BOM:
 428             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 429             break;
 430         case UTF8_WARN_LONG:
 431             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 432                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 433             break;
 434         case UTF8_WARN_FFFF:
 435             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 436             break;
 437         default:
 438             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 439             break;
 440         }
 441
 442         if (warning) {
 443             char *s = SvPVX(sv);
 444
 445             if (PL_op)
 446                 Perl_warner(aTHX_ WARN_UTF8,
 447                             "%s in %s", s,  OP_DESC(PL_op));
 448             else
 449                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 450         }
 451     }
 452
 453     if (retlen)
 454         *retlen = expectlen ? expectlen : len;
 455
 456     return 0;
 457 }
 458
 459 /*
 460 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 461
 462 Returns the native character value of the first character in the string C<s>
 463 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 464 length, in bytes, of that character.
 465
 466 If C<s> does not point to a well-formed UTF8 character, zero is
 467 returned and retlen is set, if possible, to -1.
 468
 469 =cut
 470 */
 471
 472 UV
 473 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 474 {
 475     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 476 }
 477
 478 /*
 479 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 480
 481 Returns the Unicode code point of the first character in the string C<s>
 482 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 483 length, in bytes, of that character.
 484
 485 This function should only be used when returned UV is considered
 486 an index into the Unicode semantic tables (e.g. swashes).
 487
 488 If C<s> does not point to a well-formed UTF8 character, zero is
 489 returned and retlen is set, if possible, to -1.
 490
 491 =cut
 492 */
 493
 494 UV
 495 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 496 {
 497     /* Call the low level routine asking for checks */
 498     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 499 }
 500
 501 /*
 502 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 503
 504 Return the length of the UTF-8 char encoded string C<s> in characters.
 505 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 506 up past C<e>, croaks.
 507
 508 =cut
 509 */
 510
 511 STRLEN
 512 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 513 {
 514     STRLEN len = 0;
 515
 516     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 517      * the bitops (especially ~) can create illegal UTF-8.
 518      * In other words: in Perl UTF-8 is not just for Unicode. */
 519
 520     if (e < s)
 521         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 522     while (s < e) {
 523         U8 t = UTF8SKIP(s);
 524
 525         if (e - s < t)
 526             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 527         s += t;
 528         len++;
 529     }
 530
 531     return len;
 532 }
 533
 534 /*
 535 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 536
 537 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 538 and C<b>.
 539
 540 WARNING: use only if you *know* that the pointers point inside the
 541 same UTF-8 buffer.
 542
 543 =cut
 544 */
 545
 546 IV
 547 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 548 {
 549     IV off = 0;
 550
 551     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 552      * the bitops (especially ~) can create illegal UTF-8.
 553      * In other words: in Perl UTF-8 is not just for Unicode. */
 554
 555     if (a < b) {
 556         while (a < b) {
 557             U8 c = UTF8SKIP(a);
 558
 559             if (b - a < c)
 560                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 561             a += c;
 562             off--;
 563         }
 564     }
 565     else {
 566         while (b < a) {
 567             U8 c = UTF8SKIP(b);
 568
 569             if (a - b < c)
 570                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 571             b += c;
 572             off++;
 573         }
 574     }
 575
 576     return off;
 577 }
 578
 579 /*
 580 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 581
 582 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 583 forward or backward.
 584
 585 WARNING: do not use the following unless you *know* C<off> is within
 586 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 587 on the first byte of character or just after the last byte of a character.
 588
 589 =cut
 590 */
 591
 592 U8 *
 593 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 594 {
 595     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 596      * the bitops (especially ~) can create illegal UTF-8.
 597      * In other words: in Perl UTF-8 is not just for Unicode. */
 598
 599     if (off >= 0) {
 600         while (off--)
 601             s += UTF8SKIP(s);
 602     }
 603     else {
 604         while (off++) {
 605             s--;
 606             while (UTF8_IS_CONTINUATION(*s))
 607                 s--;
 608         }
 609     }
 610     return s;
 611 }
 612
 613 /*
 614 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 615
 616 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 617 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 618 updates len to contain the new length.
 619 Returns zero on failure, setting C<len> to -1.
 620
 621 =cut
 622 */
 623
 624 U8 *
 625 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 626 {
 627     U8 *send;
 628     U8 *d;
 629     U8 *save = s;
 630
 631     /* ensure valid UTF8 and chars < 256 before updating string */
 632     for (send = s + *len; s < send; ) {
 633         U8 c = *s++;
 634
 635         if (!UTF8_IS_INVARIANT(c) &&
 636             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 637              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 638             *len = -1;
 639             return 0;
 640         }
 641     }
 642
 643     d = s = save;
 644     while (s < send) {
 645         STRLEN ulen;
 646         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 647         s += ulen;
 648     }
 649     *d = '\0';
 650     *len = d - save;
 651     return save;
 652 }
 653
 654 /*
 655 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 656
 657 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 658 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 659 the newly-created string, and updates C<len> to contain the new
 660 length.  Returns the original string if no conversion occurs, C<len>
 661 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 662 0 if C<s> is converted or contains all 7bit characters.
 663
 664 =cut
 665 */
 666
 667 U8 *
 668 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 669 {
 670     U8 *d;
 671     U8 *start = s;
 672     U8 *send;
 673     I32 count = 0;
 674
 675     if (!*is_utf8)
 676         return start;
 677
 678     /* ensure valid UTF8 and chars < 256 before converting string */
 679     for (send = s + *len; s < send;) {
 680         U8 c = *s++;
 681         if (!UTF8_IS_INVARIANT(c)) {
 682             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 683                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 684                 count++;
 685             else
 686                 return start;
 687         }
 688     }
 689
 690     *is_utf8 = 0;
 691
 692     Newz(801, d, (*len) - count + 1, U8);
 693     s = start; start = d;
 694     while (s < send) {
 695         U8 c = *s++;
 696         if (!UTF8_IS_INVARIANT(c)) {
 697             /* Then it is two-byte encoded */
 698             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 699             c = ASCII_TO_NATIVE(c);
 700         }
 701         *d++ = c;
 702     }
 703     *d = '\0';
 704     *len = d - start;
 705     return start;
 706 }
 707
 708 /*
 709 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 710
 711 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 712 Returns a pointer to the newly-created string, and sets C<len> to
 713 reflect the new length.
 714
 715 =cut
 716 */
 717
 718 U8*
 719 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 720 {
 721     U8 *send;
 722     U8 *d;
 723     U8 *dst;
 724     send = s + (*len);
 725
 726     Newz(801, d, (*len) * 2 + 1, U8);
 727     dst = d;
 728
 729     while (s < send) {
 730         UV uv = NATIVE_TO_ASCII(*s++);
 731         if (UNI_IS_INVARIANT(uv))
 732             *d++ = UTF_TO_NATIVE(uv);
 733         else {
 734             *d++ = UTF8_EIGHT_BIT_HI(uv);
 735             *d++ = UTF8_EIGHT_BIT_LO(uv);
 736         }
 737     }
 738     *d = '\0';
 739     *len = d-dst;
 740     return dst;
 741 }
 742
 743 /*
 744  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 745  *
 746  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 747  * We optimize for native, for obvious reasons. */
 748
 749 U8*
 750 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 751 {
 752     U8* pend;
 753     U8* dstart = d;
 754
 755     if (bytelen & 1)
 756         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 757
 758     pend = p + bytelen;
 759
 760     while (p < pend) {
 761         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 762         p += 2;
 763         if (uv < 0x80) {
 764             *d++ = uv;
 765             continue;
 766         }
 767         if (uv < 0x800) {
 768             *d++ = (( uv >>  6)         | 0xc0);
 769             *d++ = (( uv        & 0x3f) | 0x80);
 770             continue;
 771         }
 772         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 773             UV low = *p++;
 774             if (low < 0xdc00 || low >= 0xdfff)
 775                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 776             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 777         }
 778         if (uv < 0x10000) {
 779             *d++ = (( uv >> 12)         | 0xe0);
 780             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 781             *d++ = (( uv        & 0x3f) | 0x80);
 782             continue;
 783         }
 784         else {
 785             *d++ = (( uv >> 18)         | 0xf0);
 786             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 787             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 788             *d++ = (( uv        & 0x3f) | 0x80);
 789             continue;
 790         }
 791     }
 792     *newlen = d - dstart;
 793     return d;
 794 }
 795
 796 /* Note: this one is slightly destructive of the source. */
 797
 798 U8*
 799 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 800 {
 801     U8* s = (U8*)p;
 802     U8* send = s + bytelen;
 803     while (s < send) {
 804         U8 tmp = s[0];
 805         s[0] = s[1];
 806         s[1] = tmp;
 807         s += 2;
 808     }
 809     return utf16_to_utf8(p, d, bytelen, newlen);
 810 }
 811
 812 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 813
 814 bool
 815 Perl_is_uni_alnum(pTHX_ UV c)
 816 {
 817     U8 tmpbuf[UTF8_MAXLEN+1];
 818     uvchr_to_utf8(tmpbuf, (UV)c);
 819     return is_utf8_alnum(tmpbuf);
 820 }
 821
 822 bool
 823 Perl_is_uni_alnumc(pTHX_ UV c)
 824 {
 825     U8 tmpbuf[UTF8_MAXLEN+1];
 826     uvchr_to_utf8(tmpbuf, (UV)c);
 827     return is_utf8_alnumc(tmpbuf);
 828 }
 829
 830 bool
 831 Perl_is_uni_idfirst(pTHX_ UV c)
 832 {
 833     U8 tmpbuf[UTF8_MAXLEN+1];
 834     uvchr_to_utf8(tmpbuf, (UV)c);
 835     return is_utf8_idfirst(tmpbuf);
 836 }
 837
 838 bool
 839 Perl_is_uni_alpha(pTHX_ UV c)
 840 {
 841     U8 tmpbuf[UTF8_MAXLEN+1];
 842     uvchr_to_utf8(tmpbuf, (UV)c);
 843     return is_utf8_alpha(tmpbuf);
 844 }
 845
 846 bool
 847 Perl_is_uni_ascii(pTHX_ UV c)
 848 {
 849     U8 tmpbuf[UTF8_MAXLEN+1];
 850     uvchr_to_utf8(tmpbuf, (UV)c);
 851     return is_utf8_ascii(tmpbuf);
 852 }
 853
 854 bool
 855 Perl_is_uni_space(pTHX_ UV c)
 856 {
 857     U8 tmpbuf[UTF8_MAXLEN+1];
 858     uvchr_to_utf8(tmpbuf, (UV)c);
 859     return is_utf8_space(tmpbuf);
 860 }
 861
 862 bool
 863 Perl_is_uni_digit(pTHX_ UV c)
 864 {
 865     U8 tmpbuf[UTF8_MAXLEN+1];
 866     uvchr_to_utf8(tmpbuf, (UV)c);
 867     return is_utf8_digit(tmpbuf);
 868 }
 869
 870 bool
 871 Perl_is_uni_upper(pTHX_ UV c)
 872 {
 873     U8 tmpbuf[UTF8_MAXLEN+1];
 874     uvchr_to_utf8(tmpbuf, (UV)c);
 875     return is_utf8_upper(tmpbuf);
 876 }
 877
 878 bool
 879 Perl_is_uni_lower(pTHX_ UV c)
 880 {
 881     U8 tmpbuf[UTF8_MAXLEN+1];
 882     uvchr_to_utf8(tmpbuf, (UV)c);
 883     return is_utf8_lower(tmpbuf);
 884 }
 885
 886 bool
 887 Perl_is_uni_cntrl(pTHX_ UV c)
 888 {
 889     U8 tmpbuf[UTF8_MAXLEN+1];
 890     uvchr_to_utf8(tmpbuf, (UV)c);
 891     return is_utf8_cntrl(tmpbuf);
 892 }
 893
 894 bool
 895 Perl_is_uni_graph(pTHX_ UV c)
 896 {
 897     U8 tmpbuf[UTF8_MAXLEN+1];
 898     uvchr_to_utf8(tmpbuf, (UV)c);
 899     return is_utf8_graph(tmpbuf);
 900 }
 901
 902 bool
 903 Perl_is_uni_print(pTHX_ UV c)
 904 {
 905     U8 tmpbuf[UTF8_MAXLEN+1];
 906     uvchr_to_utf8(tmpbuf, (UV)c);
 907     return is_utf8_print(tmpbuf);
 908 }
 909
 910 bool
 911 Perl_is_uni_punct(pTHX_ UV c)
 912 {
 913     U8 tmpbuf[UTF8_MAXLEN+1];
 914     uvchr_to_utf8(tmpbuf, (UV)c);
 915     return is_utf8_punct(tmpbuf);
 916 }
 917
 918 bool
 919 Perl_is_uni_xdigit(pTHX_ UV c)
 920 {
 921     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 922     uvchr_to_utf8(tmpbuf, (UV)c);
 923     return is_utf8_xdigit(tmpbuf);
 924 }
 925
 926 UV
 927 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 928 {
 929     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 930     uvchr_to_utf8(tmpbuf, (UV)c);
 931     return to_utf8_upper(tmpbuf, p, lenp);
 932 }
 933
 934 UV
 935 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 936 {
 937     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 938     uvchr_to_utf8(tmpbuf, (UV)c);
 939     return to_utf8_title(tmpbuf, p, lenp);
 940 }
 941
 942 UV
 943 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 944 {
 945     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 946     uvchr_to_utf8(tmpbuf, (UV)c);
 947     return to_utf8_lower(tmpbuf, p, lenp);
 948 }
 949
 950 UV
 951 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
 952 {
 953     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
 954     uvchr_to_utf8(tmpbuf, (UV)c);
 955     return to_utf8_fold(tmpbuf, p, lenp);
 956 }
 957
 958 /* for now these all assume no locale info available for Unicode > 255 */
 959
 960 bool
 961 Perl_is_uni_alnum_lc(pTHX_ UV c)
 962 {
 963     return is_uni_alnum(c);     /* XXX no locale support yet */
 964 }
 965
 966 bool
 967 Perl_is_uni_alnumc_lc(pTHX_ UV c)
 968 {
 969     return is_uni_alnumc(c);    /* XXX no locale support yet */
 970 }
 971
 972 bool
 973 Perl_is_uni_idfirst_lc(pTHX_ UV c)
 974 {
 975     return is_uni_idfirst(c);   /* XXX no locale support yet */
 976 }
 977
 978 bool
 979 Perl_is_uni_alpha_lc(pTHX_ UV c)
 980 {
 981     return is_uni_alpha(c);     /* XXX no locale support yet */
 982 }
 983
 984 bool
 985 Perl_is_uni_ascii_lc(pTHX_ UV c)
 986 {
 987     return is_uni_ascii(c);     /* XXX no locale support yet */
 988 }
 989
 990 bool
 991 Perl_is_uni_space_lc(pTHX_ UV c)
 992 {
 993     return is_uni_space(c);     /* XXX no locale support yet */
 994 }
 995
 996 bool
 997 Perl_is_uni_digit_lc(pTHX_ UV c)
 998 {
 999     return is_uni_digit(c);     /* XXX no locale support yet */
1000 }
1001
1002 bool
1003 Perl_is_uni_upper_lc(pTHX_ UV c)
1004 {
1005     return is_uni_upper(c);     /* XXX no locale support yet */
1006 }
1007
1008 bool
1009 Perl_is_uni_lower_lc(pTHX_ UV c)
1010 {
1011     return is_uni_lower(c);     /* XXX no locale support yet */
1012 }
1013
1014 bool
1015 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1016 {
1017     return is_uni_cntrl(c);     /* XXX no locale support yet */
1018 }
1019
1020 bool
1021 Perl_is_uni_graph_lc(pTHX_ UV c)
1022 {
1023     return is_uni_graph(c);     /* XXX no locale support yet */
1024 }
1025
1026 bool
1027 Perl_is_uni_print_lc(pTHX_ UV c)
1028 {
1029     return is_uni_print(c);     /* XXX no locale support yet */
1030 }
1031
1032 bool
1033 Perl_is_uni_punct_lc(pTHX_ UV c)
1034 {
1035     return is_uni_punct(c);     /* XXX no locale support yet */
1036 }
1037
1038 bool
1039 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1040 {
1041     return is_uni_xdigit(c);    /* XXX no locale support yet */
1042 }
1043
1044 U32
1045 Perl_to_uni_upper_lc(pTHX_ U32 c)
1046 {
1047     return to_uni_upper(c);    /* XXX no locale support yet */
1048 }
1049
1050 U32
1051 Perl_to_uni_title_lc(pTHX_ U32 c)
1052 {
1053     return to_uni_title(c);    /* XXX no locale support yet */
1054 }
1055
1056 U32
1057 Perl_to_uni_lower_lc(pTHX_ U32 c)
1058 {
1059     return to_uni_lower(c);    /* XXX no locale support yet */
1060 }
1061
1062 bool
1063 Perl_is_utf8_alnum(pTHX_ U8 *p)
1064 {
1065     if (!is_utf8_char(p))
1066         return FALSE;
1067     if (!PL_utf8_alnum)
1068         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1069          * descendant of isalnum(3), in other words, it doesn't
1070          * contain the '_'. --jhi */
1071         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1072     return swash_fetch(PL_utf8_alnum, p, TRUE);
1073 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1074 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1075     if (!PL_utf8_alnum)
1076         PL_utf8_alnum = swash_init("utf8", "",
1077             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1078     return swash_fetch(PL_utf8_alnum, p, TRUE);
1079 #endif
1080 }
1081
1082 bool
1083 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1084 {
1085     if (!is_utf8_char(p))
1086         return FALSE;
1087     if (!PL_utf8_alnum)
1088         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1089     return swash_fetch(PL_utf8_alnum, p, TRUE);
1090 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1091 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1092     if (!PL_utf8_alnum)
1093         PL_utf8_alnum = swash_init("utf8", "",
1094             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1095     return swash_fetch(PL_utf8_alnum, p, TRUE);
1096 #endif
1097 }
1098
1099 bool
1100 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1101 {
1102     return *p == '_' || is_utf8_alpha(p);
1103 }
1104
1105 bool
1106 Perl_is_utf8_alpha(pTHX_ U8 *p)
1107 {
1108     if (!is_utf8_char(p))
1109         return FALSE;
1110     if (!PL_utf8_alpha)
1111         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1112     return swash_fetch(PL_utf8_alpha, p, TRUE);
1113 }
1114
1115 bool
1116 Perl_is_utf8_ascii(pTHX_ U8 *p)
1117 {
1118     if (!is_utf8_char(p))
1119         return FALSE;
1120     if (!PL_utf8_ascii)
1121         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1122     return swash_fetch(PL_utf8_ascii, p, TRUE);
1123 }
1124
1125 bool
1126 Perl_is_utf8_space(pTHX_ U8 *p)
1127 {
1128     if (!is_utf8_char(p))
1129         return FALSE;
1130     if (!PL_utf8_space)
1131         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1132     return swash_fetch(PL_utf8_space, p, TRUE);
1133 }
1134
1135 bool
1136 Perl_is_utf8_digit(pTHX_ U8 *p)
1137 {
1138     if (!is_utf8_char(p))
1139         return FALSE;
1140     if (!PL_utf8_digit)
1141         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1142     return swash_fetch(PL_utf8_digit, p, TRUE);
1143 }
1144
1145 bool
1146 Perl_is_utf8_upper(pTHX_ U8 *p)
1147 {
1148     if (!is_utf8_char(p))
1149         return FALSE;
1150     if (!PL_utf8_upper)
1151         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1152     return swash_fetch(PL_utf8_upper, p, TRUE);
1153 }
1154
1155 bool
1156 Perl_is_utf8_lower(pTHX_ U8 *p)
1157 {
1158     if (!is_utf8_char(p))
1159         return FALSE;
1160     if (!PL_utf8_lower)
1161         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1162     return swash_fetch(PL_utf8_lower, p, TRUE);
1163 }
1164
1165 bool
1166 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1167 {
1168     if (!is_utf8_char(p))
1169         return FALSE;
1170     if (!PL_utf8_cntrl)
1171         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1172     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1173 }
1174
1175 bool
1176 Perl_is_utf8_graph(pTHX_ U8 *p)
1177 {
1178     if (!is_utf8_char(p))
1179         return FALSE;
1180     if (!PL_utf8_graph)
1181         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1182     return swash_fetch(PL_utf8_graph, p, TRUE);
1183 }
1184
1185 bool
1186 Perl_is_utf8_print(pTHX_ U8 *p)
1187 {
1188     if (!is_utf8_char(p))
1189         return FALSE;
1190     if (!PL_utf8_print)
1191         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1192     return swash_fetch(PL_utf8_print, p, TRUE);
1193 }
1194
1195 bool
1196 Perl_is_utf8_punct(pTHX_ U8 *p)
1197 {
1198     if (!is_utf8_char(p))
1199         return FALSE;
1200     if (!PL_utf8_punct)
1201         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1202     return swash_fetch(PL_utf8_punct, p, TRUE);
1203 }
1204
1205 bool
1206 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1207 {
1208     if (!is_utf8_char(p))
1209         return FALSE;
1210     if (!PL_utf8_xdigit)
1211         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1212     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1213 }
1214
1215 bool
1216 Perl_is_utf8_mark(pTHX_ U8 *p)
1217 {
1218     if (!is_utf8_char(p))
1219         return FALSE;
1220     if (!PL_utf8_mark)
1221         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1222     return swash_fetch(PL_utf8_mark, p, TRUE);
1223 }
1224
1225 /*
1226 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1227
1228 The "p" contains the pointer to the UTF-8 string encoding
1229 the character that is being converted.
1230
1231 The "ustrp" is a pointer to the character buffer to put the
1232 conversion result to.  The "lenp" is a pointer to the length
1233 of the result.
1234
1235 The "swash" is a pointer to the swash to use.
1236
1237 The "normal" is a string like "ToLower" which means the swash
1238 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1239 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1240
1241 The "special" is a string like "utf8::ToSpecLower", which means
1242 the hash %utf8::ToSpecLower, which is stored in the same file,
1243 lib/unicore/To/Lower.pl, and also loaded by SWASHGET.  The access
1244 to the hash is by Perl_to_utf8_case().
1245
1246 =cut
1247  */
1248
1249 UV
1250 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
1251 {
1252     UV uv;
1253
1254     if (!*swashp)
1255         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1256     uv = swash_fetch(*swashp, p, TRUE);
1257     if (uv)
1258          uv = UNI_TO_NATIVE(uv);
1259     else {
1260          HV *hv;
1261          SV *keysv;
1262          HE *he;
1263
1264          uv = utf8_to_uvchr(p, 0);
1265
1266          if ((hv    = get_hv(special, FALSE)) &&
1267              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1268              (he    = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1269               SV *val = HeVAL(he);
1270               char *s = SvPV(val, *lenp);
1271               U8 c = *(U8*)s;
1272               if (*lenp > 1 || UNI_IS_INVARIANT(c))
1273                    Copy(s, ustrp, *lenp, U8);
1274               else {
1275                    /* something in the 0x80..0xFF range */
1276                    ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1277                    ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1278                    *lenp = 2;
1279               }
1280               return 0;
1281          }
1282     }
1283     if (lenp)
1284        *lenp = UNISKIP(uv);
1285     uvuni_to_utf8(ustrp, uv);
1286     return uv;
1287 }
1288
1289 UV
1290 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1291 {
1292     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1293                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1294 }
1295
1296 UV
1297 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1298 {
1299     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1300                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1301 }
1302
1303 UV
1304 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1305 {
1306     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1307                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1308 }
1309
1310 UV
1311 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1312 {
1313     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1314                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1315 }
1316
1317 /* a "swash" is a swatch hash */
1318
1319 SV*
1320 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1321 {
1322     SV* retval;
1323     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1324     dSP;
1325     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1326     SV* errsv_save;
1327
1328     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1329         ENTER;
1330         errsv_save = newSVsv(ERRSV);
1331         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1332         if (!SvTRUE(ERRSV))
1333             sv_setsv(ERRSV, errsv_save);
1334         SvREFCNT_dec(errsv_save);
1335         LEAVE;
1336     }
1337     SPAGAIN;
1338     PUSHSTACKi(PERLSI_MAGIC);
1339     PUSHMARK(SP);
1340     EXTEND(SP,5);
1341     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1342     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1343     PUSHs(listsv);
1344     PUSHs(sv_2mortal(newSViv(minbits)));
1345     PUSHs(sv_2mortal(newSViv(none)));
1346     PUTBACK;
1347     ENTER;
1348     SAVEI32(PL_hints);
1349     PL_hints = 0;
1350     save_re_context();
1351     if (PL_curcop == &PL_compiling)
1352         /* XXX ought to be handled by lex_start */
1353         sv_setpv(tokenbufsv, PL_tokenbuf);
1354     errsv_save = newSVsv(ERRSV);
1355     if (call_method("SWASHNEW", G_SCALAR))
1356         retval = newSVsv(*PL_stack_sp--);
1357     else
1358         retval = &PL_sv_undef;
1359     if (!SvTRUE(ERRSV))
1360         sv_setsv(ERRSV, errsv_save);
1361     SvREFCNT_dec(errsv_save);
1362     LEAVE;
1363     POPSTACK;
1364     if (PL_curcop == &PL_compiling) {
1365         STRLEN len;
1366         char* pv = SvPV(tokenbufsv, len);
1367
1368         Copy(pv, PL_tokenbuf, len+1, char);
1369         PL_curcop->op_private = PL_hints;
1370     }
1371     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1372         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1373     return retval;
1374 }
1375
1376
1377 /* This API is wrong for special case conversions since we may need to
1378  * return several Unicode characters for a single Unicode character
1379  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1380  * the lower-level routine, and it is similarly broken for returning
1381  * multiple values.  --jhi */
1382 UV
1383 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1384 {
1385     HV* hv = (HV*)SvRV(sv);
1386     U32 klen;
1387     U32 off;
1388     STRLEN slen;
1389     STRLEN needents;
1390     U8 *tmps = NULL;
1391     U32 bit;
1392     SV *retval;
1393     U8 tmputf8[2];
1394     UV c = NATIVE_TO_ASCII(*ptr);
1395
1396     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1397         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1398         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1399         ptr = tmputf8;
1400     }
1401     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1402      * then the "swatch" is a vec() for al the chars which start
1403      * with 0xAA..0xYY
1404      * So the key in the hash (klen) is length of encoded char -1
1405      */
1406     klen = UTF8SKIP(ptr) - 1;
1407     off  = ptr[klen];
1408
1409     if (klen == 0)
1410      {
1411       /* If char in invariant then swatch is for all the invariant chars
1412        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1413        */
1414       needents = UTF_CONTINUATION_MARK;
1415       off      = NATIVE_TO_UTF(ptr[klen]);
1416      }
1417     else
1418      {
1419       /* If char is encoded then swatch is for the prefix */
1420       needents = (1 << UTF_ACCUMULATION_SHIFT);
1421       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1422      }
1423
1424     /*
1425      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1426      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1427      * it's nothing to sniff at.)  Pity we usually come through at least
1428      * two function calls to get here...
1429      *
1430      * NB: this code assumes that swatches are never modified, once generated!
1431      */
1432
1433     if (hv   == PL_last_swash_hv &&
1434         klen == PL_last_swash_klen &&
1435         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1436     {
1437         tmps = PL_last_swash_tmps;
1438         slen = PL_last_swash_slen;
1439     }
1440     else {
1441         /* Try our second-level swatch cache, kept in a hash. */
1442         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1443
1444         /* If not cached, generate it via utf8::SWASHGET */
1445         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1446             dSP;
1447             /* We use utf8n_to_uvuni() as we want an index into
1448                Unicode tables, not a native character number.
1449              */
1450             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1451             SV *errsv_save;
1452             ENTER;
1453             SAVETMPS;
1454             save_re_context();
1455             PUSHSTACKi(PERLSI_MAGIC);
1456             PUSHMARK(SP);
1457             EXTEND(SP,3);
1458             PUSHs((SV*)sv);
1459             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1460             PUSHs(sv_2mortal(newSViv((klen) ?
1461                                      (code_point & ~(needents - 1)) : 0)));
1462             PUSHs(sv_2mortal(newSViv(needents)));
1463             PUTBACK;
1464             errsv_save = newSVsv(ERRSV);
1465             if (call_method("SWASHGET", G_SCALAR))
1466                 retval = newSVsv(*PL_stack_sp--);
1467             else
1468                 retval = &PL_sv_undef;
1469             if (!SvTRUE(ERRSV))
1470                 sv_setsv(ERRSV, errsv_save);
1471             SvREFCNT_dec(errsv_save);
1472             POPSTACK;
1473             FREETMPS;
1474             LEAVE;
1475             if (PL_curcop == &PL_compiling)
1476                 PL_curcop->op_private = PL_hints;
1477
1478             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1479
1480             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1481                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1482         }
1483
1484         PL_last_swash_hv = hv;
1485         PL_last_swash_klen = klen;
1486         PL_last_swash_tmps = tmps;
1487         PL_last_swash_slen = slen;
1488         if (klen)
1489             Copy(ptr, PL_last_swash_key, klen, U8);
1490     }
1491
1492     switch ((int)((slen << 3) / needents)) {
1493     case 1:
1494         bit = 1 << (off & 7);
1495         off >>= 3;
1496         return (tmps[off] & bit) != 0;
1497     case 8:
1498         return tmps[off];
1499     case 16:
1500         off <<= 1;
1501         return (tmps[off] << 8) + tmps[off + 1] ;
1502     case 32:
1503         off <<= 2;
1504         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1505     }
1506     Perl_croak(aTHX_ "panic: swash_fetch");
1507     return 0;
1508 }
1509
1510
1511 /*
1512 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1513
1514 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1515 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1516 bytes available. The return value is the pointer to the byte after the
1517 end of the new character. In other words,
1518
1519     d = uvchr_to_utf8(d, uv);
1520
1521 is the recommended wide native character-aware way of saying
1522
1523     *(d++) = uv;
1524
1525 =cut
1526 */
1527
1528 /* On ASCII machines this is normally a macro but we want a
1529    real function in case XS code wants it
1530 */
1531 #undef Perl_uvchr_to_utf8
1532 U8 *
1533 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1534 {
1535     return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1536 }
1537
1538
1539 /*
1540 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1541
1542 Returns the native character value of the first character in the string C<s>
1543 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1544 length, in bytes, of that character.
1545
1546 Allows length and flags to be passed to low level routine.
1547
1548 =cut
1549 */
1550 /* On ASCII machines this is normally a macro but we want a
1551    real function in case XS code wants it
1552 */
1553 #undef Perl_utf8n_to_uvchr
1554 UV
1555 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1556 {
1557     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1558     return UNI_TO_NATIVE(uv);
1559 }
1560
1561 /*
1562 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1563
1564 Build to the scalar dsv a displayable version of the string spv,
1565 length len, the displayable version being at most pvlim bytes long
1566 (if longer, the rest is truncated and "..." will be appended).
1567 The flags argument is currently unused but available for future extensions.
1568 The pointer to the PV of the dsv is returned.
1569
1570 =cut */
1571 char *
1572 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1573 {
1574     int truncated = 0;
1575     char *s, *e;
1576
1577     sv_setpvn(dsv, "", 0);
1578     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1579          UV u;
1580          if (pvlim && SvCUR(dsv) >= pvlim) {
1581               truncated++;
1582               break;
1583          }
1584          u = utf8_to_uvchr((U8*)s, 0);
1585          Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1586     }
1587     if (truncated)
1588          sv_catpvn(dsv, "...", 3);
1589
1590     return SvPVX(dsv);
1591 }
1592
1593 /*
1594 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1595
1596 Build to the scalar dsv a displayable version of the scalar sv,
1597 he displayable version being at most pvlim bytes long
1598 (if longer, the rest is truncated and "..." will be appended).
1599 The flags argument is currently unused but available for future extensions.
1600 The pointer to the PV of the dsv is returned.
1601
1602 =cut */
1603 char *
1604 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1605 {
1606      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1607                                 pvlim, flags);
1608 }
1609
1610 /*
1611 =for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1612
1613 Return true if the strings s1 and s2 differ case-insensitively, false
1614 if not (if they are equal case-insensitively).  If u1 is true, the
1615 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1616 the string s2 is assumed to be in UTF-8-encoded Unicode.
1617
1618 For case-insensitiveness, the "casefolding" of Unicode is used
1619 instead of upper/lowercasing both the characters, see
1620 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1621
1622 =cut */
1623 I32
1624 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1625 {
1626      register U8 *a  = (U8*)s1;
1627      register U8 *b  = (U8*)s2;
1628      register U8 *ae = b + len1;
1629      register U8 *be = b + len2;
1630      STRLEN la, lb;
1631      UV ca, cb;
1632      STRLEN ulen1, ulen2;
1633      U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1634      U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
1635
1636      while (a < ae && b < be) {
1637           if (u1) {
1638                if (a + UTF8SKIP(a) > ae)
1639                     break;
1640                ca = utf8_to_uvchr((U8*)a, &la);
1641           } else {
1642                ca = *a;
1643                la = 1;
1644           }
1645           if (u2) {
1646                if (b + UTF8SKIP(b) > be)
1647                     break;
1648                cb = utf8_to_uvchr((U8*)b, &lb);
1649           } else {
1650                cb = *b;
1651                lb = 1;
1652           }
1653           if (ca != cb) {
1654                if (u1)
1655                     to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1656                else
1657                     ulen1 = 1;
1658                if (u2)
1659                     to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1660                else
1661                     ulen2 = 1;
1662                if (ulen1 != ulen2
1663                    || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1664                    || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1665                     return 1; /* mismatch */
1666           }
1667           a += la;
1668           b += lb;
1669      }
1670      return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */
1671 }
1672