utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2001, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /* Unicode support */
  28
  29 /*
  30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8(d, uv);
  38
  39 is the recommended Unicode-aware way of saying
  40
  41     *(d++) = uv;
  42
  43 =cut
  44 */
  45
  46 U8 *
  47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  48 {
  49     if (UNI_IS_INVARIANT(uv)) {
  50         *d++ = UTF_TO_NATIVE(uv);
  51         return d;
  52     }
  53 #if defined(EBCDIC)
  54     else {
  55         STRLEN len  = UNISKIP(uv);
  56         U8 *p = d+len-1;
  57         while (p > d) {
  58             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  59             uv >>= UTF_ACCUMULATION_SHIFT;
  60         }
  61         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  62         return d+len;
  63     }
  64 #else /* Non loop style */
  65     if (uv < 0x800) {
  66         *d++ = (( uv >>  6)         | 0xc0);
  67         *d++ = (( uv        & 0x3f) | 0x80);
  68         return d;
  69     }
  70     if (uv < 0x10000) {
  71         *d++ = (( uv >> 12)         | 0xe0);
  72         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  73         *d++ = (( uv        & 0x3f) | 0x80);
  74         return d;
  75     }
  76     if (uv < 0x200000) {
  77         *d++ = (( uv >> 18)         | 0xf0);
  78         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  79         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  80         *d++ = (( uv        & 0x3f) | 0x80);
  81         return d;
  82     }
  83     if (uv < 0x4000000) {
  84         *d++ = (( uv >> 24)         | 0xf8);
  85         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  86         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  87         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  88         *d++ = (( uv        & 0x3f) | 0x80);
  89         return d;
  90     }
  91     if (uv < 0x80000000) {
  92         *d++ = (( uv >> 30)         | 0xfc);
  93         *d++ = (((uv >> 24) & 0x3f) | 0x80);
  94         *d++ = (((uv >> 18) & 0x3f) | 0x80);
  95         *d++ = (((uv >> 12) & 0x3f) | 0x80);
  96         *d++ = (((uv >>  6) & 0x3f) | 0x80);
  97         *d++ = (( uv        & 0x3f) | 0x80);
  98         return d;
  99     }
 100 #ifdef HAS_QUAD
 101     if (uv < UTF8_QUAD_MAX)
 102 #endif
 103     {
 104         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 105         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 106         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 107         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 108         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 109         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 110         *d++ = (( uv        & 0x3f) | 0x80);
 111         return d;
 112     }
 113 #ifdef HAS_QUAD
 114     {
 115         *d++ =                        0xff;     /* Can't match U+FFFE! */
 116         *d++ =                        0x80;     /* 6 Reserved bits */
 117         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 118         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 119         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 120         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 121         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 122         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 123         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 124         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 125         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 126         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 127         *d++ = (( uv        & 0x3f) | 0x80);
 128         return d;
 129     }
 130 #endif
 131 #endif /* Loop style */
 132 }
 133
 134
 135
 136 /*
 137 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 138
 139 Tests if some arbitrary number of bytes begins in a valid UTF-8
 140 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 141 The actual number of bytes in the UTF-8 character will be returned if
 142 it is valid, otherwise 0.
 143
 144 =cut
 145 */
 146 STRLEN
 147 Perl_is_utf8_char(pTHX_ U8 *s)
 148 {
 149     U8 u = *s;
 150     STRLEN slen, len;
 151     UV uv, ouv;
 152
 153     if (UTF8_IS_INVARIANT(u))
 154         return 1;
 155
 156     if (!UTF8_IS_START(u))
 157         return 0;
 158
 159     len = UTF8SKIP(s);
 160
 161     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 162         return 0;
 163
 164     slen = len - 1;
 165     s++;
 166     u &= UTF_START_MASK(len);
 167     uv  = u;
 168     ouv = uv;
 169     while (slen--) {
 170         if (!UTF8_IS_CONTINUATION(*s))
 171             return 0;
 172         uv = UTF8_ACCUMULATE(uv, *s);
 173         if (uv < ouv)
 174             return 0;
 175         ouv = uv;
 176         s++;
 177     }
 178
 179     if (UNISKIP(uv) < len)
 180         return 0;
 181
 182     return len;
 183 }
 184
 185 /*
 186 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 187
 188 Returns true if first C<len> bytes of the given string form a valid UTF8
 189 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 190 'a string that contains UTF8' because a valid ASCII string is a valid
 191 UTF8 string.
 192
 193 =cut
 194 */
 195
 196 bool
 197 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 198 {
 199     U8* x = s;
 200     U8* send;
 201     STRLEN c;
 202
 203     if (!len)
 204         len = strlen((char *)s);
 205     send = s + len;
 206
 207     while (x < send) {
 208         c = is_utf8_char(x);
 209         if (!c)
 210             return FALSE;
 211         x += c;
 212     }
 213     if (x != send)
 214         return FALSE;
 215
 216     return TRUE;
 217 }
 218
 219 /*
 220 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 221
 222 Bottom level UTF-8 decode routine.
 223 Returns the unicode code point value of the first character in the string C<s>
 224 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 225 C<retlen> will be set to the length, in bytes, of that character.
 226
 227 If C<s> does not point to a well-formed UTF8 character, the behaviour
 228 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 229 it is assumed that the caller will raise a warning, and this function
 230 will silently just set C<retlen> to C<-1> and return zero.  If the
 231 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 232 malformations will be given, C<retlen> will be set to the expected
 233 length of the UTF-8 character in bytes, and zero will be returned.
 234
 235 The C<flags> can also contain various flags to allow deviations from
 236 the strict UTF-8 encoding (see F<utf8.h>).
 237
 238 Most code should use utf8_to_uvchr() rather than call this directly.
 239
 240 =cut
 241 */
 242
 243 UV
 244 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 245 {
 246     UV uv = *s, ouv = 0;
 247     STRLEN len = 1;
 248     bool dowarn = ckWARN_d(WARN_UTF8);
 249     STRLEN expectlen = 0;
 250     U32 warning = 0;
 251
 252 /* This list is a superset of the UTF8_ALLOW_XXX. */
 253
 254 #define UTF8_WARN_EMPTY                          1
 255 #define UTF8_WARN_CONTINUATION                   2
 256 #define UTF8_WARN_NON_CONTINUATION               3
 257 #define UTF8_WARN_FE_FF                          4
 258 #define UTF8_WARN_SHORT                          5
 259 #define UTF8_WARN_OVERFLOW                       6
 260 #define UTF8_WARN_SURROGATE                      7
 261 #define UTF8_WARN_BOM                            8
 262 #define UTF8_WARN_LONG                           9
 263 #define UTF8_WARN_FFFF                          10
 264
 265     if (curlen == 0 &&
 266         !(flags & UTF8_ALLOW_EMPTY)) {
 267         warning = UTF8_WARN_EMPTY;
 268         goto malformed;
 269     }
 270
 271     if (UTF8_IS_INVARIANT(uv)) {
 272         if (retlen)
 273             *retlen = 1;
 274         return (UV) (NATIVE_TO_UTF(*s));
 275     }
 276
 277     if (UTF8_IS_CONTINUATION(uv) &&
 278         !(flags & UTF8_ALLOW_CONTINUATION)) {
 279         warning = UTF8_WARN_CONTINUATION;
 280         goto malformed;
 281     }
 282
 283     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 284         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 285         warning = UTF8_WARN_NON_CONTINUATION;
 286         goto malformed;
 287     }
 288
 289 #ifdef EBCDIC
 290     uv = NATIVE_TO_UTF(uv);
 291 #else
 292     if ((uv == 0xfe || uv == 0xff) &&
 293         !(flags & UTF8_ALLOW_FE_FF)) {
 294         warning = UTF8_WARN_FE_FF;
 295         goto malformed;
 296     }
 297 #endif
 298
 299     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 300     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 301     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 302     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 303 #ifdef EBCDIC
 304     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 305     else                        { len =  7; uv &= 0x01; }
 306 #else
 307     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 308     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 309     else                        { len = 13; uv = 0; } /* whoa! */
 310 #endif
 311
 312     if (retlen)
 313         *retlen = len;
 314
 315     expectlen = len;
 316
 317     if ((curlen < expectlen) &&
 318         !(flags & UTF8_ALLOW_SHORT)) {
 319         warning = UTF8_WARN_SHORT;
 320         goto malformed;
 321     }
 322
 323     len--;
 324     s++;
 325     ouv = uv;
 326
 327     while (len--) {
 328         if (!UTF8_IS_CONTINUATION(*s) &&
 329             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 330             s--;
 331             warning = UTF8_WARN_NON_CONTINUATION;
 332             goto malformed;
 333         }
 334         else
 335             uv = UTF8_ACCUMULATE(uv, *s);
 336         if (!(uv > ouv)) {
 337             /* These cannot be allowed. */
 338             if (uv == ouv) {
 339                 if (!(flags & UTF8_ALLOW_LONG)) {
 340                     warning = UTF8_WARN_LONG;
 341                     goto malformed;
 342                 }
 343             }
 344             else { /* uv < ouv */
 345                 /* This cannot be allowed. */
 346                 warning = UTF8_WARN_OVERFLOW;
 347                 goto malformed;
 348             }
 349         }
 350         s++;
 351         ouv = uv;
 352     }
 353
 354     if (UNICODE_IS_SURROGATE(uv) &&
 355         !(flags & UTF8_ALLOW_SURROGATE)) {
 356         warning = UTF8_WARN_SURROGATE;
 357         goto malformed;
 358     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 359                !(flags & UTF8_ALLOW_BOM)) {
 360         warning = UTF8_WARN_BOM;
 361         goto malformed;
 362     } else if ((expectlen > UNISKIP(uv)) &&
 363                !(flags & UTF8_ALLOW_LONG)) {
 364         warning = UTF8_WARN_LONG;
 365         goto malformed;
 366     } else if (UNICODE_IS_ILLEGAL(uv) &&
 367                !(flags & UTF8_ALLOW_FFFF)) {
 368         warning = UTF8_WARN_FFFF;
 369         goto malformed;
 370     }
 371
 372     return uv;
 373
 374 malformed:
 375
 376     if (flags & UTF8_CHECK_ONLY) {
 377         if (retlen)
 378             *retlen = -1;
 379         return 0;
 380     }
 381
 382     if (dowarn) {
 383         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 384
 385         switch (warning) {
 386         case 0: /* Intentionally empty. */ break;
 387         case UTF8_WARN_EMPTY:
 388             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 389             break;
 390         case UTF8_WARN_CONTINUATION:
 391             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
 392             break;
 393         case UTF8_WARN_NON_CONTINUATION:
 394             Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
 395                            (UV)s[1], uv);
 396             break;
 397         case UTF8_WARN_FE_FF:
 398             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 399             break;
 400         case UTF8_WARN_SHORT:
 401             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 402                            curlen, curlen == 1 ? "" : "s", expectlen);
 403             expectlen = curlen;         /* distance for caller to skip */
 404             break;
 405         case UTF8_WARN_OVERFLOW:
 406             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
 407                            ouv, *s);
 408             break;
 409         case UTF8_WARN_SURROGATE:
 410             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 411             break;
 412         case UTF8_WARN_BOM:
 413             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 414             break;
 415         case UTF8_WARN_LONG:
 416             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
 417                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
 418             break;
 419         case UTF8_WARN_FFFF:
 420             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 421             break;
 422         default:
 423             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 424             break;
 425         }
 426
 427         if (warning) {
 428             char *s = SvPVX(sv);
 429
 430             if (PL_op)
 431                 Perl_warner(aTHX_ WARN_UTF8,
 432                             "%s in %s", s,  OP_DESC(PL_op));
 433             else
 434                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 435         }
 436     }
 437
 438     if (retlen)
 439         *retlen = expectlen ? expectlen : len;
 440
 441     return 0;
 442 }
 443
 444 /*
 445 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 446
 447 Returns the native character value of the first character in the string C<s>
 448 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 449 length, in bytes, of that character.
 450
 451 If C<s> does not point to a well-formed UTF8 character, zero is
 452 returned and retlen is set, if possible, to -1.
 453
 454 =cut
 455 */
 456
 457 UV
 458 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 459 {
 460     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 461 }
 462
 463 /*
 464 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 465
 466 Returns the Unicode code point of the first character in the string C<s>
 467 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 468 length, in bytes, of that character.
 469
 470 This function should only be used when returned UV is considered
 471 an index into the Unicode semantic tables (e.g. swashes).
 472
 473 If C<s> does not point to a well-formed UTF8 character, zero is
 474 returned and retlen is set, if possible, to -1.
 475
 476 =cut
 477 */
 478
 479 UV
 480 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 481 {
 482     /* Call the low level routine asking for checks */
 483     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 484 }
 485
 486 /*
 487 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 488
 489 Return the length of the UTF-8 char encoded string C<s> in characters.
 490 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 491 up past C<e>, croaks.
 492
 493 =cut
 494 */
 495
 496 STRLEN
 497 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 498 {
 499     STRLEN len = 0;
 500
 501     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 502      * the bitops (especially ~) can create illegal UTF-8.
 503      * In other words: in Perl UTF-8 is not just for Unicode. */
 504
 505     if (e < s)
 506         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 507     while (s < e) {
 508         U8 t = UTF8SKIP(s);
 509
 510         if (e - s < t)
 511             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 512         s += t;
 513         len++;
 514     }
 515
 516     return len;
 517 }
 518
 519 /*
 520 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 521
 522 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 523 and C<b>.
 524
 525 WARNING: use only if you *know* that the pointers point inside the
 526 same UTF-8 buffer.
 527
 528 =cut
 529 */
 530
 531 IV
 532 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 533 {
 534     IV off = 0;
 535
 536     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 537      * the bitops (especially ~) can create illegal UTF-8.
 538      * In other words: in Perl UTF-8 is not just for Unicode. */
 539
 540     if (a < b) {
 541         while (a < b) {
 542             U8 c = UTF8SKIP(a);
 543
 544             if (b - a < c)
 545                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 546             a += c;
 547             off--;
 548         }
 549     }
 550     else {
 551         while (b < a) {
 552             U8 c = UTF8SKIP(b);
 553
 554             if (a - b < c)
 555                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 556             b += c;
 557             off++;
 558         }
 559     }
 560
 561     return off;
 562 }
 563
 564 /*
 565 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 566
 567 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 568 forward or backward.
 569
 570 WARNING: do not use the following unless you *know* C<off> is within
 571 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 572 on the first byte of character or just after the last byte of a character.
 573
 574 =cut
 575 */
 576
 577 U8 *
 578 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 579 {
 580     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 581      * the bitops (especially ~) can create illegal UTF-8.
 582      * In other words: in Perl UTF-8 is not just for Unicode. */
 583
 584     if (off >= 0) {
 585         while (off--)
 586             s += UTF8SKIP(s);
 587     }
 588     else {
 589         while (off++) {
 590             s--;
 591             while (UTF8_IS_CONTINUATION(*s))
 592                 s--;
 593         }
 594     }
 595     return s;
 596 }
 597
 598 /*
 599 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 600
 601 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 602 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 603 updates len to contain the new length.
 604 Returns zero on failure, setting C<len> to -1.
 605
 606 =cut
 607 */
 608
 609 U8 *
 610 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 611 {
 612     U8 *send;
 613     U8 *d;
 614     U8 *save = s;
 615
 616     /* ensure valid UTF8 and chars < 256 before updating string */
 617     for (send = s + *len; s < send; ) {
 618         U8 c = *s++;
 619
 620         if (!UTF8_IS_INVARIANT(c) &&
 621             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 622              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 623             *len = -1;
 624             return 0;
 625         }
 626     }
 627
 628     d = s = save;
 629     while (s < send) {
 630         STRLEN ulen;
 631         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 632         s += ulen;
 633     }
 634     *d = '\0';
 635     *len = d - save;
 636     return save;
 637 }
 638
 639 /*
 640 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 641
 642 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 643 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 644 the newly-created string, and updates C<len> to contain the new
 645 length.  Returns the original string if no conversion occurs, C<len>
 646 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 647 0 if C<s> is converted or contains all 7bit characters.
 648
 649 =cut
 650 */
 651
 652 U8 *
 653 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 654 {
 655     U8 *d;
 656     U8 *start = s;
 657     U8 *send;
 658     I32 count = 0;
 659
 660     if (!*is_utf8)
 661         return start;
 662
 663     /* ensure valid UTF8 and chars < 256 before converting string */
 664     for (send = s + *len; s < send;) {
 665         U8 c = *s++;
 666         if (!UTF8_IS_INVARIANT(c)) {
 667             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 668                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 669                 count++;
 670             else
 671                 return start;
 672         }
 673     }
 674
 675     *is_utf8 = 0;
 676
 677     Newz(801, d, (*len) - count + 1, U8);
 678     s = start; start = d;
 679     while (s < send) {
 680         U8 c = *s++;
 681         if (!UTF8_IS_INVARIANT(c)) {
 682             /* Then it is two-byte encoded */
 683             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 684             c = ASCII_TO_NATIVE(c);
 685         }
 686         *d++ = c;
 687     }
 688     *d = '\0';
 689     *len = d - start;
 690     return start;
 691 }
 692
 693 /*
 694 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 695
 696 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 697 Returns a pointer to the newly-created string, and sets C<len> to
 698 reflect the new length.
 699
 700 =cut
 701 */
 702
 703 U8*
 704 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 705 {
 706     U8 *send;
 707     U8 *d;
 708     U8 *dst;
 709     send = s + (*len);
 710
 711     Newz(801, d, (*len) * 2 + 1, U8);
 712     dst = d;
 713
 714     while (s < send) {
 715         UV uv = NATIVE_TO_ASCII(*s++);
 716         if (UNI_IS_INVARIANT(uv))
 717             *d++ = UTF_TO_NATIVE(uv);
 718         else {
 719             *d++ = UTF8_EIGHT_BIT_HI(uv);
 720             *d++ = UTF8_EIGHT_BIT_LO(uv);
 721         }
 722     }
 723     *d = '\0';
 724     *len = d-dst;
 725     return dst;
 726 }
 727
 728 /*
 729  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 730  *
 731  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 732  * We optimize for native, for obvious reasons. */
 733
 734 U8*
 735 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 736 {
 737     U8* pend;
 738     U8* dstart = d;
 739
 740     if (bytelen & 1)
 741         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 742
 743     pend = p + bytelen;
 744
 745     while (p < pend) {
 746         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 747         p += 2;
 748         if (uv < 0x80) {
 749             *d++ = uv;
 750             continue;
 751         }
 752         if (uv < 0x800) {
 753             *d++ = (( uv >>  6)         | 0xc0);
 754             *d++ = (( uv        & 0x3f) | 0x80);
 755             continue;
 756         }
 757         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 758             UV low = *p++;
 759             if (low < 0xdc00 || low >= 0xdfff)
 760                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 761             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 762         }
 763         if (uv < 0x10000) {
 764             *d++ = (( uv >> 12)         | 0xe0);
 765             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 766             *d++ = (( uv        & 0x3f) | 0x80);
 767             continue;
 768         }
 769         else {
 770             *d++ = (( uv >> 18)         | 0xf0);
 771             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 772             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 773             *d++ = (( uv        & 0x3f) | 0x80);
 774             continue;
 775         }
 776     }
 777     *newlen = d - dstart;
 778     return d;
 779 }
 780
 781 /* Note: this one is slightly destructive of the source. */
 782
 783 U8*
 784 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 785 {
 786     U8* s = (U8*)p;
 787     U8* send = s + bytelen;
 788     while (s < send) {
 789         U8 tmp = s[0];
 790         s[0] = s[1];
 791         s[1] = tmp;
 792         s += 2;
 793     }
 794     return utf16_to_utf8(p, d, bytelen, newlen);
 795 }
 796
 797 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 798
 799 bool
 800 Perl_is_uni_alnum(pTHX_ UV c)
 801 {
 802     U8 tmpbuf[UTF8_MAXLEN+1];
 803     uvchr_to_utf8(tmpbuf, (UV)c);
 804     return is_utf8_alnum(tmpbuf);
 805 }
 806
 807 bool
 808 Perl_is_uni_alnumc(pTHX_ UV c)
 809 {
 810     U8 tmpbuf[UTF8_MAXLEN+1];
 811     uvchr_to_utf8(tmpbuf, (UV)c);
 812     return is_utf8_alnumc(tmpbuf);
 813 }
 814
 815 bool
 816 Perl_is_uni_idfirst(pTHX_ UV c)
 817 {
 818     U8 tmpbuf[UTF8_MAXLEN+1];
 819     uvchr_to_utf8(tmpbuf, (UV)c);
 820     return is_utf8_idfirst(tmpbuf);
 821 }
 822
 823 bool
 824 Perl_is_uni_alpha(pTHX_ UV c)
 825 {
 826     U8 tmpbuf[UTF8_MAXLEN+1];
 827     uvchr_to_utf8(tmpbuf, (UV)c);
 828     return is_utf8_alpha(tmpbuf);
 829 }
 830
 831 bool
 832 Perl_is_uni_ascii(pTHX_ UV c)
 833 {
 834     U8 tmpbuf[UTF8_MAXLEN+1];
 835     uvchr_to_utf8(tmpbuf, (UV)c);
 836     return is_utf8_ascii(tmpbuf);
 837 }
 838
 839 bool
 840 Perl_is_uni_space(pTHX_ UV c)
 841 {
 842     U8 tmpbuf[UTF8_MAXLEN+1];
 843     uvchr_to_utf8(tmpbuf, (UV)c);
 844     return is_utf8_space(tmpbuf);
 845 }
 846
 847 bool
 848 Perl_is_uni_digit(pTHX_ UV c)
 849 {
 850     U8 tmpbuf[UTF8_MAXLEN+1];
 851     uvchr_to_utf8(tmpbuf, (UV)c);
 852     return is_utf8_digit(tmpbuf);
 853 }
 854
 855 bool
 856 Perl_is_uni_upper(pTHX_ UV c)
 857 {
 858     U8 tmpbuf[UTF8_MAXLEN+1];
 859     uvchr_to_utf8(tmpbuf, (UV)c);
 860     return is_utf8_upper(tmpbuf);
 861 }
 862
 863 bool
 864 Perl_is_uni_lower(pTHX_ UV c)
 865 {
 866     U8 tmpbuf[UTF8_MAXLEN+1];
 867     uvchr_to_utf8(tmpbuf, (UV)c);
 868     return is_utf8_lower(tmpbuf);
 869 }
 870
 871 bool
 872 Perl_is_uni_cntrl(pTHX_ UV c)
 873 {
 874     U8 tmpbuf[UTF8_MAXLEN+1];
 875     uvchr_to_utf8(tmpbuf, (UV)c);
 876     return is_utf8_cntrl(tmpbuf);
 877 }
 878
 879 bool
 880 Perl_is_uni_graph(pTHX_ UV c)
 881 {
 882     U8 tmpbuf[UTF8_MAXLEN+1];
 883     uvchr_to_utf8(tmpbuf, (UV)c);
 884     return is_utf8_graph(tmpbuf);
 885 }
 886
 887 bool
 888 Perl_is_uni_print(pTHX_ UV c)
 889 {
 890     U8 tmpbuf[UTF8_MAXLEN+1];
 891     uvchr_to_utf8(tmpbuf, (UV)c);
 892     return is_utf8_print(tmpbuf);
 893 }
 894
 895 bool
 896 Perl_is_uni_punct(pTHX_ UV c)
 897 {
 898     U8 tmpbuf[UTF8_MAXLEN+1];
 899     uvchr_to_utf8(tmpbuf, (UV)c);
 900     return is_utf8_punct(tmpbuf);
 901 }
 902
 903 bool
 904 Perl_is_uni_xdigit(pTHX_ UV c)
 905 {
 906     U8 tmpbuf[UTF8_MAXLEN*2+1];
 907     uvchr_to_utf8(tmpbuf, (UV)c);
 908     return is_utf8_xdigit(tmpbuf);
 909 }
 910
 911 UV
 912 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 913 {
 914     U8 tmpbuf[UTF8_MAXLEN*2+1];
 915     uvchr_to_utf8(tmpbuf, (UV)c);
 916     return to_utf8_upper(tmpbuf, p, lenp);
 917 }
 918
 919 UV
 920 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 921 {
 922     U8 tmpbuf[UTF8_MAXLEN*2+1];
 923     uvchr_to_utf8(tmpbuf, (UV)c);
 924     return to_utf8_title(tmpbuf, p, lenp);
 925 }
 926
 927 UV
 928 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 929 {
 930     U8 tmpbuf[UTF8_MAXLEN+1];
 931     uvchr_to_utf8(tmpbuf, (UV)c);
 932     return to_utf8_lower(tmpbuf, p, lenp);
 933 }
 934
 935 UV
 936 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
 937 {
 938     U8 tmpbuf[UTF8_MAXLEN+1];
 939     uvchr_to_utf8(tmpbuf, (UV)c);
 940     return to_utf8_fold(tmpbuf, p, lenp);
 941 }
 942
 943 /* for now these all assume no locale info available for Unicode > 255 */
 944
 945 bool
 946 Perl_is_uni_alnum_lc(pTHX_ UV c)
 947 {
 948     return is_uni_alnum(c);     /* XXX no locale support yet */
 949 }
 950
 951 bool
 952 Perl_is_uni_alnumc_lc(pTHX_ UV c)
 953 {
 954     return is_uni_alnumc(c);    /* XXX no locale support yet */
 955 }
 956
 957 bool
 958 Perl_is_uni_idfirst_lc(pTHX_ UV c)
 959 {
 960     return is_uni_idfirst(c);   /* XXX no locale support yet */
 961 }
 962
 963 bool
 964 Perl_is_uni_alpha_lc(pTHX_ UV c)
 965 {
 966     return is_uni_alpha(c);     /* XXX no locale support yet */
 967 }
 968
 969 bool
 970 Perl_is_uni_ascii_lc(pTHX_ UV c)
 971 {
 972     return is_uni_ascii(c);     /* XXX no locale support yet */
 973 }
 974
 975 bool
 976 Perl_is_uni_space_lc(pTHX_ UV c)
 977 {
 978     return is_uni_space(c);     /* XXX no locale support yet */
 979 }
 980
 981 bool
 982 Perl_is_uni_digit_lc(pTHX_ UV c)
 983 {
 984     return is_uni_digit(c);     /* XXX no locale support yet */
 985 }
 986
 987 bool
 988 Perl_is_uni_upper_lc(pTHX_ UV c)
 989 {
 990     return is_uni_upper(c);     /* XXX no locale support yet */
 991 }
 992
 993 bool
 994 Perl_is_uni_lower_lc(pTHX_ UV c)
 995 {
 996     return is_uni_lower(c);     /* XXX no locale support yet */
 997 }
 998
 999 bool
1000 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1001 {
1002     return is_uni_cntrl(c);     /* XXX no locale support yet */
1003 }
1004
1005 bool
1006 Perl_is_uni_graph_lc(pTHX_ UV c)
1007 {
1008     return is_uni_graph(c);     /* XXX no locale support yet */
1009 }
1010
1011 bool
1012 Perl_is_uni_print_lc(pTHX_ UV c)
1013 {
1014     return is_uni_print(c);     /* XXX no locale support yet */
1015 }
1016
1017 bool
1018 Perl_is_uni_punct_lc(pTHX_ UV c)
1019 {
1020     return is_uni_punct(c);     /* XXX no locale support yet */
1021 }
1022
1023 bool
1024 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1025 {
1026     return is_uni_xdigit(c);    /* XXX no locale support yet */
1027 }
1028
1029 bool
1030 Perl_is_utf8_alnum(pTHX_ U8 *p)
1031 {
1032     if (!is_utf8_char(p))
1033         return FALSE;
1034     if (!PL_utf8_alnum)
1035         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1036          * descendant of isalnum(3), in other words, it doesn't
1037          * contain the '_'. --jhi */
1038         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1039     return swash_fetch(PL_utf8_alnum, p, TRUE);
1040 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1041 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1042     if (!PL_utf8_alnum)
1043         PL_utf8_alnum = swash_init("utf8", "",
1044             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1045     return swash_fetch(PL_utf8_alnum, p, TRUE);
1046 #endif
1047 }
1048
1049 bool
1050 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1051 {
1052     if (!is_utf8_char(p))
1053         return FALSE;
1054     if (!PL_utf8_alnum)
1055         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1056     return swash_fetch(PL_utf8_alnum, p, TRUE);
1057 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1058 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1059     if (!PL_utf8_alnum)
1060         PL_utf8_alnum = swash_init("utf8", "",
1061             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1062     return swash_fetch(PL_utf8_alnum, p, TRUE);
1063 #endif
1064 }
1065
1066 bool
1067 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1068 {
1069     return *p == '_' || is_utf8_alpha(p);
1070 }
1071
1072 bool
1073 Perl_is_utf8_alpha(pTHX_ U8 *p)
1074 {
1075     if (!is_utf8_char(p))
1076         return FALSE;
1077     if (!PL_utf8_alpha)
1078         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1079     return swash_fetch(PL_utf8_alpha, p, TRUE);
1080 }
1081
1082 bool
1083 Perl_is_utf8_ascii(pTHX_ U8 *p)
1084 {
1085     if (!is_utf8_char(p))
1086         return FALSE;
1087     if (!PL_utf8_ascii)
1088         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1089     return swash_fetch(PL_utf8_ascii, p, TRUE);
1090 }
1091
1092 bool
1093 Perl_is_utf8_space(pTHX_ U8 *p)
1094 {
1095     if (!is_utf8_char(p))
1096         return FALSE;
1097     if (!PL_utf8_space)
1098         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1099     return swash_fetch(PL_utf8_space, p, TRUE);
1100 }
1101
1102 bool
1103 Perl_is_utf8_digit(pTHX_ U8 *p)
1104 {
1105     if (!is_utf8_char(p))
1106         return FALSE;
1107     if (!PL_utf8_digit)
1108         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1109     return swash_fetch(PL_utf8_digit, p, TRUE);
1110 }
1111
1112 bool
1113 Perl_is_utf8_upper(pTHX_ U8 *p)
1114 {
1115     if (!is_utf8_char(p))
1116         return FALSE;
1117     if (!PL_utf8_upper)
1118         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1119     return swash_fetch(PL_utf8_upper, p, TRUE);
1120 }
1121
1122 bool
1123 Perl_is_utf8_lower(pTHX_ U8 *p)
1124 {
1125     if (!is_utf8_char(p))
1126         return FALSE;
1127     if (!PL_utf8_lower)
1128         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1129     return swash_fetch(PL_utf8_lower, p, TRUE);
1130 }
1131
1132 bool
1133 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1134 {
1135     if (!is_utf8_char(p))
1136         return FALSE;
1137     if (!PL_utf8_cntrl)
1138         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1139     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1140 }
1141
1142 bool
1143 Perl_is_utf8_graph(pTHX_ U8 *p)
1144 {
1145     if (!is_utf8_char(p))
1146         return FALSE;
1147     if (!PL_utf8_graph)
1148         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1149     return swash_fetch(PL_utf8_graph, p, TRUE);
1150 }
1151
1152 bool
1153 Perl_is_utf8_print(pTHX_ U8 *p)
1154 {
1155     if (!is_utf8_char(p))
1156         return FALSE;
1157     if (!PL_utf8_print)
1158         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1159     return swash_fetch(PL_utf8_print, p, TRUE);
1160 }
1161
1162 bool
1163 Perl_is_utf8_punct(pTHX_ U8 *p)
1164 {
1165     if (!is_utf8_char(p))
1166         return FALSE;
1167     if (!PL_utf8_punct)
1168         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1169     return swash_fetch(PL_utf8_punct, p, TRUE);
1170 }
1171
1172 bool
1173 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1174 {
1175     if (!is_utf8_char(p))
1176         return FALSE;
1177     if (!PL_utf8_xdigit)
1178         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1179     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1180 }
1181
1182 bool
1183 Perl_is_utf8_mark(pTHX_ U8 *p)
1184 {
1185     if (!is_utf8_char(p))
1186         return FALSE;
1187     if (!PL_utf8_mark)
1188         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1189     return swash_fetch(PL_utf8_mark, p, TRUE);
1190 }
1191
1192 /*
1193 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1194
1195 The "p" contains the pointer to the UTF-8 string encoding
1196 the character that is being converted.
1197
1198 The "ustrp" is a pointer to the character buffer to put the
1199 conversion result to.  The "lenp" is a pointer to the length
1200 of the result.
1201
1202 The "swash" is a pointer to the swash to use.
1203
1204 The "normal" is a string like "ToLower" which means the swash
1205 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1206 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1207
1208 The "special" is a string like "utf8::ToSpecLower", which means
1209 the hash %utf8::ToSpecLower, which is stored in the same file,
1210 lib/unicore/To/Lower.pl, and also loaded by SWASHGET.  The access
1211 to the hash is by Perl_to_utf8_case().
1212
1213 =cut
1214  */
1215
1216 UV
1217 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
1218 {
1219     UV uv;
1220
1221     if (!*swashp)
1222         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1223     uv = swash_fetch(*swashp, p, TRUE);
1224     if (uv)
1225          uv = UNI_TO_NATIVE(uv);
1226     else {
1227          HV *hv;
1228          SV *keysv;
1229          HE *he;
1230
1231          uv = utf8_to_uvchr(p, 0);
1232
1233          if ((hv    = get_hv(special, FALSE)) &&
1234              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1235              (he    = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1236               SV *val = HeVAL(he);
1237               char *s = SvPV(val, *lenp);
1238               U8 c = *(U8*)s;
1239               if (*lenp > 1 || UNI_IS_INVARIANT(c))
1240                    Copy(s, ustrp, *lenp, U8);
1241               else {
1242                    /* something in the 0x80..0xFF range */
1243                    ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1244                    ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1245                    *lenp = 2;
1246               }
1247               return 0;
1248          }
1249     }
1250     *lenp = UNISKIP(uv);
1251     uvuni_to_utf8(ustrp, uv);
1252     return uv;
1253 }
1254
1255 UV
1256 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1257 {
1258     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1259                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1260 }
1261
1262 UV
1263 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1264 {
1265     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1266                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1267 }
1268
1269 UV
1270 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1271 {
1272     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1273                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1274 }
1275
1276 UV
1277 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1278 {
1279     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1280                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1281 }
1282
1283 /* a "swash" is a swatch hash */
1284
1285 SV*
1286 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1287 {
1288     SV* retval;
1289     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1290     dSP;
1291     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1292     SV* errsv_save;
1293
1294     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1295         ENTER;
1296         errsv_save = newSVsv(ERRSV);
1297         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1298         if (!SvTRUE(ERRSV))
1299             sv_setsv(ERRSV, errsv_save);
1300         SvREFCNT_dec(errsv_save);
1301         LEAVE;
1302     }
1303     SPAGAIN;
1304     PUSHSTACKi(PERLSI_MAGIC);
1305     PUSHMARK(SP);
1306     EXTEND(SP,5);
1307     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1308     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1309     PUSHs(listsv);
1310     PUSHs(sv_2mortal(newSViv(minbits)));
1311     PUSHs(sv_2mortal(newSViv(none)));
1312     PUTBACK;
1313     ENTER;
1314     SAVEI32(PL_hints);
1315     PL_hints = 0;
1316     save_re_context();
1317     if (PL_curcop == &PL_compiling)
1318         /* XXX ought to be handled by lex_start */
1319         sv_setpv(tokenbufsv, PL_tokenbuf);
1320     errsv_save = newSVsv(ERRSV);
1321     if (call_method("SWASHNEW", G_SCALAR))
1322         retval = newSVsv(*PL_stack_sp--);
1323     else
1324         retval = &PL_sv_undef;
1325     if (!SvTRUE(ERRSV))
1326         sv_setsv(ERRSV, errsv_save);
1327     SvREFCNT_dec(errsv_save);
1328     LEAVE;
1329     POPSTACK;
1330     if (PL_curcop == &PL_compiling) {
1331         STRLEN len;
1332         char* pv = SvPV(tokenbufsv, len);
1333
1334         Copy(pv, PL_tokenbuf, len+1, char);
1335         PL_curcop->op_private = PL_hints;
1336     }
1337     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1338         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1339     return retval;
1340 }
1341
1342
1343 /* This API is wrong for special case conversions since we may need to
1344  * return several Unicode characters for a single Unicode character
1345  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1346  * the lower-level routine, and it is similarly broken for returning
1347  * multiple values.  --jhi */
1348 UV
1349 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1350 {
1351     HV* hv = (HV*)SvRV(sv);
1352     U32 klen;
1353     U32 off;
1354     STRLEN slen;
1355     STRLEN needents;
1356     U8 *tmps = NULL;
1357     U32 bit;
1358     SV *retval;
1359     U8 tmputf8[2];
1360     UV c = NATIVE_TO_ASCII(*ptr);
1361
1362     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1363         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1364         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1365         ptr = tmputf8;
1366     }
1367     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1368      * then the "swatch" is a vec() for al the chars which start
1369      * with 0xAA..0xYY
1370      * So the key in the hash (klen) is length of encoded char -1
1371      */
1372     klen = UTF8SKIP(ptr) - 1;
1373     off  = ptr[klen];
1374
1375     if (klen == 0)
1376      {
1377       /* If char in invariant then swatch is for all the invariant chars
1378        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1379        */
1380       needents = UTF_CONTINUATION_MARK;
1381       off      = NATIVE_TO_UTF(ptr[klen]);
1382      }
1383     else
1384      {
1385       /* If char is encoded then swatch is for the prefix */
1386       needents = (1 << UTF_ACCUMULATION_SHIFT);
1387       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1388      }
1389
1390     /*
1391      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1392      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1393      * it's nothing to sniff at.)  Pity we usually come through at least
1394      * two function calls to get here...
1395      *
1396      * NB: this code assumes that swatches are never modified, once generated!
1397      */
1398
1399     if (hv   == PL_last_swash_hv &&
1400         klen == PL_last_swash_klen &&
1401         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1402     {
1403         tmps = PL_last_swash_tmps;
1404         slen = PL_last_swash_slen;
1405     }
1406     else {
1407         /* Try our second-level swatch cache, kept in a hash. */
1408         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1409
1410         /* If not cached, generate it via utf8::SWASHGET */
1411         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1412             dSP;
1413             /* We use utf8n_to_uvuni() as we want an index into
1414                Unicode tables, not a native character number.
1415              */
1416             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1417             SV *errsv_save;
1418             ENTER;
1419             SAVETMPS;
1420             save_re_context();
1421             PUSHSTACKi(PERLSI_MAGIC);
1422             PUSHMARK(SP);
1423             EXTEND(SP,3);
1424             PUSHs((SV*)sv);
1425             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1426             PUSHs(sv_2mortal(newSViv((klen) ?
1427                                      (code_point & ~(needents - 1)) : 0)));
1428             PUSHs(sv_2mortal(newSViv(needents)));
1429             PUTBACK;
1430             errsv_save = newSVsv(ERRSV);
1431             if (call_method("SWASHGET", G_SCALAR))
1432                 retval = newSVsv(*PL_stack_sp--);
1433             else
1434                 retval = &PL_sv_undef;
1435             if (!SvTRUE(ERRSV))
1436                 sv_setsv(ERRSV, errsv_save);
1437             SvREFCNT_dec(errsv_save);
1438             POPSTACK;
1439             FREETMPS;
1440             LEAVE;
1441             if (PL_curcop == &PL_compiling)
1442                 PL_curcop->op_private = PL_hints;
1443
1444             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1445
1446             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1447                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1448         }
1449
1450         PL_last_swash_hv = hv;
1451         PL_last_swash_klen = klen;
1452         PL_last_swash_tmps = tmps;
1453         PL_last_swash_slen = slen;
1454         if (klen)
1455             Copy(ptr, PL_last_swash_key, klen, U8);
1456     }
1457
1458     switch ((int)((slen << 3) / needents)) {
1459     case 1:
1460         bit = 1 << (off & 7);
1461         off >>= 3;
1462         return (tmps[off] & bit) != 0;
1463     case 8:
1464         return tmps[off];
1465     case 16:
1466         off <<= 1;
1467         return (tmps[off] << 8) + tmps[off + 1] ;
1468     case 32:
1469         off <<= 2;
1470         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1471     }
1472     Perl_croak(aTHX_ "panic: swash_fetch");
1473     return 0;
1474 }
1475
1476
1477 /*
1478 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1479
1480 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1481 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1482 bytes available. The return value is the pointer to the byte after the
1483 end of the new character. In other words,
1484
1485     d = uvchr_to_utf8(d, uv);
1486
1487 is the recommended wide native character-aware way of saying
1488
1489     *(d++) = uv;
1490
1491 =cut
1492 */
1493
1494 /* On ASCII machines this is normally a macro but we want a
1495    real function in case XS code wants it
1496 */
1497 #undef Perl_uvchr_to_utf8
1498 U8 *
1499 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1500 {
1501     return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1502 }
1503
1504
1505 /*
1506 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1507
1508 Returns the native character value of the first character in the string C<s>
1509 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1510 length, in bytes, of that character.
1511
1512 Allows length and flags to be passed to low level routine.
1513
1514 =cut
1515 */
1516 /* On ASCII machines this is normally a macro but we want a
1517    real function in case XS code wants it
1518 */
1519 #undef Perl_utf8n_to_uvchr
1520 UV
1521 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1522 {
1523     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1524     return UNI_TO_NATIVE(uv);
1525 }
1526
1527 /*
1528 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1529
1530 Build to the scalar dsv a displayable version of the string spv,
1531 length len, the displayable version being at most pvlim bytes long
1532 (if longer, the rest is truncated and "..." will be appended).
1533 The flags argument is currently unused but available for future extensions.
1534 The pointer to the PV of the dsv is returned.
1535
1536 =cut */
1537 char *
1538 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1539 {
1540     int truncated = 0;
1541     char *s, *e;
1542
1543     sv_setpvn(dsv, "", 0);
1544     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1545          UV u;
1546          if (pvlim && SvCUR(dsv) >= pvlim) {
1547               truncated++;
1548               break;
1549          }
1550          u = utf8_to_uvchr((U8*)s, 0);
1551          Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1552     }
1553     if (truncated)
1554          sv_catpvn(dsv, "...", 3);
1555
1556     return SvPVX(dsv);
1557 }
1558
1559 /*
1560 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1561
1562 Build to the scalar dsv a displayable version of the scalar sv,
1563 he displayable version being at most pvlim bytes long
1564 (if longer, the rest is truncated and "..." will be appended).
1565 The flags argument is currently unused but available for future extensions.
1566 The pointer to the PV of the dsv is returned.
1567
1568 =cut */
1569 char *
1570 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1571 {
1572      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1573                                 pvlim, flags);
1574 }
1575
1576 /*
1577 =for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1578
1579 Return true if the strings s1 and s2 differ case-insensitively, false
1580 if not (if they are equal case-insensitively).  If u1 is true, the
1581 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1582 the string s2 is assumed to be in UTF-8-encoded Unicode.  (If both u1
1583 and u2 are false, ibcmp() is called.)
1584
1585 For case-insensitiveness, the "casefolding" of Unicode is used
1586 instead of upper/lowercasing both the characters, see
1587 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1588
1589 =cut */
1590 I32
1591 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, const char *s2, bool u2, register I32 len)
1592 {
1593      if (u1 || u2) {
1594           register U8 *a = (U8*)s1;
1595           register U8 *b = (U8*)s2;
1596           STRLEN la, lb;
1597           UV ca, cb;
1598           STRLEN ulen1, ulen2;
1599           U8 tmpbuf1[UTF8_MAXLEN*3+1];
1600           U8 tmpbuf2[UTF8_MAXLEN*3+1];
1601
1602           while (len) {
1603                if (u1)
1604                     ca = utf8_to_uvchr((U8*)a, &la);
1605                else {
1606                     ca = *a;
1607                     la = 1;
1608                }
1609                if (u2)
1610                     cb = utf8_to_uvchr((U8*)b, &lb);
1611                else {
1612                     cb = *b;
1613                     lb = 1;
1614                }
1615                if (ca != cb) {
1616                     if (u1)
1617                          to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1618                     else
1619                          ulen1 = 1;
1620                     if (u2)
1621                          to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1622                     else
1623                          ulen2 = 1;
1624                     if (ulen1 != ulen2
1625                         || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1626                         || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1627                          return 1;
1628                }
1629                a += la;
1630                b += lb;
1631           }
1632           return 0;
1633      }
1634      else
1635          return ibcmp(s1, s2, len);
1636 }
1637