utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2002, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
  28
  29 /*
  30 =head1 Unicode Support
  31
  32 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  33
  34 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  35 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  36 bytes available. The return value is the pointer to the byte after the
  37 end of the new character. In other words,
  38
  39     d = uvuni_to_utf8_flags(d, uv, flags);
  40
  41 or, in most cases,
  42
  43     d = uvuni_to_utf8(d, uv);
  44
  45 (which is equivalent to)
  46
  47     d = uvuni_to_utf8_flags(d, uv, 0);
  48
  49 is the recommended Unicode-aware way of saying
  50
  51     *(d++) = uv;
  52
  53 =cut
  54 */
  55
  56 U8 *
  57 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  58 {
  59     if (ckWARN(WARN_UTF8)) {
  60          if (UNICODE_IS_SURROGATE(uv) &&
  61              !(flags & UNICODE_ALLOW_SURROGATE))
  62               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  63          else if (
  64                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  65                     !(flags & UNICODE_ALLOW_FDD0))
  66                    ||
  67                    (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
  68                     !(flags & UNICODE_ALLOW_BOM))
  69                    ||
  70                    ((uv & 0xFFFF) == 0xFFFF &&
  71                     !(flags & UNICODE_ALLOW_FFFF))) &&
  72                   /* UNICODE_ALLOW_SUPER includes
  73                    * FFFFs beyond 0x10FFFF. */
  74                   ((uv <= PERL_UNICODE_MAX) ||
  75                    !(flags & UNICODE_ALLOW_SUPER))
  76                   )
  77               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  78                          "Unicode character 0x%04"UVxf" is illegal", uv);
  79     }
  80     if (UNI_IS_INVARIANT(uv)) {
  81         *d++ = UTF_TO_NATIVE(uv);
  82         return d;
  83     }
  84 #if defined(EBCDIC)
  85     else {
  86         STRLEN len  = UNISKIP(uv);
  87         U8 *p = d+len-1;
  88         while (p > d) {
  89             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  90             uv >>= UTF_ACCUMULATION_SHIFT;
  91         }
  92         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  93         return d+len;
  94     }
  95 #else /* Non loop style */
  96     if (uv < 0x800) {
  97         *d++ = (( uv >>  6)         | 0xc0);
  98         *d++ = (( uv        & 0x3f) | 0x80);
  99         return d;
 100     }
 101     if (uv < 0x10000) {
 102         *d++ = (( uv >> 12)         | 0xe0);
 103         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 104         *d++ = (( uv        & 0x3f) | 0x80);
 105         return d;
 106     }
 107     if (uv < 0x200000) {
 108         *d++ = (( uv >> 18)         | 0xf0);
 109         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 110         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 111         *d++ = (( uv        & 0x3f) | 0x80);
 112         return d;
 113     }
 114     if (uv < 0x4000000) {
 115         *d++ = (( uv >> 24)         | 0xf8);
 116         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 117         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 118         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 119         *d++ = (( uv        & 0x3f) | 0x80);
 120         return d;
 121     }
 122     if (uv < 0x80000000) {
 123         *d++ = (( uv >> 30)         | 0xfc);
 124         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 125         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 126         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 127         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 128         *d++ = (( uv        & 0x3f) | 0x80);
 129         return d;
 130     }
 131 #ifdef HAS_QUAD
 132     if (uv < UTF8_QUAD_MAX)
 133 #endif
 134     {
 135         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 136         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 137         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 138         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 139         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 140         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 141         *d++ = (( uv        & 0x3f) | 0x80);
 142         return d;
 143     }
 144 #ifdef HAS_QUAD
 145     {
 146         *d++ =                        0xff;     /* Can't match U+FFFE! */
 147         *d++ =                        0x80;     /* 6 Reserved bits */
 148         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 149         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 150         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 151         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 152         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 153         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 154         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 155         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 156         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 157         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 158         *d++ = (( uv        & 0x3f) | 0x80);
 159         return d;
 160     }
 161 #endif
 162 #endif /* Loop style */
 163 }
 164
 165 U8 *
 166 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 167 {
 168     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 169 }
 170
 171
 172 /*
 173 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 174
 175 Tests if some arbitrary number of bytes begins in a valid UTF-8
 176 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 177 The actual number of bytes in the UTF-8 character will be returned if
 178 it is valid, otherwise 0.
 179
 180 =cut
 181 */
 182 STRLEN
 183 Perl_is_utf8_char(pTHX_ U8 *s)
 184 {
 185     U8 u = *s;
 186     STRLEN slen, len;
 187     UV uv, ouv;
 188
 189     if (UTF8_IS_INVARIANT(u))
 190         return 1;
 191
 192     if (!UTF8_IS_START(u))
 193         return 0;
 194
 195     len = UTF8SKIP(s);
 196
 197     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 198         return 0;
 199
 200     slen = len - 1;
 201     s++;
 202     u &= UTF_START_MASK(len);
 203     uv  = u;
 204     ouv = uv;
 205     while (slen--) {
 206         if (!UTF8_IS_CONTINUATION(*s))
 207             return 0;
 208         uv = UTF8_ACCUMULATE(uv, *s);
 209         if (uv < ouv)
 210             return 0;
 211         ouv = uv;
 212         s++;
 213     }
 214
 215     if (UNISKIP(uv) < len)
 216         return 0;
 217
 218     return len;
 219 }
 220
 221 /*
 222 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 223
 224 Returns true if first C<len> bytes of the given string form a valid UTF8
 225 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 226 'a string that contains UTF8' because a valid ASCII string is a valid
 227 UTF8 string.
 228
 229 =cut
 230 */
 231
 232 bool
 233 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 234 {
 235     U8* x = s;
 236     U8* send;
 237     STRLEN c;
 238
 239     if (!len)
 240         len = strlen((char *)s);
 241     send = s + len;
 242
 243     while (x < send) {
 244         c = is_utf8_char(x);
 245         if (!c)
 246             return FALSE;
 247         x += c;
 248     }
 249     if (x != send)
 250         return FALSE;
 251
 252     return TRUE;
 253 }
 254
 255 /*
 256 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 257
 258 Bottom level UTF-8 decode routine.
 259 Returns the unicode code point value of the first character in the string C<s>
 260 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 261 C<retlen> will be set to the length, in bytes, of that character.
 262
 263 If C<s> does not point to a well-formed UTF8 character, the behaviour
 264 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 265 it is assumed that the caller will raise a warning, and this function
 266 will silently just set C<retlen> to C<-1> and return zero.  If the
 267 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 268 malformations will be given, C<retlen> will be set to the expected
 269 length of the UTF-8 character in bytes, and zero will be returned.
 270
 271 The C<flags> can also contain various flags to allow deviations from
 272 the strict UTF-8 encoding (see F<utf8.h>).
 273
 274 Most code should use utf8_to_uvchr() rather than call this directly.
 275
 276 =cut
 277 */
 278
 279 UV
 280 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 281 {
 282     U8 *s0 = s;
 283     UV uv = *s, ouv = 0;
 284     STRLEN len = 1;
 285     bool dowarn = ckWARN_d(WARN_UTF8);
 286     UV startbyte = *s;
 287     STRLEN expectlen = 0;
 288     U32 warning = 0;
 289
 290 /* This list is a superset of the UTF8_ALLOW_XXX. */
 291
 292 #define UTF8_WARN_EMPTY                          1
 293 #define UTF8_WARN_CONTINUATION                   2
 294 #define UTF8_WARN_NON_CONTINUATION               3
 295 #define UTF8_WARN_FE_FF                          4
 296 #define UTF8_WARN_SHORT                          5
 297 #define UTF8_WARN_OVERFLOW                       6
 298 #define UTF8_WARN_SURROGATE                      7
 299 #define UTF8_WARN_BOM                            8
 300 #define UTF8_WARN_LONG                           9
 301 #define UTF8_WARN_FFFF                          10
 302
 303     if (curlen == 0 &&
 304         !(flags & UTF8_ALLOW_EMPTY)) {
 305         warning = UTF8_WARN_EMPTY;
 306         goto malformed;
 307     }
 308
 309     if (UTF8_IS_INVARIANT(uv)) {
 310         if (retlen)
 311             *retlen = 1;
 312         return (UV) (NATIVE_TO_UTF(*s));
 313     }
 314
 315     if (UTF8_IS_CONTINUATION(uv) &&
 316         !(flags & UTF8_ALLOW_CONTINUATION)) {
 317         warning = UTF8_WARN_CONTINUATION;
 318         goto malformed;
 319     }
 320
 321     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 322         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 323         warning = UTF8_WARN_NON_CONTINUATION;
 324         goto malformed;
 325     }
 326
 327 #ifdef EBCDIC
 328     uv = NATIVE_TO_UTF(uv);
 329 #else
 330     if ((uv == 0xfe || uv == 0xff) &&
 331         !(flags & UTF8_ALLOW_FE_FF)) {
 332         warning = UTF8_WARN_FE_FF;
 333         goto malformed;
 334     }
 335 #endif
 336
 337     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 338     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 339     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 340     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 341 #ifdef EBCDIC
 342     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 343     else                        { len =  7; uv &= 0x01; }
 344 #else
 345     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 346     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 347     else                        { len = 13; uv = 0; } /* whoa! */
 348 #endif
 349
 350     if (retlen)
 351         *retlen = len;
 352
 353     expectlen = len;
 354
 355     if ((curlen < expectlen) &&
 356         !(flags & UTF8_ALLOW_SHORT)) {
 357         warning = UTF8_WARN_SHORT;
 358         goto malformed;
 359     }
 360
 361     len--;
 362     s++;
 363     ouv = uv;
 364
 365     while (len--) {
 366         if (!UTF8_IS_CONTINUATION(*s) &&
 367             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 368             s--;
 369             warning = UTF8_WARN_NON_CONTINUATION;
 370             goto malformed;
 371         }
 372         else
 373             uv = UTF8_ACCUMULATE(uv, *s);
 374         if (!(uv > ouv)) {
 375             /* These cannot be allowed. */
 376             if (uv == ouv) {
 377                 if (!(flags & UTF8_ALLOW_LONG)) {
 378                     warning = UTF8_WARN_LONG;
 379                     goto malformed;
 380                 }
 381             }
 382             else { /* uv < ouv */
 383                 /* This cannot be allowed. */
 384                 warning = UTF8_WARN_OVERFLOW;
 385                 goto malformed;
 386             }
 387         }
 388         s++;
 389         ouv = uv;
 390     }
 391
 392     if (UNICODE_IS_SURROGATE(uv) &&
 393         !(flags & UTF8_ALLOW_SURROGATE)) {
 394         warning = UTF8_WARN_SURROGATE;
 395         goto malformed;
 396     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 397                !(flags & UTF8_ALLOW_BOM)) {
 398         warning = UTF8_WARN_BOM;
 399         goto malformed;
 400     } else if ((expectlen > UNISKIP(uv)) &&
 401                !(flags & UTF8_ALLOW_LONG)) {
 402         warning = UTF8_WARN_LONG;
 403         goto malformed;
 404     } else if (UNICODE_IS_ILLEGAL(uv) &&
 405                !(flags & UTF8_ALLOW_FFFF)) {
 406         warning = UTF8_WARN_FFFF;
 407         goto malformed;
 408     }
 409
 410     return uv;
 411
 412 malformed:
 413
 414     if (flags & UTF8_CHECK_ONLY) {
 415         if (retlen)
 416             *retlen = -1;
 417         return 0;
 418     }
 419
 420     if (dowarn) {
 421         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 422
 423         switch (warning) {
 424         case 0: /* Intentionally empty. */ break;
 425         case UTF8_WARN_EMPTY:
 426             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 427             break;
 428         case UTF8_WARN_CONTINUATION:
 429             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 430             break;
 431         case UTF8_WARN_NON_CONTINUATION:
 432             if (s == s0)
 433                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 434                            (UV)s[1], startbyte);
 435             else
 436                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 437                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 438
 439             break;
 440         case UTF8_WARN_FE_FF:
 441             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 442             break;
 443         case UTF8_WARN_SHORT:
 444             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 445                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 446             expectlen = curlen;         /* distance for caller to skip */
 447             break;
 448         case UTF8_WARN_OVERFLOW:
 449             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 450                            ouv, *s, startbyte);
 451             break;
 452         case UTF8_WARN_SURROGATE:
 453             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 454             break;
 455         case UTF8_WARN_BOM:
 456             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 457             break;
 458         case UTF8_WARN_LONG:
 459             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 460                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 461             break;
 462         case UTF8_WARN_FFFF:
 463             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 464             break;
 465         default:
 466             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 467             break;
 468         }
 469
 470         if (warning) {
 471             char *s = SvPVX(sv);
 472
 473             if (PL_op)
 474                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 475                             "%s in %s", s,  OP_DESC(PL_op));
 476             else
 477                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 478         }
 479     }
 480
 481     if (retlen)
 482         *retlen = expectlen ? expectlen : len;
 483
 484     return 0;
 485 }
 486
 487 /*
 488 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 489
 490 Returns the native character value of the first character in the string C<s>
 491 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 492 length, in bytes, of that character.
 493
 494 If C<s> does not point to a well-formed UTF8 character, zero is
 495 returned and retlen is set, if possible, to -1.
 496
 497 =cut
 498 */
 499
 500 UV
 501 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 502 {
 503     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
 504                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 505 }
 506
 507 /*
 508 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 509
 510 Returns the Unicode code point of the first character in the string C<s>
 511 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 512 length, in bytes, of that character.
 513
 514 This function should only be used when returned UV is considered
 515 an index into the Unicode semantic tables (e.g. swashes).
 516
 517 If C<s> does not point to a well-formed UTF8 character, zero is
 518 returned and retlen is set, if possible, to -1.
 519
 520 =cut
 521 */
 522
 523 UV
 524 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 525 {
 526     /* Call the low level routine asking for checks */
 527     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
 528                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 529 }
 530
 531 /*
 532 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 533
 534 Return the length of the UTF-8 char encoded string C<s> in characters.
 535 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 536 up past C<e>, croaks.
 537
 538 =cut
 539 */
 540
 541 STRLEN
 542 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 543 {
 544     STRLEN len = 0;
 545
 546     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 547      * the bitops (especially ~) can create illegal UTF-8.
 548      * In other words: in Perl UTF-8 is not just for Unicode. */
 549
 550     if (e < s) {
 551         if (ckWARN_d(WARN_UTF8)) {
 552             if (PL_op)
 553                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 554                             "%s in %s", unees, OP_DESC(PL_op));
 555             else
 556                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 557         }
 558         return 0;
 559     }
 560     while (s < e) {
 561         U8 t = UTF8SKIP(s);
 562
 563         if (e - s < t) {
 564             if (ckWARN_d(WARN_UTF8)) {
 565                 if (PL_op)
 566                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 567                                 unees, OP_DESC(PL_op));
 568                 else
 569                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 570             }
 571             return len;
 572         }
 573         s += t;
 574         len++;
 575     }
 576
 577     return len;
 578 }
 579
 580 /*
 581 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 582
 583 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 584 and C<b>.
 585
 586 WARNING: use only if you *know* that the pointers point inside the
 587 same UTF-8 buffer.
 588
 589 =cut
 590 */
 591
 592 IV
 593 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 594 {
 595     IV off = 0;
 596
 597     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 598      * the bitops (especially ~) can create illegal UTF-8.
 599      * In other words: in Perl UTF-8 is not just for Unicode. */
 600
 601     if (a < b) {
 602         while (a < b) {
 603             U8 c = UTF8SKIP(a);
 604
 605             if (b - a < c) {
 606                 if (ckWARN_d(WARN_UTF8)) {
 607                     if (PL_op)
 608                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 609                                     "%s in %s", unees, OP_DESC(PL_op));
 610                     else
 611                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 612                 }
 613                 return off;
 614             }
 615             a += c;
 616             off--;
 617         }
 618     }
 619     else {
 620         while (b < a) {
 621             U8 c = UTF8SKIP(b);
 622
 623             if (a - b < c) {
 624                 if (ckWARN_d(WARN_UTF8)) {
 625                     if (PL_op)
 626                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 627                                     "%s in %s", unees, OP_DESC(PL_op));
 628                     else
 629                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 630                 }
 631                 return off;
 632             }
 633             b += c;
 634             off++;
 635         }
 636     }
 637
 638     return off;
 639 }
 640
 641 /*
 642 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 643
 644 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 645 forward or backward.
 646
 647 WARNING: do not use the following unless you *know* C<off> is within
 648 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 649 on the first byte of character or just after the last byte of a character.
 650
 651 =cut
 652 */
 653
 654 U8 *
 655 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 656 {
 657     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 658      * the bitops (especially ~) can create illegal UTF-8.
 659      * In other words: in Perl UTF-8 is not just for Unicode. */
 660
 661     if (off >= 0) {
 662         while (off--)
 663             s += UTF8SKIP(s);
 664     }
 665     else {
 666         while (off++) {
 667             s--;
 668             while (UTF8_IS_CONTINUATION(*s))
 669                 s--;
 670         }
 671     }
 672     return s;
 673 }
 674
 675 /*
 676 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 677
 678 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 679 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 680 updates len to contain the new length.
 681 Returns zero on failure, setting C<len> to -1.
 682
 683 =cut
 684 */
 685
 686 U8 *
 687 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 688 {
 689     U8 *send;
 690     U8 *d;
 691     U8 *save = s;
 692
 693     /* ensure valid UTF8 and chars < 256 before updating string */
 694     for (send = s + *len; s < send; ) {
 695         U8 c = *s++;
 696
 697         if (!UTF8_IS_INVARIANT(c) &&
 698             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 699              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 700             *len = -1;
 701             return 0;
 702         }
 703     }
 704
 705     d = s = save;
 706     while (s < send) {
 707         STRLEN ulen;
 708         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 709         s += ulen;
 710     }
 711     *d = '\0';
 712     *len = d - save;
 713     return save;
 714 }
 715
 716 /*
 717 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 718
 719 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 720 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 721 the newly-created string, and updates C<len> to contain the new
 722 length.  Returns the original string if no conversion occurs, C<len>
 723 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 724 0 if C<s> is converted or contains all 7bit characters.
 725
 726 =cut
 727 */
 728
 729 U8 *
 730 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 731 {
 732     U8 *d;
 733     U8 *start = s;
 734     U8 *send;
 735     I32 count = 0;
 736
 737     if (!*is_utf8)
 738         return start;
 739
 740     /* ensure valid UTF8 and chars < 256 before converting string */
 741     for (send = s + *len; s < send;) {
 742         U8 c = *s++;
 743         if (!UTF8_IS_INVARIANT(c)) {
 744             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 745                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 746                 count++;
 747             else
 748                 return start;
 749         }
 750     }
 751
 752     *is_utf8 = 0;
 753
 754     Newz(801, d, (*len) - count + 1, U8);
 755     s = start; start = d;
 756     while (s < send) {
 757         U8 c = *s++;
 758         if (!UTF8_IS_INVARIANT(c)) {
 759             /* Then it is two-byte encoded */
 760             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 761             c = ASCII_TO_NATIVE(c);
 762         }
 763         *d++ = c;
 764     }
 765     *d = '\0';
 766     *len = d - start;
 767     return start;
 768 }
 769
 770 /*
 771 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 772
 773 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 774 Returns a pointer to the newly-created string, and sets C<len> to
 775 reflect the new length.
 776
 777 =cut
 778 */
 779
 780 U8*
 781 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 782 {
 783     U8 *send;
 784     U8 *d;
 785     U8 *dst;
 786     send = s + (*len);
 787
 788     Newz(801, d, (*len) * 2 + 1, U8);
 789     dst = d;
 790
 791     while (s < send) {
 792         UV uv = NATIVE_TO_ASCII(*s++);
 793         if (UNI_IS_INVARIANT(uv))
 794             *d++ = UTF_TO_NATIVE(uv);
 795         else {
 796             *d++ = UTF8_EIGHT_BIT_HI(uv);
 797             *d++ = UTF8_EIGHT_BIT_LO(uv);
 798         }
 799     }
 800     *d = '\0';
 801     *len = d-dst;
 802     return dst;
 803 }
 804
 805 /*
 806  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 807  *
 808  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 809  * We optimize for native, for obvious reasons. */
 810
 811 U8*
 812 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 813 {
 814     U8* pend;
 815     U8* dstart = d;
 816
 817     if (bytelen & 1)
 818         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 819
 820     pend = p + bytelen;
 821
 822     while (p < pend) {
 823         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 824         p += 2;
 825         if (uv < 0x80) {
 826             *d++ = uv;
 827             continue;
 828         }
 829         if (uv < 0x800) {
 830             *d++ = (( uv >>  6)         | 0xc0);
 831             *d++ = (( uv        & 0x3f) | 0x80);
 832             continue;
 833         }
 834         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 835             UV low = *p++;
 836             if (low < 0xdc00 || low >= 0xdfff)
 837                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 838             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 839         }
 840         if (uv < 0x10000) {
 841             *d++ = (( uv >> 12)         | 0xe0);
 842             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 843             *d++ = (( uv        & 0x3f) | 0x80);
 844             continue;
 845         }
 846         else {
 847             *d++ = (( uv >> 18)         | 0xf0);
 848             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 849             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 850             *d++ = (( uv        & 0x3f) | 0x80);
 851             continue;
 852         }
 853     }
 854     *newlen = d - dstart;
 855     return d;
 856 }
 857
 858 /* Note: this one is slightly destructive of the source. */
 859
 860 U8*
 861 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 862 {
 863     U8* s = (U8*)p;
 864     U8* send = s + bytelen;
 865     while (s < send) {
 866         U8 tmp = s[0];
 867         s[0] = s[1];
 868         s[1] = tmp;
 869         s += 2;
 870     }
 871     return utf16_to_utf8(p, d, bytelen, newlen);
 872 }
 873
 874 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 875
 876 bool
 877 Perl_is_uni_alnum(pTHX_ UV c)
 878 {
 879     U8 tmpbuf[UTF8_MAXLEN+1];
 880     uvchr_to_utf8(tmpbuf, c);
 881     return is_utf8_alnum(tmpbuf);
 882 }
 883
 884 bool
 885 Perl_is_uni_alnumc(pTHX_ UV c)
 886 {
 887     U8 tmpbuf[UTF8_MAXLEN+1];
 888     uvchr_to_utf8(tmpbuf, c);
 889     return is_utf8_alnumc(tmpbuf);
 890 }
 891
 892 bool
 893 Perl_is_uni_idfirst(pTHX_ UV c)
 894 {
 895     U8 tmpbuf[UTF8_MAXLEN+1];
 896     uvchr_to_utf8(tmpbuf, c);
 897     return is_utf8_idfirst(tmpbuf);
 898 }
 899
 900 bool
 901 Perl_is_uni_alpha(pTHX_ UV c)
 902 {
 903     U8 tmpbuf[UTF8_MAXLEN+1];
 904     uvchr_to_utf8(tmpbuf, c);
 905     return is_utf8_alpha(tmpbuf);
 906 }
 907
 908 bool
 909 Perl_is_uni_ascii(pTHX_ UV c)
 910 {
 911     U8 tmpbuf[UTF8_MAXLEN+1];
 912     uvchr_to_utf8(tmpbuf, c);
 913     return is_utf8_ascii(tmpbuf);
 914 }
 915
 916 bool
 917 Perl_is_uni_space(pTHX_ UV c)
 918 {
 919     U8 tmpbuf[UTF8_MAXLEN+1];
 920     uvchr_to_utf8(tmpbuf, c);
 921     return is_utf8_space(tmpbuf);
 922 }
 923
 924 bool
 925 Perl_is_uni_digit(pTHX_ UV c)
 926 {
 927     U8 tmpbuf[UTF8_MAXLEN+1];
 928     uvchr_to_utf8(tmpbuf, c);
 929     return is_utf8_digit(tmpbuf);
 930 }
 931
 932 bool
 933 Perl_is_uni_upper(pTHX_ UV c)
 934 {
 935     U8 tmpbuf[UTF8_MAXLEN+1];
 936     uvchr_to_utf8(tmpbuf, c);
 937     return is_utf8_upper(tmpbuf);
 938 }
 939
 940 bool
 941 Perl_is_uni_lower(pTHX_ UV c)
 942 {
 943     U8 tmpbuf[UTF8_MAXLEN+1];
 944     uvchr_to_utf8(tmpbuf, c);
 945     return is_utf8_lower(tmpbuf);
 946 }
 947
 948 bool
 949 Perl_is_uni_cntrl(pTHX_ UV c)
 950 {
 951     U8 tmpbuf[UTF8_MAXLEN+1];
 952     uvchr_to_utf8(tmpbuf, c);
 953     return is_utf8_cntrl(tmpbuf);
 954 }
 955
 956 bool
 957 Perl_is_uni_graph(pTHX_ UV c)
 958 {
 959     U8 tmpbuf[UTF8_MAXLEN+1];
 960     uvchr_to_utf8(tmpbuf, c);
 961     return is_utf8_graph(tmpbuf);
 962 }
 963
 964 bool
 965 Perl_is_uni_print(pTHX_ UV c)
 966 {
 967     U8 tmpbuf[UTF8_MAXLEN+1];
 968     uvchr_to_utf8(tmpbuf, c);
 969     return is_utf8_print(tmpbuf);
 970 }
 971
 972 bool
 973 Perl_is_uni_punct(pTHX_ UV c)
 974 {
 975     U8 tmpbuf[UTF8_MAXLEN+1];
 976     uvchr_to_utf8(tmpbuf, c);
 977     return is_utf8_punct(tmpbuf);
 978 }
 979
 980 bool
 981 Perl_is_uni_xdigit(pTHX_ UV c)
 982 {
 983     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 984     uvchr_to_utf8(tmpbuf, c);
 985     return is_utf8_xdigit(tmpbuf);
 986 }
 987
 988 UV
 989 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 990 {
 991     uvchr_to_utf8(p, c);
 992     return to_utf8_upper(p, p, lenp);
 993 }
 994
 995 UV
 996 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 997 {
 998     uvchr_to_utf8(p, c);
 999     return to_utf8_title(p, p, lenp);
1000 }
1001
1002 UV
1003 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1004 {
1005     uvchr_to_utf8(p, c);
1006     return to_utf8_lower(p, p, lenp);
1007 }
1008
1009 UV
1010 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1011 {
1012     uvchr_to_utf8(p, c);
1013     return to_utf8_fold(p, p, lenp);
1014 }
1015
1016 /* for now these all assume no locale info available for Unicode > 255 */
1017
1018 bool
1019 Perl_is_uni_alnum_lc(pTHX_ UV c)
1020 {
1021     return is_uni_alnum(c);     /* XXX no locale support yet */
1022 }
1023
1024 bool
1025 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1026 {
1027     return is_uni_alnumc(c);    /* XXX no locale support yet */
1028 }
1029
1030 bool
1031 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1032 {
1033     return is_uni_idfirst(c);   /* XXX no locale support yet */
1034 }
1035
1036 bool
1037 Perl_is_uni_alpha_lc(pTHX_ UV c)
1038 {
1039     return is_uni_alpha(c);     /* XXX no locale support yet */
1040 }
1041
1042 bool
1043 Perl_is_uni_ascii_lc(pTHX_ UV c)
1044 {
1045     return is_uni_ascii(c);     /* XXX no locale support yet */
1046 }
1047
1048 bool
1049 Perl_is_uni_space_lc(pTHX_ UV c)
1050 {
1051     return is_uni_space(c);     /* XXX no locale support yet */
1052 }
1053
1054 bool
1055 Perl_is_uni_digit_lc(pTHX_ UV c)
1056 {
1057     return is_uni_digit(c);     /* XXX no locale support yet */
1058 }
1059
1060 bool
1061 Perl_is_uni_upper_lc(pTHX_ UV c)
1062 {
1063     return is_uni_upper(c);     /* XXX no locale support yet */
1064 }
1065
1066 bool
1067 Perl_is_uni_lower_lc(pTHX_ UV c)
1068 {
1069     return is_uni_lower(c);     /* XXX no locale support yet */
1070 }
1071
1072 bool
1073 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1074 {
1075     return is_uni_cntrl(c);     /* XXX no locale support yet */
1076 }
1077
1078 bool
1079 Perl_is_uni_graph_lc(pTHX_ UV c)
1080 {
1081     return is_uni_graph(c);     /* XXX no locale support yet */
1082 }
1083
1084 bool
1085 Perl_is_uni_print_lc(pTHX_ UV c)
1086 {
1087     return is_uni_print(c);     /* XXX no locale support yet */
1088 }
1089
1090 bool
1091 Perl_is_uni_punct_lc(pTHX_ UV c)
1092 {
1093     return is_uni_punct(c);     /* XXX no locale support yet */
1094 }
1095
1096 bool
1097 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1098 {
1099     return is_uni_xdigit(c);    /* XXX no locale support yet */
1100 }
1101
1102 U32
1103 Perl_to_uni_upper_lc(pTHX_ U32 c)
1104 {
1105     /* XXX returns only the first character -- do not use XXX */
1106     /* XXX no locale support yet */
1107     STRLEN len;
1108     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1109     return (U32)to_uni_upper(c, tmpbuf, &len);
1110 }
1111
1112 U32
1113 Perl_to_uni_title_lc(pTHX_ U32 c)
1114 {
1115     /* XXX returns only the first character XXX -- do not use XXX */
1116     /* XXX no locale support yet */
1117     STRLEN len;
1118     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1119     return (U32)to_uni_title(c, tmpbuf, &len);
1120 }
1121
1122 U32
1123 Perl_to_uni_lower_lc(pTHX_ U32 c)
1124 {
1125     /* XXX returns only the first character -- do not use XXX */
1126     /* XXX no locale support yet */
1127     STRLEN len;
1128     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1129     return (U32)to_uni_lower(c, tmpbuf, &len);
1130 }
1131
1132 bool
1133 Perl_is_utf8_alnum(pTHX_ U8 *p)
1134 {
1135     if (!is_utf8_char(p))
1136         return FALSE;
1137     if (!PL_utf8_alnum)
1138         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1139          * descendant of isalnum(3), in other words, it doesn't
1140          * contain the '_'. --jhi */
1141         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1142     return swash_fetch(PL_utf8_alnum, p, TRUE);
1143 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1144 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1145     if (!PL_utf8_alnum)
1146         PL_utf8_alnum = swash_init("utf8", "",
1147             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1148     return swash_fetch(PL_utf8_alnum, p, TRUE);
1149 #endif
1150 }
1151
1152 bool
1153 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1154 {
1155     if (!is_utf8_char(p))
1156         return FALSE;
1157     if (!PL_utf8_alnum)
1158         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1159     return swash_fetch(PL_utf8_alnum, p, TRUE);
1160 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1161 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1162     if (!PL_utf8_alnum)
1163         PL_utf8_alnum = swash_init("utf8", "",
1164             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1165     return swash_fetch(PL_utf8_alnum, p, TRUE);
1166 #endif
1167 }
1168
1169 bool
1170 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1171 {
1172     return *p == '_' || is_utf8_alpha(p);
1173 }
1174
1175 bool
1176 Perl_is_utf8_alpha(pTHX_ U8 *p)
1177 {
1178     if (!is_utf8_char(p))
1179         return FALSE;
1180     if (!PL_utf8_alpha)
1181         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1182     return swash_fetch(PL_utf8_alpha, p, TRUE);
1183 }
1184
1185 bool
1186 Perl_is_utf8_ascii(pTHX_ U8 *p)
1187 {
1188     if (!is_utf8_char(p))
1189         return FALSE;
1190     if (!PL_utf8_ascii)
1191         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1192     return swash_fetch(PL_utf8_ascii, p, TRUE);
1193 }
1194
1195 bool
1196 Perl_is_utf8_space(pTHX_ U8 *p)
1197 {
1198     if (!is_utf8_char(p))
1199         return FALSE;
1200     if (!PL_utf8_space)
1201         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1202     return swash_fetch(PL_utf8_space, p, TRUE);
1203 }
1204
1205 bool
1206 Perl_is_utf8_digit(pTHX_ U8 *p)
1207 {
1208     if (!is_utf8_char(p))
1209         return FALSE;
1210     if (!PL_utf8_digit)
1211         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1212     return swash_fetch(PL_utf8_digit, p, TRUE);
1213 }
1214
1215 bool
1216 Perl_is_utf8_upper(pTHX_ U8 *p)
1217 {
1218     if (!is_utf8_char(p))
1219         return FALSE;
1220     if (!PL_utf8_upper)
1221         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1222     return swash_fetch(PL_utf8_upper, p, TRUE);
1223 }
1224
1225 bool
1226 Perl_is_utf8_lower(pTHX_ U8 *p)
1227 {
1228     if (!is_utf8_char(p))
1229         return FALSE;
1230     if (!PL_utf8_lower)
1231         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1232     return swash_fetch(PL_utf8_lower, p, TRUE);
1233 }
1234
1235 bool
1236 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1237 {
1238     if (!is_utf8_char(p))
1239         return FALSE;
1240     if (!PL_utf8_cntrl)
1241         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1242     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1243 }
1244
1245 bool
1246 Perl_is_utf8_graph(pTHX_ U8 *p)
1247 {
1248     if (!is_utf8_char(p))
1249         return FALSE;
1250     if (!PL_utf8_graph)
1251         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1252     return swash_fetch(PL_utf8_graph, p, TRUE);
1253 }
1254
1255 bool
1256 Perl_is_utf8_print(pTHX_ U8 *p)
1257 {
1258     if (!is_utf8_char(p))
1259         return FALSE;
1260     if (!PL_utf8_print)
1261         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1262     return swash_fetch(PL_utf8_print, p, TRUE);
1263 }
1264
1265 bool
1266 Perl_is_utf8_punct(pTHX_ U8 *p)
1267 {
1268     if (!is_utf8_char(p))
1269         return FALSE;
1270     if (!PL_utf8_punct)
1271         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1272     return swash_fetch(PL_utf8_punct, p, TRUE);
1273 }
1274
1275 bool
1276 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1277 {
1278     if (!is_utf8_char(p))
1279         return FALSE;
1280     if (!PL_utf8_xdigit)
1281         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1282     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1283 }
1284
1285 bool
1286 Perl_is_utf8_mark(pTHX_ U8 *p)
1287 {
1288     if (!is_utf8_char(p))
1289         return FALSE;
1290     if (!PL_utf8_mark)
1291         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1292     return swash_fetch(PL_utf8_mark, p, TRUE);
1293 }
1294
1295 /*
1296 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1297
1298 The "p" contains the pointer to the UTF-8 string encoding
1299 the character that is being converted.
1300
1301 The "ustrp" is a pointer to the character buffer to put the
1302 conversion result to.  The "lenp" is a pointer to the length
1303 of the result.
1304
1305 The "swashp" is a pointer to the swash to use.
1306
1307 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1308 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1309 but not always, a multicharacter mapping), is tried first.
1310
1311 The "special" is a string like "utf8::ToSpecLower", which means the
1312 hash %utf8::ToSpecLower.  The access to the hash is through
1313 Perl_to_utf8_case().
1314
1315 The "normal" is a string like "ToLower" which means the swash
1316 %utf8::ToLower.
1317
1318 =cut */
1319
1320 UV
1321 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1322 {
1323     UV uv0, uv1;
1324     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1325     STRLEN len = 0;
1326
1327     uv0 = utf8_to_uvchr(p, 0);
1328     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1329      * are necessary in EBCDIC, they are redundant no-ops
1330      * in ASCII-ish platforms, and hopefully optimized away. */
1331     uv1 = NATIVE_TO_UNI(uv0);
1332     uvuni_to_utf8(tmpbuf, uv1);
1333
1334     if (!*swashp) /* load on-demand */
1335          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1336
1337     if (special) {
1338          /* It might be "special" (sometimes, but not always,
1339           * a multicharacter mapping) */
1340          HV *hv;
1341          SV *keysv;
1342          HE *he;
1343          SV *val;
1344
1345          if ((hv    = get_hv(special, FALSE)) &&
1346              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1347              (he    = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1348              (val   = HeVAL(he))) {
1349              char *s;
1350
1351               s = SvPV(val, len);
1352               if (len == 1)
1353                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1354               else {
1355 #ifdef EBCDIC
1356                    /* If we have EBCDIC we need to remap the characters
1357                     * since any characters in the low 256 are Unicode
1358                     * code points, not EBCDIC. */
1359                    U8 *t = (U8*)s, *tend = t + len, *d;
1360
1361                    d = tmpbuf;
1362                    if (SvUTF8(val)) {
1363                         STRLEN tlen = 0;
1364
1365                         while (t < tend) {
1366                              UV c = utf8_to_uvchr(t, &tlen);
1367                              if (tlen > 0) {
1368                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1369                                   t += tlen;
1370                              }
1371                              else
1372                                   break;
1373                         }
1374                    }
1375                    else {
1376                         while (t < tend) {
1377                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1378                              t++;
1379                         }
1380                    }
1381                    len = d - tmpbuf;
1382                    Copy(tmpbuf, ustrp, len, U8);
1383 #else
1384                    Copy(s, ustrp, len, U8);
1385 #endif
1386               }
1387          }
1388     }
1389
1390     if (!len && *swashp) {
1391          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1392
1393          if (uv2) {
1394               /* It was "normal" (a single character mapping). */
1395               UV uv3 = UNI_TO_NATIVE(uv2);
1396
1397               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1398          }
1399     }
1400
1401     if (!len) /* Neither: just copy. */
1402          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1403
1404     if (lenp)
1405          *lenp = len;
1406
1407     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1408 }
1409
1410 /*
1411 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1412
1413 Convert the UTF-8 encoded character at p to its uppercase version and
1414 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1415 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1416 uppercase version may be longer than the original character (up to two
1417 characters).
1418
1419 The first character of the uppercased version is returned
1420 (but note, as explained above, that there may be more.)
1421
1422 =cut */
1423
1424 UV
1425 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1426 {
1427     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1428                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1429 }
1430
1431 /*
1432 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1433
1434 Convert the UTF-8 encoded character at p to its titlecase version and
1435 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1436 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1437 titlecase version may be longer than the original character (up to two
1438 characters).
1439
1440 The first character of the titlecased version is returned
1441 (but note, as explained above, that there may be more.)
1442
1443 =cut */
1444
1445 UV
1446 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1447 {
1448     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1449                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1450 }
1451
1452 /*
1453 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1454
1455 Convert the UTF-8 encoded character at p to its lowercase version and
1456 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1457 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1458 lowercase version may be longer than the original character (up to two
1459 characters).
1460
1461 The first character of the lowercased version is returned
1462 (but note, as explained above, that there may be more.)
1463
1464 =cut */
1465
1466 UV
1467 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1468 {
1469     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1470                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1471 }
1472
1473 /*
1474 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1475
1476 Convert the UTF-8 encoded character at p to its foldcase version and
1477 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1478 that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1479 foldcase version may be longer than the original character (up to
1480 three characters).
1481
1482 The first character of the foldcased version is returned
1483 (but note, as explained above, that there may be more.)
1484
1485 =cut */
1486
1487 UV
1488 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1489 {
1490     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1491                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1492 }
1493
1494 /* a "swash" is a swatch hash */
1495
1496 SV*
1497 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1498 {
1499     SV* retval;
1500     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1501     dSP;
1502     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1503     SV* errsv_save;
1504
1505     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1506         ENTER;
1507         errsv_save = newSVsv(ERRSV);
1508         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1509         if (!SvTRUE(ERRSV))
1510             sv_setsv(ERRSV, errsv_save);
1511         SvREFCNT_dec(errsv_save);
1512         LEAVE;
1513     }
1514     SPAGAIN;
1515     PUSHSTACKi(PERLSI_MAGIC);
1516     PUSHMARK(SP);
1517     EXTEND(SP,5);
1518     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1519     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1520     PUSHs(listsv);
1521     PUSHs(sv_2mortal(newSViv(minbits)));
1522     PUSHs(sv_2mortal(newSViv(none)));
1523     PUTBACK;
1524     ENTER;
1525     SAVEI32(PL_hints);
1526     PL_hints = 0;
1527     save_re_context();
1528     if (PL_curcop == &PL_compiling)
1529         /* XXX ought to be handled by lex_start */
1530         sv_setpv(tokenbufsv, PL_tokenbuf);
1531     errsv_save = newSVsv(ERRSV);
1532     if (call_method("SWASHNEW", G_SCALAR))
1533         retval = newSVsv(*PL_stack_sp--);
1534     else
1535         retval = &PL_sv_undef;
1536     if (!SvTRUE(ERRSV))
1537         sv_setsv(ERRSV, errsv_save);
1538     SvREFCNT_dec(errsv_save);
1539     LEAVE;
1540     POPSTACK;
1541     if (PL_curcop == &PL_compiling) {
1542         STRLEN len;
1543         char* pv = SvPV(tokenbufsv, len);
1544
1545         Copy(pv, PL_tokenbuf, len+1, char);
1546         PL_curcop->op_private = PL_hints;
1547     }
1548     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1549         if (SvPOK(retval))
1550             Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1551                        SvPV_nolen(retval));
1552         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1553     }
1554     return retval;
1555 }
1556
1557
1558 /* This API is wrong for special case conversions since we may need to
1559  * return several Unicode characters for a single Unicode character
1560  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1561  * the lower-level routine, and it is similarly broken for returning
1562  * multiple values.  --jhi */
1563 UV
1564 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1565 {
1566     HV* hv = (HV*)SvRV(sv);
1567     U32 klen;
1568     U32 off;
1569     STRLEN slen;
1570     STRLEN needents;
1571     U8 *tmps = NULL;
1572     U32 bit;
1573     SV *retval;
1574     U8 tmputf8[2];
1575     UV c = NATIVE_TO_ASCII(*ptr);
1576
1577     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1578         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1579         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1580         ptr = tmputf8;
1581     }
1582     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1583      * then the "swatch" is a vec() for al the chars which start
1584      * with 0xAA..0xYY
1585      * So the key in the hash (klen) is length of encoded char -1
1586      */
1587     klen = UTF8SKIP(ptr) - 1;
1588     off  = ptr[klen];
1589
1590     if (klen == 0)
1591      {
1592       /* If char in invariant then swatch is for all the invariant chars
1593        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1594        */
1595       needents = UTF_CONTINUATION_MARK;
1596       off      = NATIVE_TO_UTF(ptr[klen]);
1597      }
1598     else
1599      {
1600       /* If char is encoded then swatch is for the prefix */
1601       needents = (1 << UTF_ACCUMULATION_SHIFT);
1602       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1603      }
1604
1605     /*
1606      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1607      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1608      * it's nothing to sniff at.)  Pity we usually come through at least
1609      * two function calls to get here...
1610      *
1611      * NB: this code assumes that swatches are never modified, once generated!
1612      */
1613
1614     if (hv   == PL_last_swash_hv &&
1615         klen == PL_last_swash_klen &&
1616         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1617     {
1618         tmps = PL_last_swash_tmps;
1619         slen = PL_last_swash_slen;
1620     }
1621     else {
1622         /* Try our second-level swatch cache, kept in a hash. */
1623         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1624
1625         /* If not cached, generate it via utf8::SWASHGET */
1626         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1627             dSP;
1628             /* We use utf8n_to_uvuni() as we want an index into
1629                Unicode tables, not a native character number.
1630              */
1631             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1632                                            ckWARN(WARN_UTF8) ?
1633                                            0 : UTF8_ALLOW_ANY);
1634             SV *errsv_save;
1635             ENTER;
1636             SAVETMPS;
1637             save_re_context();
1638             PUSHSTACKi(PERLSI_MAGIC);
1639             PUSHMARK(SP);
1640             EXTEND(SP,3);
1641             PUSHs((SV*)sv);
1642             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1643             PUSHs(sv_2mortal(newSViv((klen) ?
1644                                      (code_point & ~(needents - 1)) : 0)));
1645             PUSHs(sv_2mortal(newSViv(needents)));
1646             PUTBACK;
1647             errsv_save = newSVsv(ERRSV);
1648             if (call_method("SWASHGET", G_SCALAR))
1649                 retval = newSVsv(*PL_stack_sp--);
1650             else
1651                 retval = &PL_sv_undef;
1652             if (!SvTRUE(ERRSV))
1653                 sv_setsv(ERRSV, errsv_save);
1654             SvREFCNT_dec(errsv_save);
1655             POPSTACK;
1656             FREETMPS;
1657             LEAVE;
1658             if (PL_curcop == &PL_compiling)
1659                 PL_curcop->op_private = PL_hints;
1660
1661             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1662
1663             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1664                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1665         }
1666
1667         PL_last_swash_hv = hv;
1668         PL_last_swash_klen = klen;
1669         PL_last_swash_tmps = tmps;
1670         PL_last_swash_slen = slen;
1671         if (klen)
1672             Copy(ptr, PL_last_swash_key, klen, U8);
1673     }
1674
1675     switch ((int)((slen << 3) / needents)) {
1676     case 1:
1677         bit = 1 << (off & 7);
1678         off >>= 3;
1679         return (tmps[off] & bit) != 0;
1680     case 8:
1681         return tmps[off];
1682     case 16:
1683         off <<= 1;
1684         return (tmps[off] << 8) + tmps[off + 1] ;
1685     case 32:
1686         off <<= 2;
1687         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1688     }
1689     Perl_croak(aTHX_ "panic: swash_fetch");
1690     return 0;
1691 }
1692
1693
1694 /*
1695 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1696
1697 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1698 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1699 bytes available. The return value is the pointer to the byte after the
1700 end of the new character. In other words,
1701
1702     d = uvchr_to_utf8(d, uv);
1703
1704 is the recommended wide native character-aware way of saying
1705
1706     *(d++) = uv;
1707
1708 =cut
1709 */
1710
1711 /* On ASCII machines this is normally a macro but we want a
1712    real function in case XS code wants it
1713 */
1714 #undef Perl_uvchr_to_utf8
1715 U8 *
1716 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1717 {
1718     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1719 }
1720
1721 U8 *
1722 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1723 {
1724     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1725 }
1726
1727 /*
1728 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1729
1730 Returns the native character value of the first character in the string C<s>
1731 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1732 length, in bytes, of that character.
1733
1734 Allows length and flags to be passed to low level routine.
1735
1736 =cut
1737 */
1738 /* On ASCII machines this is normally a macro but we want
1739    a real function in case XS code wants it
1740 */
1741 #undef Perl_utf8n_to_uvchr
1742 UV
1743 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1744 {
1745     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1746     return UNI_TO_NATIVE(uv);
1747 }
1748
1749 /*
1750 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1751
1752 Build to the scalar dsv a displayable version of the string spv,
1753 length len, the displayable version being at most pvlim bytes long
1754 (if longer, the rest is truncated and "..." will be appended).
1755
1756 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1757 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1758 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1759 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1760 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1761 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1762
1763 The pointer to the PV of the dsv is returned.
1764
1765 =cut */
1766 char *
1767 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1768 {
1769     int truncated = 0;
1770     char *s, *e;
1771
1772     sv_setpvn(dsv, "", 0);
1773     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1774          UV u;
1775          bool ok = FALSE;
1776
1777          if (pvlim && SvCUR(dsv) >= pvlim) {
1778               truncated++;
1779               break;
1780          }
1781          u = utf8_to_uvchr((U8*)s, 0);
1782          if (u < 256) {
1783              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1784                  switch (u & 0xFF) {
1785                  case '\n':
1786                      Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1787                  case '\r':
1788                      Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1789                  case '\t':
1790                      Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1791                  case '\f':
1792                      Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1793                  case '\a':
1794                      Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1795                  case '\\':
1796                      Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1797                  default: break;
1798                  }
1799              }
1800              /* isPRINT() is the locale-blind version. */
1801              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1802                  Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1803                  ok = TRUE;
1804              }
1805          }
1806          if (!ok)
1807              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1808     }
1809     if (truncated)
1810          sv_catpvn(dsv, "...", 3);
1811
1812     return SvPVX(dsv);
1813 }
1814
1815 /*
1816 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1817
1818 Build to the scalar dsv a displayable version of the scalar sv,
1819 the displayable version being at most pvlim bytes long
1820 (if longer, the rest is truncated and "..." will be appended).
1821
1822 The flags argument is as in pv_uni_display().
1823
1824 The pointer to the PV of the dsv is returned.
1825
1826 =cut */
1827 char *
1828 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1829 {
1830      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1831                                 pvlim, flags);
1832 }
1833
1834 /*
1835 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1836
1837 Return true if the strings s1 and s2 differ case-insensitively, false
1838 if not (if they are equal case-insensitively).  If u1 is true, the
1839 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1840 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1841 are false, the respective string is assumed to be in native 8-bit
1842 encoding.
1843
1844 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1845 in there (they will point at the beginning of the I<next> character).
1846 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1847 pointers beyond which scanning will not continue under any
1848 circustances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1849 s2+l2 will be used as goal end pointers that will also stop the scan,
1850 and which qualify towards defining a successful match: all the scans
1851 that define an explicit length must reach their goal pointers for
1852 a match to succeed).
1853
1854 For case-insensitiveness, the "casefolding" of Unicode is used
1855 instead of upper/lowercasing both the characters, see
1856 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1857
1858 =cut */
1859 I32
1860 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1861 {
1862      register U8 *p1  = (U8*)s1;
1863      register U8 *p2  = (U8*)s2;
1864      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1865      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1866      STRLEN n1 = 0, n2 = 0;
1867      U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1868      U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1869      U8 natbuf[1+1];
1870      STRLEN foldlen1, foldlen2;
1871      bool match;
1872
1873      if (pe1)
1874           e1 = *(U8**)pe1;
1875      if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1876           f1 = (U8*)s1 + l1;
1877      if (pe2)
1878           e2 = *(U8**)pe2;
1879      if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1880           f2 = (U8*)s2 + l2;
1881
1882      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1883           return 1; /* mismatch; possible infinite loop or false positive */
1884
1885      if (!u1 || !u2)
1886           natbuf[1] = 0; /* Need to terminate the buffer. */
1887
1888      while ((e1 == 0 || p1 < e1) &&
1889             (f1 == 0 || p1 < f1) &&
1890             (e2 == 0 || p2 < e2) &&
1891             (f2 == 0 || p2 < f2)) {
1892           if (n1 == 0) {
1893                if (u1)
1894                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1895                else {
1896                     natbuf[0] = *p1;
1897                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1898                }
1899                q1 = foldbuf1;
1900                n1 = foldlen1;
1901           }
1902           if (n2 == 0) {
1903                if (u2)
1904                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1905                else {
1906                     natbuf[0] = *p2;
1907                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1908                }
1909                q2 = foldbuf2;
1910                n2 = foldlen2;
1911           }
1912           while (n1 && n2) {
1913                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1914                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1915                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1916                    return 1; /* mismatch */
1917                n1 -= UTF8SKIP(q1);
1918                q1 += UTF8SKIP(q1);
1919                n2 -= UTF8SKIP(q2);
1920                q2 += UTF8SKIP(q2);
1921           }
1922           if (n1 == 0)
1923                p1 += u1 ? UTF8SKIP(p1) : 1;
1924           if (n2 == 0)
1925                p2 += u2 ? UTF8SKIP(p2) : 1;
1926
1927      }
1928
1929      /* A match is defined by all the scans that specified
1930       * an explicit length reaching their final goals. */
1931      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1932
1933      if (match) {
1934           if (pe1)
1935                *pe1 = (char*)p1;
1936           if (pe2)
1937                *pe2 = (char*)p2;
1938      }
1939
1940      return match ? 0 : 1; /* 0 match, 1 mismatch */
1941 }
1942