utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2002, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
  28
  29 /*
  30 =head1 Unicode Support
  31
  32 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  33
  34 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  35 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  36 bytes available. The return value is the pointer to the byte after the
  37 end of the new character. In other words,
  38
  39     d = uvuni_to_utf8_flags(d, uv, flags);
  40
  41 or, in most cases,
  42
  43     d = uvuni_to_utf8(d, uv);
  44
  45 (which is equivalent to)
  46
  47     d = uvuni_to_utf8_flags(d, uv, 0);
  48
  49 is the recommended Unicode-aware way of saying
  50
  51     *(d++) = uv;
  52
  53 =cut
  54 */
  55
  56 U8 *
  57 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  58 {
  59     if (ckWARN(WARN_UTF8)) {
  60          if (UNICODE_IS_SURROGATE(uv) &&
  61              !(flags & UNICODE_ALLOW_SURROGATE))
  62               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  63          else if (
  64                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  65                     !(flags & UNICODE_ALLOW_FDD0))
  66                    ||
  67                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  68                     !(flags & UNICODE_ALLOW_FFFF))) &&
  69                   /* UNICODE_ALLOW_SUPER includes
  70                    * FFFFs beyond 0x10FFFF. */
  71                   ((uv <= PERL_UNICODE_MAX) ||
  72                    !(flags & UNICODE_ALLOW_SUPER))
  73                   )
  74               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  75                          "Unicode character 0x%04"UVxf" is illegal", uv);
  76     }
  77     if (UNI_IS_INVARIANT(uv)) {
  78         *d++ = UTF_TO_NATIVE(uv);
  79         return d;
  80     }
  81 #if defined(EBCDIC)
  82     else {
  83         STRLEN len  = UNISKIP(uv);
  84         U8 *p = d+len-1;
  85         while (p > d) {
  86             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  87             uv >>= UTF_ACCUMULATION_SHIFT;
  88         }
  89         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  90         return d+len;
  91     }
  92 #else /* Non loop style */
  93     if (uv < 0x800) {
  94         *d++ = (( uv >>  6)         | 0xc0);
  95         *d++ = (( uv        & 0x3f) | 0x80);
  96         return d;
  97     }
  98     if (uv < 0x10000) {
  99         *d++ = (( uv >> 12)         | 0xe0);
 100         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 101         *d++ = (( uv        & 0x3f) | 0x80);
 102         return d;
 103     }
 104     if (uv < 0x200000) {
 105         *d++ = (( uv >> 18)         | 0xf0);
 106         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 107         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 108         *d++ = (( uv        & 0x3f) | 0x80);
 109         return d;
 110     }
 111     if (uv < 0x4000000) {
 112         *d++ = (( uv >> 24)         | 0xf8);
 113         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 114         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x80000000) {
 120         *d++ = (( uv >> 30)         | 0xfc);
 121         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 122         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 123         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 124         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 125         *d++ = (( uv        & 0x3f) | 0x80);
 126         return d;
 127     }
 128 #ifdef HAS_QUAD
 129     if (uv < UTF8_QUAD_MAX)
 130 #endif
 131     {
 132         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 133         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 134         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 135         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 136         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 137         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 138         *d++ = (( uv        & 0x3f) | 0x80);
 139         return d;
 140     }
 141 #ifdef HAS_QUAD
 142     {
 143         *d++ =                        0xff;     /* Can't match U+FFFE! */
 144         *d++ =                        0x80;     /* 6 Reserved bits */
 145         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 146         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 147         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 148         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 149         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 150         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 151         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 152         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 153         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 154         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 155         *d++ = (( uv        & 0x3f) | 0x80);
 156         return d;
 157     }
 158 #endif
 159 #endif /* Loop style */
 160 }
 161
 162 U8 *
 163 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 164 {
 165     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 166 }
 167
 168
 169 /*
 170 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 174 The actual number of bytes in the UTF-8 character will be returned if
 175 it is valid, otherwise 0.
 176
 177 =cut
 178 */
 179 STRLEN
 180 Perl_is_utf8_char(pTHX_ U8 *s)
 181 {
 182     U8 u = *s;
 183     STRLEN slen, len;
 184     UV uv, ouv;
 185
 186     if (UTF8_IS_INVARIANT(u))
 187         return 1;
 188
 189     if (!UTF8_IS_START(u))
 190         return 0;
 191
 192     len = UTF8SKIP(s);
 193
 194     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 195         return 0;
 196
 197     slen = len - 1;
 198     s++;
 199     u &= UTF_START_MASK(len);
 200     uv  = u;
 201     ouv = uv;
 202     while (slen--) {
 203         if (!UTF8_IS_CONTINUATION(*s))
 204             return 0;
 205         uv = UTF8_ACCUMULATE(uv, *s);
 206         if (uv < ouv)
 207             return 0;
 208         ouv = uv;
 209         s++;
 210     }
 211
 212     if (UNISKIP(uv) < len)
 213         return 0;
 214
 215     return len;
 216 }
 217
 218 /*
 219 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 220
 221 Returns true if first C<len> bytes of the given string form a valid UTF8
 222 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 223 'a string that contains UTF8' because a valid ASCII string is a valid
 224 UTF8 string.
 225
 226 =cut
 227 */
 228
 229 bool
 230 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 231 {
 232     U8* x = s;
 233     U8* send;
 234     STRLEN c;
 235
 236     if (!len)
 237         len = strlen((char *)s);
 238     send = s + len;
 239
 240     while (x < send) {
 241         c = is_utf8_char(x);
 242         if (!c)
 243             return FALSE;
 244         x += c;
 245     }
 246     if (x != send)
 247         return FALSE;
 248
 249     return TRUE;
 250 }
 251
 252 /*
 253 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 254
 255 Bottom level UTF-8 decode routine.
 256 Returns the unicode code point value of the first character in the string C<s>
 257 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 258 C<retlen> will be set to the length, in bytes, of that character.
 259
 260 If C<s> does not point to a well-formed UTF8 character, the behaviour
 261 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 262 it is assumed that the caller will raise a warning, and this function
 263 will silently just set C<retlen> to C<-1> and return zero.  If the
 264 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 265 malformations will be given, C<retlen> will be set to the expected
 266 length of the UTF-8 character in bytes, and zero will be returned.
 267
 268 The C<flags> can also contain various flags to allow deviations from
 269 the strict UTF-8 encoding (see F<utf8.h>).
 270
 271 Most code should use utf8_to_uvchr() rather than call this directly.
 272
 273 =cut
 274 */
 275
 276 UV
 277 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 278 {
 279     U8 *s0 = s;
 280     UV uv = *s, ouv = 0;
 281     STRLEN len = 1;
 282     bool dowarn = ckWARN_d(WARN_UTF8);
 283     UV startbyte = *s;
 284     STRLEN expectlen = 0;
 285     U32 warning = 0;
 286
 287 /* This list is a superset of the UTF8_ALLOW_XXX. */
 288
 289 #define UTF8_WARN_EMPTY                          1
 290 #define UTF8_WARN_CONTINUATION                   2
 291 #define UTF8_WARN_NON_CONTINUATION               3
 292 #define UTF8_WARN_FE_FF                          4
 293 #define UTF8_WARN_SHORT                          5
 294 #define UTF8_WARN_OVERFLOW                       6
 295 #define UTF8_WARN_SURROGATE                      7
 296 #define UTF8_WARN_LONG                           8
 297 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 298
 299     if (curlen == 0 &&
 300         !(flags & UTF8_ALLOW_EMPTY)) {
 301         warning = UTF8_WARN_EMPTY;
 302         goto malformed;
 303     }
 304
 305     if (UTF8_IS_INVARIANT(uv)) {
 306         if (retlen)
 307             *retlen = 1;
 308         return (UV) (NATIVE_TO_UTF(*s));
 309     }
 310
 311     if (UTF8_IS_CONTINUATION(uv) &&
 312         !(flags & UTF8_ALLOW_CONTINUATION)) {
 313         warning = UTF8_WARN_CONTINUATION;
 314         goto malformed;
 315     }
 316
 317     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 318         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 319         warning = UTF8_WARN_NON_CONTINUATION;
 320         goto malformed;
 321     }
 322
 323 #ifdef EBCDIC
 324     uv = NATIVE_TO_UTF(uv);
 325 #else
 326     if ((uv == 0xfe || uv == 0xff) &&
 327         !(flags & UTF8_ALLOW_FE_FF)) {
 328         warning = UTF8_WARN_FE_FF;
 329         goto malformed;
 330     }
 331 #endif
 332
 333     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 334     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 335     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 336     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 337 #ifdef EBCDIC
 338     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 339     else                        { len =  7; uv &= 0x01; }
 340 #else
 341     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 342     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 343     else                        { len = 13; uv = 0; } /* whoa! */
 344 #endif
 345
 346     if (retlen)
 347         *retlen = len;
 348
 349     expectlen = len;
 350
 351     if ((curlen < expectlen) &&
 352         !(flags & UTF8_ALLOW_SHORT)) {
 353         warning = UTF8_WARN_SHORT;
 354         goto malformed;
 355     }
 356
 357     len--;
 358     s++;
 359     ouv = uv;
 360
 361     while (len--) {
 362         if (!UTF8_IS_CONTINUATION(*s) &&
 363             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 364             s--;
 365             warning = UTF8_WARN_NON_CONTINUATION;
 366             goto malformed;
 367         }
 368         else
 369             uv = UTF8_ACCUMULATE(uv, *s);
 370         if (!(uv > ouv)) {
 371             /* These cannot be allowed. */
 372             if (uv == ouv) {
 373                 if (!(flags & UTF8_ALLOW_LONG)) {
 374                     warning = UTF8_WARN_LONG;
 375                     goto malformed;
 376                 }
 377             }
 378             else { /* uv < ouv */
 379                 /* This cannot be allowed. */
 380                 warning = UTF8_WARN_OVERFLOW;
 381                 goto malformed;
 382             }
 383         }
 384         s++;
 385         ouv = uv;
 386     }
 387
 388     if (UNICODE_IS_SURROGATE(uv) &&
 389         !(flags & UTF8_ALLOW_SURROGATE)) {
 390         warning = UTF8_WARN_SURROGATE;
 391         goto malformed;
 392     } else if ((expectlen > UNISKIP(uv)) &&
 393                !(flags & UTF8_ALLOW_LONG)) {
 394         warning = UTF8_WARN_LONG;
 395         goto malformed;
 396     } else if (UNICODE_IS_ILLEGAL(uv) &&
 397                !(flags & UTF8_ALLOW_FFFF)) {
 398         warning = UTF8_WARN_FFFF;
 399         goto malformed;
 400     }
 401
 402     return uv;
 403
 404 malformed:
 405
 406     if (flags & UTF8_CHECK_ONLY) {
 407         if (retlen)
 408             *retlen = -1;
 409         return 0;
 410     }
 411
 412     if (dowarn) {
 413         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 414
 415         switch (warning) {
 416         case 0: /* Intentionally empty. */ break;
 417         case UTF8_WARN_EMPTY:
 418             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 419             break;
 420         case UTF8_WARN_CONTINUATION:
 421             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 422             break;
 423         case UTF8_WARN_NON_CONTINUATION:
 424             if (s == s0)
 425                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 426                            (UV)s[1], startbyte);
 427             else
 428                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 429                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 430
 431             break;
 432         case UTF8_WARN_FE_FF:
 433             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 434             break;
 435         case UTF8_WARN_SHORT:
 436             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 437                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 438             expectlen = curlen;         /* distance for caller to skip */
 439             break;
 440         case UTF8_WARN_OVERFLOW:
 441             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 442                            ouv, *s, startbyte);
 443             break;
 444         case UTF8_WARN_SURROGATE:
 445             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 446             break;
 447         case UTF8_WARN_LONG:
 448             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 449                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 450             break;
 451         case UTF8_WARN_FFFF:
 452             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 453             break;
 454         default:
 455             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 456             break;
 457         }
 458
 459         if (warning) {
 460             char *s = SvPVX(sv);
 461
 462             if (PL_op)
 463                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 464                             "%s in %s", s,  OP_DESC(PL_op));
 465             else
 466                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 467         }
 468     }
 469
 470     if (retlen)
 471         *retlen = expectlen ? expectlen : len;
 472
 473     return 0;
 474 }
 475
 476 /*
 477 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 478
 479 Returns the native character value of the first character in the string C<s>
 480 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 481 length, in bytes, of that character.
 482
 483 If C<s> does not point to a well-formed UTF8 character, zero is
 484 returned and retlen is set, if possible, to -1.
 485
 486 =cut
 487 */
 488
 489 UV
 490 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 491 {
 492     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
 493                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 494 }
 495
 496 /*
 497 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 498
 499 Returns the Unicode code point of the first character in the string C<s>
 500 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 501 length, in bytes, of that character.
 502
 503 This function should only be used when returned UV is considered
 504 an index into the Unicode semantic tables (e.g. swashes).
 505
 506 If C<s> does not point to a well-formed UTF8 character, zero is
 507 returned and retlen is set, if possible, to -1.
 508
 509 =cut
 510 */
 511
 512 UV
 513 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 514 {
 515     /* Call the low level routine asking for checks */
 516     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
 517                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 518 }
 519
 520 /*
 521 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 522
 523 Return the length of the UTF-8 char encoded string C<s> in characters.
 524 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 525 up past C<e>, croaks.
 526
 527 =cut
 528 */
 529
 530 STRLEN
 531 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 532 {
 533     STRLEN len = 0;
 534
 535     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 536      * the bitops (especially ~) can create illegal UTF-8.
 537      * In other words: in Perl UTF-8 is not just for Unicode. */
 538
 539     if (e < s) {
 540         if (ckWARN_d(WARN_UTF8)) {
 541             if (PL_op)
 542                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 543                             "%s in %s", unees, OP_DESC(PL_op));
 544             else
 545                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 546         }
 547         return 0;
 548     }
 549     while (s < e) {
 550         U8 t = UTF8SKIP(s);
 551
 552         if (e - s < t) {
 553             if (ckWARN_d(WARN_UTF8)) {
 554                 if (PL_op)
 555                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 556                                 unees, OP_DESC(PL_op));
 557                 else
 558                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 559             }
 560             return len;
 561         }
 562         s += t;
 563         len++;
 564     }
 565
 566     return len;
 567 }
 568
 569 /*
 570 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 571
 572 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 573 and C<b>.
 574
 575 WARNING: use only if you *know* that the pointers point inside the
 576 same UTF-8 buffer.
 577
 578 =cut
 579 */
 580
 581 IV
 582 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 583 {
 584     IV off = 0;
 585
 586     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 587      * the bitops (especially ~) can create illegal UTF-8.
 588      * In other words: in Perl UTF-8 is not just for Unicode. */
 589
 590     if (a < b) {
 591         while (a < b) {
 592             U8 c = UTF8SKIP(a);
 593
 594             if (b - a < c) {
 595                 if (ckWARN_d(WARN_UTF8)) {
 596                     if (PL_op)
 597                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 598                                     "%s in %s", unees, OP_DESC(PL_op));
 599                     else
 600                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 601                 }
 602                 return off;
 603             }
 604             a += c;
 605             off--;
 606         }
 607     }
 608     else {
 609         while (b < a) {
 610             U8 c = UTF8SKIP(b);
 611
 612             if (a - b < c) {
 613                 if (ckWARN_d(WARN_UTF8)) {
 614                     if (PL_op)
 615                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 616                                     "%s in %s", unees, OP_DESC(PL_op));
 617                     else
 618                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 619                 }
 620                 return off;
 621             }
 622             b += c;
 623             off++;
 624         }
 625     }
 626
 627     return off;
 628 }
 629
 630 /*
 631 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 632
 633 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 634 forward or backward.
 635
 636 WARNING: do not use the following unless you *know* C<off> is within
 637 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 638 on the first byte of character or just after the last byte of a character.
 639
 640 =cut
 641 */
 642
 643 U8 *
 644 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 645 {
 646     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 647      * the bitops (especially ~) can create illegal UTF-8.
 648      * In other words: in Perl UTF-8 is not just for Unicode. */
 649
 650     if (off >= 0) {
 651         while (off--)
 652             s += UTF8SKIP(s);
 653     }
 654     else {
 655         while (off++) {
 656             s--;
 657             while (UTF8_IS_CONTINUATION(*s))
 658                 s--;
 659         }
 660     }
 661     return s;
 662 }
 663
 664 /*
 665 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 666
 667 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 668 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 669 updates len to contain the new length.
 670 Returns zero on failure, setting C<len> to -1.
 671
 672 =cut
 673 */
 674
 675 U8 *
 676 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 677 {
 678     U8 *send;
 679     U8 *d;
 680     U8 *save = s;
 681
 682     /* ensure valid UTF8 and chars < 256 before updating string */
 683     for (send = s + *len; s < send; ) {
 684         U8 c = *s++;
 685
 686         if (!UTF8_IS_INVARIANT(c) &&
 687             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 688              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 689             *len = -1;
 690             return 0;
 691         }
 692     }
 693
 694     d = s = save;
 695     while (s < send) {
 696         STRLEN ulen;
 697         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 698         s += ulen;
 699     }
 700     *d = '\0';
 701     *len = d - save;
 702     return save;
 703 }
 704
 705 /*
 706 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 707
 708 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 709 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 710 the newly-created string, and updates C<len> to contain the new
 711 length.  Returns the original string if no conversion occurs, C<len>
 712 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 713 0 if C<s> is converted or contains all 7bit characters.
 714
 715 =cut
 716 */
 717
 718 U8 *
 719 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 720 {
 721     U8 *d;
 722     U8 *start = s;
 723     U8 *send;
 724     I32 count = 0;
 725
 726     if (!*is_utf8)
 727         return start;
 728
 729     /* ensure valid UTF8 and chars < 256 before converting string */
 730     for (send = s + *len; s < send;) {
 731         U8 c = *s++;
 732         if (!UTF8_IS_INVARIANT(c)) {
 733             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 734                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 735                 count++;
 736             else
 737                 return start;
 738         }
 739     }
 740
 741     *is_utf8 = 0;
 742
 743     Newz(801, d, (*len) - count + 1, U8);
 744     s = start; start = d;
 745     while (s < send) {
 746         U8 c = *s++;
 747         if (!UTF8_IS_INVARIANT(c)) {
 748             /* Then it is two-byte encoded */
 749             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 750             c = ASCII_TO_NATIVE(c);
 751         }
 752         *d++ = c;
 753     }
 754     *d = '\0';
 755     *len = d - start;
 756     return start;
 757 }
 758
 759 /*
 760 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 761
 762 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 763 Returns a pointer to the newly-created string, and sets C<len> to
 764 reflect the new length.
 765
 766 =cut
 767 */
 768
 769 U8*
 770 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 771 {
 772     U8 *send;
 773     U8 *d;
 774     U8 *dst;
 775     send = s + (*len);
 776
 777     Newz(801, d, (*len) * 2 + 1, U8);
 778     dst = d;
 779
 780     while (s < send) {
 781         UV uv = NATIVE_TO_ASCII(*s++);
 782         if (UNI_IS_INVARIANT(uv))
 783             *d++ = UTF_TO_NATIVE(uv);
 784         else {
 785             *d++ = UTF8_EIGHT_BIT_HI(uv);
 786             *d++ = UTF8_EIGHT_BIT_LO(uv);
 787         }
 788     }
 789     *d = '\0';
 790     *len = d-dst;
 791     return dst;
 792 }
 793
 794 /*
 795  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 796  *
 797  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 798  * We optimize for native, for obvious reasons. */
 799
 800 U8*
 801 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 802 {
 803     U8* pend;
 804     U8* dstart = d;
 805
 806     if (bytelen & 1)
 807         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 808
 809     pend = p + bytelen;
 810
 811     while (p < pend) {
 812         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 813         p += 2;
 814         if (uv < 0x80) {
 815             *d++ = uv;
 816             continue;
 817         }
 818         if (uv < 0x800) {
 819             *d++ = (( uv >>  6)         | 0xc0);
 820             *d++ = (( uv        & 0x3f) | 0x80);
 821             continue;
 822         }
 823         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 824             UV low = *p++;
 825             if (low < 0xdc00 || low >= 0xdfff)
 826                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 827             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 828         }
 829         if (uv < 0x10000) {
 830             *d++ = (( uv >> 12)         | 0xe0);
 831             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 832             *d++ = (( uv        & 0x3f) | 0x80);
 833             continue;
 834         }
 835         else {
 836             *d++ = (( uv >> 18)         | 0xf0);
 837             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 838             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 839             *d++ = (( uv        & 0x3f) | 0x80);
 840             continue;
 841         }
 842     }
 843     *newlen = d - dstart;
 844     return d;
 845 }
 846
 847 /* Note: this one is slightly destructive of the source. */
 848
 849 U8*
 850 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 851 {
 852     U8* s = (U8*)p;
 853     U8* send = s + bytelen;
 854     while (s < send) {
 855         U8 tmp = s[0];
 856         s[0] = s[1];
 857         s[1] = tmp;
 858         s += 2;
 859     }
 860     return utf16_to_utf8(p, d, bytelen, newlen);
 861 }
 862
 863 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 864
 865 bool
 866 Perl_is_uni_alnum(pTHX_ UV c)
 867 {
 868     U8 tmpbuf[UTF8_MAXLEN+1];
 869     uvchr_to_utf8(tmpbuf, c);
 870     return is_utf8_alnum(tmpbuf);
 871 }
 872
 873 bool
 874 Perl_is_uni_alnumc(pTHX_ UV c)
 875 {
 876     U8 tmpbuf[UTF8_MAXLEN+1];
 877     uvchr_to_utf8(tmpbuf, c);
 878     return is_utf8_alnumc(tmpbuf);
 879 }
 880
 881 bool
 882 Perl_is_uni_idfirst(pTHX_ UV c)
 883 {
 884     U8 tmpbuf[UTF8_MAXLEN+1];
 885     uvchr_to_utf8(tmpbuf, c);
 886     return is_utf8_idfirst(tmpbuf);
 887 }
 888
 889 bool
 890 Perl_is_uni_alpha(pTHX_ UV c)
 891 {
 892     U8 tmpbuf[UTF8_MAXLEN+1];
 893     uvchr_to_utf8(tmpbuf, c);
 894     return is_utf8_alpha(tmpbuf);
 895 }
 896
 897 bool
 898 Perl_is_uni_ascii(pTHX_ UV c)
 899 {
 900     U8 tmpbuf[UTF8_MAXLEN+1];
 901     uvchr_to_utf8(tmpbuf, c);
 902     return is_utf8_ascii(tmpbuf);
 903 }
 904
 905 bool
 906 Perl_is_uni_space(pTHX_ UV c)
 907 {
 908     U8 tmpbuf[UTF8_MAXLEN+1];
 909     uvchr_to_utf8(tmpbuf, c);
 910     return is_utf8_space(tmpbuf);
 911 }
 912
 913 bool
 914 Perl_is_uni_digit(pTHX_ UV c)
 915 {
 916     U8 tmpbuf[UTF8_MAXLEN+1];
 917     uvchr_to_utf8(tmpbuf, c);
 918     return is_utf8_digit(tmpbuf);
 919 }
 920
 921 bool
 922 Perl_is_uni_upper(pTHX_ UV c)
 923 {
 924     U8 tmpbuf[UTF8_MAXLEN+1];
 925     uvchr_to_utf8(tmpbuf, c);
 926     return is_utf8_upper(tmpbuf);
 927 }
 928
 929 bool
 930 Perl_is_uni_lower(pTHX_ UV c)
 931 {
 932     U8 tmpbuf[UTF8_MAXLEN+1];
 933     uvchr_to_utf8(tmpbuf, c);
 934     return is_utf8_lower(tmpbuf);
 935 }
 936
 937 bool
 938 Perl_is_uni_cntrl(pTHX_ UV c)
 939 {
 940     U8 tmpbuf[UTF8_MAXLEN+1];
 941     uvchr_to_utf8(tmpbuf, c);
 942     return is_utf8_cntrl(tmpbuf);
 943 }
 944
 945 bool
 946 Perl_is_uni_graph(pTHX_ UV c)
 947 {
 948     U8 tmpbuf[UTF8_MAXLEN+1];
 949     uvchr_to_utf8(tmpbuf, c);
 950     return is_utf8_graph(tmpbuf);
 951 }
 952
 953 bool
 954 Perl_is_uni_print(pTHX_ UV c)
 955 {
 956     U8 tmpbuf[UTF8_MAXLEN+1];
 957     uvchr_to_utf8(tmpbuf, c);
 958     return is_utf8_print(tmpbuf);
 959 }
 960
 961 bool
 962 Perl_is_uni_punct(pTHX_ UV c)
 963 {
 964     U8 tmpbuf[UTF8_MAXLEN+1];
 965     uvchr_to_utf8(tmpbuf, c);
 966     return is_utf8_punct(tmpbuf);
 967 }
 968
 969 bool
 970 Perl_is_uni_xdigit(pTHX_ UV c)
 971 {
 972     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 973     uvchr_to_utf8(tmpbuf, c);
 974     return is_utf8_xdigit(tmpbuf);
 975 }
 976
 977 UV
 978 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 979 {
 980     uvchr_to_utf8(p, c);
 981     return to_utf8_upper(p, p, lenp);
 982 }
 983
 984 UV
 985 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 986 {
 987     uvchr_to_utf8(p, c);
 988     return to_utf8_title(p, p, lenp);
 989 }
 990
 991 UV
 992 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 993 {
 994     uvchr_to_utf8(p, c);
 995     return to_utf8_lower(p, p, lenp);
 996 }
 997
 998 UV
 999 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1000 {
1001     uvchr_to_utf8(p, c);
1002     return to_utf8_fold(p, p, lenp);
1003 }
1004
1005 /* for now these all assume no locale info available for Unicode > 255 */
1006
1007 bool
1008 Perl_is_uni_alnum_lc(pTHX_ UV c)
1009 {
1010     return is_uni_alnum(c);     /* XXX no locale support yet */
1011 }
1012
1013 bool
1014 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1015 {
1016     return is_uni_alnumc(c);    /* XXX no locale support yet */
1017 }
1018
1019 bool
1020 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1021 {
1022     return is_uni_idfirst(c);   /* XXX no locale support yet */
1023 }
1024
1025 bool
1026 Perl_is_uni_alpha_lc(pTHX_ UV c)
1027 {
1028     return is_uni_alpha(c);     /* XXX no locale support yet */
1029 }
1030
1031 bool
1032 Perl_is_uni_ascii_lc(pTHX_ UV c)
1033 {
1034     return is_uni_ascii(c);     /* XXX no locale support yet */
1035 }
1036
1037 bool
1038 Perl_is_uni_space_lc(pTHX_ UV c)
1039 {
1040     return is_uni_space(c);     /* XXX no locale support yet */
1041 }
1042
1043 bool
1044 Perl_is_uni_digit_lc(pTHX_ UV c)
1045 {
1046     return is_uni_digit(c);     /* XXX no locale support yet */
1047 }
1048
1049 bool
1050 Perl_is_uni_upper_lc(pTHX_ UV c)
1051 {
1052     return is_uni_upper(c);     /* XXX no locale support yet */
1053 }
1054
1055 bool
1056 Perl_is_uni_lower_lc(pTHX_ UV c)
1057 {
1058     return is_uni_lower(c);     /* XXX no locale support yet */
1059 }
1060
1061 bool
1062 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1063 {
1064     return is_uni_cntrl(c);     /* XXX no locale support yet */
1065 }
1066
1067 bool
1068 Perl_is_uni_graph_lc(pTHX_ UV c)
1069 {
1070     return is_uni_graph(c);     /* XXX no locale support yet */
1071 }
1072
1073 bool
1074 Perl_is_uni_print_lc(pTHX_ UV c)
1075 {
1076     return is_uni_print(c);     /* XXX no locale support yet */
1077 }
1078
1079 bool
1080 Perl_is_uni_punct_lc(pTHX_ UV c)
1081 {
1082     return is_uni_punct(c);     /* XXX no locale support yet */
1083 }
1084
1085 bool
1086 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1087 {
1088     return is_uni_xdigit(c);    /* XXX no locale support yet */
1089 }
1090
1091 U32
1092 Perl_to_uni_upper_lc(pTHX_ U32 c)
1093 {
1094     /* XXX returns only the first character -- do not use XXX */
1095     /* XXX no locale support yet */
1096     STRLEN len;
1097     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1098     return (U32)to_uni_upper(c, tmpbuf, &len);
1099 }
1100
1101 U32
1102 Perl_to_uni_title_lc(pTHX_ U32 c)
1103 {
1104     /* XXX returns only the first character XXX -- do not use XXX */
1105     /* XXX no locale support yet */
1106     STRLEN len;
1107     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1108     return (U32)to_uni_title(c, tmpbuf, &len);
1109 }
1110
1111 U32
1112 Perl_to_uni_lower_lc(pTHX_ U32 c)
1113 {
1114     /* XXX returns only the first character -- do not use XXX */
1115     /* XXX no locale support yet */
1116     STRLEN len;
1117     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1118     return (U32)to_uni_lower(c, tmpbuf, &len);
1119 }
1120
1121 bool
1122 Perl_is_utf8_alnum(pTHX_ U8 *p)
1123 {
1124     if (!is_utf8_char(p))
1125         return FALSE;
1126     if (!PL_utf8_alnum)
1127         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1128          * descendant of isalnum(3), in other words, it doesn't
1129          * contain the '_'. --jhi */
1130         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1131     return swash_fetch(PL_utf8_alnum, p, TRUE);
1132 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1133 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1134     if (!PL_utf8_alnum)
1135         PL_utf8_alnum = swash_init("utf8", "",
1136             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1137     return swash_fetch(PL_utf8_alnum, p, TRUE);
1138 #endif
1139 }
1140
1141 bool
1142 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1143 {
1144     if (!is_utf8_char(p))
1145         return FALSE;
1146     if (!PL_utf8_alnum)
1147         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1148     return swash_fetch(PL_utf8_alnum, p, TRUE);
1149 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1150 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1151     if (!PL_utf8_alnum)
1152         PL_utf8_alnum = swash_init("utf8", "",
1153             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1154     return swash_fetch(PL_utf8_alnum, p, TRUE);
1155 #endif
1156 }
1157
1158 bool
1159 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1160 {
1161     return *p == '_' || is_utf8_alpha(p);
1162 }
1163
1164 bool
1165 Perl_is_utf8_alpha(pTHX_ U8 *p)
1166 {
1167     if (!is_utf8_char(p))
1168         return FALSE;
1169     if (!PL_utf8_alpha)
1170         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1171     return swash_fetch(PL_utf8_alpha, p, TRUE);
1172 }
1173
1174 bool
1175 Perl_is_utf8_ascii(pTHX_ U8 *p)
1176 {
1177     if (!is_utf8_char(p))
1178         return FALSE;
1179     if (!PL_utf8_ascii)
1180         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1181     return swash_fetch(PL_utf8_ascii, p, TRUE);
1182 }
1183
1184 bool
1185 Perl_is_utf8_space(pTHX_ U8 *p)
1186 {
1187     if (!is_utf8_char(p))
1188         return FALSE;
1189     if (!PL_utf8_space)
1190         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1191     return swash_fetch(PL_utf8_space, p, TRUE);
1192 }
1193
1194 bool
1195 Perl_is_utf8_digit(pTHX_ U8 *p)
1196 {
1197     if (!is_utf8_char(p))
1198         return FALSE;
1199     if (!PL_utf8_digit)
1200         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1201     return swash_fetch(PL_utf8_digit, p, TRUE);
1202 }
1203
1204 bool
1205 Perl_is_utf8_upper(pTHX_ U8 *p)
1206 {
1207     if (!is_utf8_char(p))
1208         return FALSE;
1209     if (!PL_utf8_upper)
1210         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1211     return swash_fetch(PL_utf8_upper, p, TRUE);
1212 }
1213
1214 bool
1215 Perl_is_utf8_lower(pTHX_ U8 *p)
1216 {
1217     if (!is_utf8_char(p))
1218         return FALSE;
1219     if (!PL_utf8_lower)
1220         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1221     return swash_fetch(PL_utf8_lower, p, TRUE);
1222 }
1223
1224 bool
1225 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1226 {
1227     if (!is_utf8_char(p))
1228         return FALSE;
1229     if (!PL_utf8_cntrl)
1230         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1231     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1232 }
1233
1234 bool
1235 Perl_is_utf8_graph(pTHX_ U8 *p)
1236 {
1237     if (!is_utf8_char(p))
1238         return FALSE;
1239     if (!PL_utf8_graph)
1240         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1241     return swash_fetch(PL_utf8_graph, p, TRUE);
1242 }
1243
1244 bool
1245 Perl_is_utf8_print(pTHX_ U8 *p)
1246 {
1247     if (!is_utf8_char(p))
1248         return FALSE;
1249     if (!PL_utf8_print)
1250         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1251     return swash_fetch(PL_utf8_print, p, TRUE);
1252 }
1253
1254 bool
1255 Perl_is_utf8_punct(pTHX_ U8 *p)
1256 {
1257     if (!is_utf8_char(p))
1258         return FALSE;
1259     if (!PL_utf8_punct)
1260         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1261     return swash_fetch(PL_utf8_punct, p, TRUE);
1262 }
1263
1264 bool
1265 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1266 {
1267     if (!is_utf8_char(p))
1268         return FALSE;
1269     if (!PL_utf8_xdigit)
1270         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1271     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1272 }
1273
1274 bool
1275 Perl_is_utf8_mark(pTHX_ U8 *p)
1276 {
1277     if (!is_utf8_char(p))
1278         return FALSE;
1279     if (!PL_utf8_mark)
1280         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1281     return swash_fetch(PL_utf8_mark, p, TRUE);
1282 }
1283
1284 /*
1285 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1286
1287 The "p" contains the pointer to the UTF-8 string encoding
1288 the character that is being converted.
1289
1290 The "ustrp" is a pointer to the character buffer to put the
1291 conversion result to.  The "lenp" is a pointer to the length
1292 of the result.
1293
1294 The "swashp" is a pointer to the swash to use.
1295
1296 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1297 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1298 but not always, a multicharacter mapping), is tried first.
1299
1300 The "special" is a string like "utf8::ToSpecLower", which means the
1301 hash %utf8::ToSpecLower.  The access to the hash is through
1302 Perl_to_utf8_case().
1303
1304 The "normal" is a string like "ToLower" which means the swash
1305 %utf8::ToLower.
1306
1307 =cut */
1308
1309 UV
1310 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1311 {
1312     UV uv0, uv1;
1313     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1314     STRLEN len = 0;
1315
1316     uv0 = utf8_to_uvchr(p, 0);
1317     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1318      * are necessary in EBCDIC, they are redundant no-ops
1319      * in ASCII-ish platforms, and hopefully optimized away. */
1320     uv1 = NATIVE_TO_UNI(uv0);
1321     uvuni_to_utf8(tmpbuf, uv1);
1322
1323     if (!*swashp) /* load on-demand */
1324          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1325
1326     if (special) {
1327          /* It might be "special" (sometimes, but not always,
1328           * a multicharacter mapping) */
1329          HV *hv;
1330          SV *keysv;
1331          HE *he;
1332          SV *val;
1333
1334          if ((hv    = get_hv(special, FALSE)) &&
1335              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1336              (he    = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1337              (val   = HeVAL(he))) {
1338              char *s;
1339
1340               s = SvPV(val, len);
1341               if (len == 1)
1342                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1343               else {
1344 #ifdef EBCDIC
1345                    /* If we have EBCDIC we need to remap the characters
1346                     * since any characters in the low 256 are Unicode
1347                     * code points, not EBCDIC. */
1348                    U8 *t = (U8*)s, *tend = t + len, *d;
1349
1350                    d = tmpbuf;
1351                    if (SvUTF8(val)) {
1352                         STRLEN tlen = 0;
1353
1354                         while (t < tend) {
1355                              UV c = utf8_to_uvchr(t, &tlen);
1356                              if (tlen > 0) {
1357                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1358                                   t += tlen;
1359                              }
1360                              else
1361                                   break;
1362                         }
1363                    }
1364                    else {
1365                         while (t < tend) {
1366                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1367                              t++;
1368                         }
1369                    }
1370                    len = d - tmpbuf;
1371                    Copy(tmpbuf, ustrp, len, U8);
1372 #else
1373                    Copy(s, ustrp, len, U8);
1374 #endif
1375               }
1376          }
1377     }
1378
1379     if (!len && *swashp) {
1380          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1381
1382          if (uv2) {
1383               /* It was "normal" (a single character mapping). */
1384               UV uv3 = UNI_TO_NATIVE(uv2);
1385
1386               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1387          }
1388     }
1389
1390     if (!len) /* Neither: just copy. */
1391          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1392
1393     if (lenp)
1394          *lenp = len;
1395
1396     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1397 }
1398
1399 /*
1400 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1401
1402 Convert the UTF-8 encoded character at p to its uppercase version and
1403 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1404 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1405 uppercase version may be longer than the original character (up to two
1406 characters).
1407
1408 The first character of the uppercased version is returned
1409 (but note, as explained above, that there may be more.)
1410
1411 =cut */
1412
1413 UV
1414 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1415 {
1416     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1417                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1418 }
1419
1420 /*
1421 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1422
1423 Convert the UTF-8 encoded character at p to its titlecase version and
1424 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1425 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1426 titlecase version may be longer than the original character (up to two
1427 characters).
1428
1429 The first character of the titlecased version is returned
1430 (but note, as explained above, that there may be more.)
1431
1432 =cut */
1433
1434 UV
1435 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1436 {
1437     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1438                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1439 }
1440
1441 /*
1442 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1443
1444 Convert the UTF-8 encoded character at p to its lowercase version and
1445 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1446 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1447 lowercase version may be longer than the original character (up to two
1448 characters).
1449
1450 The first character of the lowercased version is returned
1451 (but note, as explained above, that there may be more.)
1452
1453 =cut */
1454
1455 UV
1456 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1457 {
1458     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1459                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1460 }
1461
1462 /*
1463 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1464
1465 Convert the UTF-8 encoded character at p to its foldcase version and
1466 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1467 that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1468 foldcase version may be longer than the original character (up to
1469 three characters).
1470
1471 The first character of the foldcased version is returned
1472 (but note, as explained above, that there may be more.)
1473
1474 =cut */
1475
1476 UV
1477 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1478 {
1479     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1480                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1481 }
1482
1483 /* a "swash" is a swatch hash */
1484
1485 SV*
1486 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1487 {
1488     SV* retval;
1489     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1490     dSP;
1491     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1492     SV* errsv_save;
1493
1494     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1495         ENTER;
1496         errsv_save = newSVsv(ERRSV);
1497         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1498         if (!SvTRUE(ERRSV))
1499             sv_setsv(ERRSV, errsv_save);
1500         SvREFCNT_dec(errsv_save);
1501         LEAVE;
1502     }
1503     SPAGAIN;
1504     PUSHSTACKi(PERLSI_MAGIC);
1505     PUSHMARK(SP);
1506     EXTEND(SP,5);
1507     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1508     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1509     PUSHs(listsv);
1510     PUSHs(sv_2mortal(newSViv(minbits)));
1511     PUSHs(sv_2mortal(newSViv(none)));
1512     PUTBACK;
1513     ENTER;
1514     SAVEI32(PL_hints);
1515     PL_hints = 0;
1516     save_re_context();
1517     if (PL_curcop == &PL_compiling)
1518         /* XXX ought to be handled by lex_start */
1519         sv_setpv(tokenbufsv, PL_tokenbuf);
1520     errsv_save = newSVsv(ERRSV);
1521     if (call_method("SWASHNEW", G_SCALAR))
1522         retval = newSVsv(*PL_stack_sp--);
1523     else
1524         retval = &PL_sv_undef;
1525     if (!SvTRUE(ERRSV))
1526         sv_setsv(ERRSV, errsv_save);
1527     SvREFCNT_dec(errsv_save);
1528     LEAVE;
1529     POPSTACK;
1530     if (PL_curcop == &PL_compiling) {
1531         STRLEN len;
1532         char* pv = SvPV(tokenbufsv, len);
1533
1534         Copy(pv, PL_tokenbuf, len+1, char);
1535         PL_curcop->op_private = PL_hints;
1536     }
1537     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1538         if (SvPOK(retval))
1539             Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1540                        SvPV_nolen(retval));
1541         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1542     }
1543     return retval;
1544 }
1545
1546
1547 /* This API is wrong for special case conversions since we may need to
1548  * return several Unicode characters for a single Unicode character
1549  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1550  * the lower-level routine, and it is similarly broken for returning
1551  * multiple values.  --jhi */
1552 UV
1553 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1554 {
1555     HV* hv = (HV*)SvRV(sv);
1556     U32 klen;
1557     U32 off;
1558     STRLEN slen;
1559     STRLEN needents;
1560     U8 *tmps = NULL;
1561     U32 bit;
1562     SV *retval;
1563     U8 tmputf8[2];
1564     UV c = NATIVE_TO_ASCII(*ptr);
1565
1566     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1567         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1568         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1569         ptr = tmputf8;
1570     }
1571     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1572      * then the "swatch" is a vec() for al the chars which start
1573      * with 0xAA..0xYY
1574      * So the key in the hash (klen) is length of encoded char -1
1575      */
1576     klen = UTF8SKIP(ptr) - 1;
1577     off  = ptr[klen];
1578
1579     if (klen == 0)
1580      {
1581       /* If char in invariant then swatch is for all the invariant chars
1582        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1583        */
1584       needents = UTF_CONTINUATION_MARK;
1585       off      = NATIVE_TO_UTF(ptr[klen]);
1586      }
1587     else
1588      {
1589       /* If char is encoded then swatch is for the prefix */
1590       needents = (1 << UTF_ACCUMULATION_SHIFT);
1591       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1592      }
1593
1594     /*
1595      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1596      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1597      * it's nothing to sniff at.)  Pity we usually come through at least
1598      * two function calls to get here...
1599      *
1600      * NB: this code assumes that swatches are never modified, once generated!
1601      */
1602
1603     if (hv   == PL_last_swash_hv &&
1604         klen == PL_last_swash_klen &&
1605         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1606     {
1607         tmps = PL_last_swash_tmps;
1608         slen = PL_last_swash_slen;
1609     }
1610     else {
1611         /* Try our second-level swatch cache, kept in a hash. */
1612         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1613
1614         /* If not cached, generate it via utf8::SWASHGET */
1615         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1616             dSP;
1617             /* We use utf8n_to_uvuni() as we want an index into
1618                Unicode tables, not a native character number.
1619              */
1620             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1621                                            ckWARN(WARN_UTF8) ?
1622                                            0 : UTF8_ALLOW_ANY);
1623             SV *errsv_save;
1624             ENTER;
1625             SAVETMPS;
1626             save_re_context();
1627             PUSHSTACKi(PERLSI_MAGIC);
1628             PUSHMARK(SP);
1629             EXTEND(SP,3);
1630             PUSHs((SV*)sv);
1631             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1632             PUSHs(sv_2mortal(newSViv((klen) ?
1633                                      (code_point & ~(needents - 1)) : 0)));
1634             PUSHs(sv_2mortal(newSViv(needents)));
1635             PUTBACK;
1636             errsv_save = newSVsv(ERRSV);
1637             if (call_method("SWASHGET", G_SCALAR))
1638                 retval = newSVsv(*PL_stack_sp--);
1639             else
1640                 retval = &PL_sv_undef;
1641             if (!SvTRUE(ERRSV))
1642                 sv_setsv(ERRSV, errsv_save);
1643             SvREFCNT_dec(errsv_save);
1644             POPSTACK;
1645             FREETMPS;
1646             LEAVE;
1647             if (PL_curcop == &PL_compiling)
1648                 PL_curcop->op_private = PL_hints;
1649
1650             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1651
1652             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1653                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1654         }
1655
1656         PL_last_swash_hv = hv;
1657         PL_last_swash_klen = klen;
1658         PL_last_swash_tmps = tmps;
1659         PL_last_swash_slen = slen;
1660         if (klen)
1661             Copy(ptr, PL_last_swash_key, klen, U8);
1662     }
1663
1664     switch ((int)((slen << 3) / needents)) {
1665     case 1:
1666         bit = 1 << (off & 7);
1667         off >>= 3;
1668         return (tmps[off] & bit) != 0;
1669     case 8:
1670         return tmps[off];
1671     case 16:
1672         off <<= 1;
1673         return (tmps[off] << 8) + tmps[off + 1] ;
1674     case 32:
1675         off <<= 2;
1676         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1677     }
1678     Perl_croak(aTHX_ "panic: swash_fetch");
1679     return 0;
1680 }
1681
1682
1683 /*
1684 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1685
1686 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1687 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1688 bytes available. The return value is the pointer to the byte after the
1689 end of the new character. In other words,
1690
1691     d = uvchr_to_utf8(d, uv);
1692
1693 is the recommended wide native character-aware way of saying
1694
1695     *(d++) = uv;
1696
1697 =cut
1698 */
1699
1700 /* On ASCII machines this is normally a macro but we want a
1701    real function in case XS code wants it
1702 */
1703 #undef Perl_uvchr_to_utf8
1704 U8 *
1705 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1706 {
1707     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1708 }
1709
1710 U8 *
1711 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1712 {
1713     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1714 }
1715
1716 /*
1717 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1718
1719 Returns the native character value of the first character in the string C<s>
1720 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1721 length, in bytes, of that character.
1722
1723 Allows length and flags to be passed to low level routine.
1724
1725 =cut
1726 */
1727 /* On ASCII machines this is normally a macro but we want
1728    a real function in case XS code wants it
1729 */
1730 #undef Perl_utf8n_to_uvchr
1731 UV
1732 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1733 {
1734     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1735     return UNI_TO_NATIVE(uv);
1736 }
1737
1738 /*
1739 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1740
1741 Build to the scalar dsv a displayable version of the string spv,
1742 length len, the displayable version being at most pvlim bytes long
1743 (if longer, the rest is truncated and "..." will be appended).
1744
1745 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1746 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1747 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1748 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1749 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1750 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1751
1752 The pointer to the PV of the dsv is returned.
1753
1754 =cut */
1755 char *
1756 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1757 {
1758     int truncated = 0;
1759     char *s, *e;
1760
1761     sv_setpvn(dsv, "", 0);
1762     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1763          UV u;
1764          bool ok = FALSE;
1765
1766          if (pvlim && SvCUR(dsv) >= pvlim) {
1767               truncated++;
1768               break;
1769          }
1770          u = utf8_to_uvchr((U8*)s, 0);
1771          if (u < 256) {
1772              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1773                  switch (u & 0xFF) {
1774                  case '\n':
1775                      Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1776                  case '\r':
1777                      Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1778                  case '\t':
1779                      Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1780                  case '\f':
1781                      Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1782                  case '\a':
1783                      Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1784                  case '\\':
1785                      Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1786                  default: break;
1787                  }
1788              }
1789              /* isPRINT() is the locale-blind version. */
1790              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1791                  Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1792                  ok = TRUE;
1793              }
1794          }
1795          if (!ok)
1796              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1797     }
1798     if (truncated)
1799          sv_catpvn(dsv, "...", 3);
1800
1801     return SvPVX(dsv);
1802 }
1803
1804 /*
1805 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1806
1807 Build to the scalar dsv a displayable version of the scalar sv,
1808 the displayable version being at most pvlim bytes long
1809 (if longer, the rest is truncated and "..." will be appended).
1810
1811 The flags argument is as in pv_uni_display().
1812
1813 The pointer to the PV of the dsv is returned.
1814
1815 =cut */
1816 char *
1817 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1818 {
1819      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1820                                 pvlim, flags);
1821 }
1822
1823 /*
1824 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1825
1826 Return true if the strings s1 and s2 differ case-insensitively, false
1827 if not (if they are equal case-insensitively).  If u1 is true, the
1828 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1829 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1830 are false, the respective string is assumed to be in native 8-bit
1831 encoding.
1832
1833 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1834 in there (they will point at the beginning of the I<next> character).
1835 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1836 pointers beyond which scanning will not continue under any
1837 circustances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1838 s2+l2 will be used as goal end pointers that will also stop the scan,
1839 and which qualify towards defining a successful match: all the scans
1840 that define an explicit length must reach their goal pointers for
1841 a match to succeed).
1842
1843 For case-insensitiveness, the "casefolding" of Unicode is used
1844 instead of upper/lowercasing both the characters, see
1845 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1846
1847 =cut */
1848 I32
1849 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1850 {
1851      register U8 *p1  = (U8*)s1;
1852      register U8 *p2  = (U8*)s2;
1853      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1854      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1855      STRLEN n1 = 0, n2 = 0;
1856      U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1857      U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1858      U8 natbuf[1+1];
1859      STRLEN foldlen1, foldlen2;
1860      bool match;
1861
1862      if (pe1)
1863           e1 = *(U8**)pe1;
1864      if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1865           f1 = (U8*)s1 + l1;
1866      if (pe2)
1867           e2 = *(U8**)pe2;
1868      if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1869           f2 = (U8*)s2 + l2;
1870
1871      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1872           return 1; /* mismatch; possible infinite loop or false positive */
1873
1874      if (!u1 || !u2)
1875           natbuf[1] = 0; /* Need to terminate the buffer. */
1876
1877      while ((e1 == 0 || p1 < e1) &&
1878             (f1 == 0 || p1 < f1) &&
1879             (e2 == 0 || p2 < e2) &&
1880             (f2 == 0 || p2 < f2)) {
1881           if (n1 == 0) {
1882                if (u1)
1883                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1884                else {
1885                     natbuf[0] = *p1;
1886                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1887                }
1888                q1 = foldbuf1;
1889                n1 = foldlen1;
1890           }
1891           if (n2 == 0) {
1892                if (u2)
1893                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1894                else {
1895                     natbuf[0] = *p2;
1896                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1897                }
1898                q2 = foldbuf2;
1899                n2 = foldlen2;
1900           }
1901           while (n1 && n2) {
1902                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1903                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1904                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1905                    return 1; /* mismatch */
1906                n1 -= UTF8SKIP(q1);
1907                q1 += UTF8SKIP(q1);
1908                n2 -= UTF8SKIP(q2);
1909                q2 += UTF8SKIP(q2);
1910           }
1911           if (n1 == 0)
1912                p1 += u1 ? UTF8SKIP(p1) : 1;
1913           if (n2 == 0)
1914                p2 += u2 ? UTF8SKIP(p2) : 1;
1915
1916      }
1917
1918      /* A match is defined by all the scans that specified
1919       * an explicit length reaching their final goals. */
1920      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1921
1922      if (match) {
1923           if (pe1)
1924                *pe1 = (char*)p1;
1925           if (pe2)
1926                *pe2 = (char*)p2;
1927      }
1928
1929      return match ? 0 : 1; /* 0 match, 1 mismatch */
1930 }
1931