utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2002, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
  28
  29 /*
  30 =head1 Unicode Support
  31
  32 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  33
  34 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  35 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  36 bytes available. The return value is the pointer to the byte after the
  37 end of the new character. In other words,
  38
  39     d = uvuni_to_utf8_flags(d, uv, flags);
  40
  41 or, in most cases,
  42
  43     d = uvuni_to_utf8(d, uv);
  44
  45 (which is equivalent to)
  46
  47     d = uvuni_to_utf8_flags(d, uv, 0);
  48
  49 is the recommended Unicode-aware way of saying
  50
  51     *(d++) = uv;
  52
  53 =cut
  54 */
  55
  56 U8 *
  57 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  58 {
  59     if (ckWARN(WARN_UTF8)) {
  60          if (UNICODE_IS_SURROGATE(uv) &&
  61              !(flags & UNICODE_ALLOW_SURROGATE))
  62               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  63          else if (
  64                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  65                     !(flags & UNICODE_ALLOW_FDD0))
  66                    ||
  67                    ((uv & 0xFFFF) == 0xFFFE &&
  68                     !(flags & UNICODE_ALLOW_FFFE))
  69                    ||
  70                    ((uv & 0xFFFF) == 0xFFFF &&
  71                     !(flags & UNICODE_ALLOW_FFFF))) &&
  72                   /* UNICODE_ALLOW_SUPER includes
  73                    * FFFEs and FFFFs beyond 0x10FFFF. */
  74                   ((uv <= PERL_UNICODE_MAX) ||
  75                    !(flags & UNICODE_ALLOW_SUPER))
  76                   )
  77               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  78                          "Unicode character 0x%04"UVxf" is illegal", uv);
  79     }
  80     if (UNI_IS_INVARIANT(uv)) {
  81         *d++ = UTF_TO_NATIVE(uv);
  82         return d;
  83     }
  84 #if defined(EBCDIC)
  85     else {
  86         STRLEN len  = UNISKIP(uv);
  87         U8 *p = d+len-1;
  88         while (p > d) {
  89             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  90             uv >>= UTF_ACCUMULATION_SHIFT;
  91         }
  92         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  93         return d+len;
  94     }
  95 #else /* Non loop style */
  96     if (uv < 0x800) {
  97         *d++ = (( uv >>  6)         | 0xc0);
  98         *d++ = (( uv        & 0x3f) | 0x80);
  99         return d;
 100     }
 101     if (uv < 0x10000) {
 102         *d++ = (( uv >> 12)         | 0xe0);
 103         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 104         *d++ = (( uv        & 0x3f) | 0x80);
 105         return d;
 106     }
 107     if (uv < 0x200000) {
 108         *d++ = (( uv >> 18)         | 0xf0);
 109         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 110         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 111         *d++ = (( uv        & 0x3f) | 0x80);
 112         return d;
 113     }
 114     if (uv < 0x4000000) {
 115         *d++ = (( uv >> 24)         | 0xf8);
 116         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 117         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 118         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 119         *d++ = (( uv        & 0x3f) | 0x80);
 120         return d;
 121     }
 122     if (uv < 0x80000000) {
 123         *d++ = (( uv >> 30)         | 0xfc);
 124         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 125         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 126         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 127         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 128         *d++ = (( uv        & 0x3f) | 0x80);
 129         return d;
 130     }
 131 #ifdef HAS_QUAD
 132     if (uv < UTF8_QUAD_MAX)
 133 #endif
 134     {
 135         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 136         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 137         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 138         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 139         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 140         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 141         *d++ = (( uv        & 0x3f) | 0x80);
 142         return d;
 143     }
 144 #ifdef HAS_QUAD
 145     {
 146         *d++ =                        0xff;     /* Can't match U+FFFE! */
 147         *d++ =                        0x80;     /* 6 Reserved bits */
 148         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 149         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 150         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 151         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 152         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 153         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 154         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 155         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 156         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 157         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 158         *d++ = (( uv        & 0x3f) | 0x80);
 159         return d;
 160     }
 161 #endif
 162 #endif /* Loop style */
 163 }
 164
 165 U8 *
 166 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 167 {
 168     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 169 }
 170
 171
 172 /*
 173 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 174
 175 Tests if some arbitrary number of bytes begins in a valid UTF-8
 176 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 177 The actual number of bytes in the UTF-8 character will be returned if
 178 it is valid, otherwise 0.
 179
 180 =cut
 181 */
 182 STRLEN
 183 Perl_is_utf8_char(pTHX_ U8 *s)
 184 {
 185     U8 u = *s;
 186     STRLEN slen, len;
 187     UV uv, ouv;
 188
 189     if (UTF8_IS_INVARIANT(u))
 190         return 1;
 191
 192     if (!UTF8_IS_START(u))
 193         return 0;
 194
 195     len = UTF8SKIP(s);
 196
 197     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 198         return 0;
 199
 200     slen = len - 1;
 201     s++;
 202     u &= UTF_START_MASK(len);
 203     uv  = u;
 204     ouv = uv;
 205     while (slen--) {
 206         if (!UTF8_IS_CONTINUATION(*s))
 207             return 0;
 208         uv = UTF8_ACCUMULATE(uv, *s);
 209         if (uv < ouv)
 210             return 0;
 211         ouv = uv;
 212         s++;
 213     }
 214
 215     if (UNISKIP(uv) < len)
 216         return 0;
 217
 218     return len;
 219 }
 220
 221 /*
 222 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 223
 224 Returns true if first C<len> bytes of the given string form a valid UTF8
 225 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 226 'a string that contains UTF8' because a valid ASCII string is a valid
 227 UTF8 string.
 228
 229 =cut
 230 */
 231
 232 bool
 233 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 234 {
 235     U8* x = s;
 236     U8* send;
 237     STRLEN c;
 238
 239     if (!len)
 240         len = strlen((char *)s);
 241     send = s + len;
 242
 243     while (x < send) {
 244         c = is_utf8_char(x);
 245         if (!c)
 246             return FALSE;
 247         x += c;
 248     }
 249     if (x != send)
 250         return FALSE;
 251
 252     return TRUE;
 253 }
 254
 255 /*
 256 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 257
 258 Bottom level UTF-8 decode routine.
 259 Returns the unicode code point value of the first character in the string C<s>
 260 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 261 C<retlen> will be set to the length, in bytes, of that character.
 262
 263 If C<s> does not point to a well-formed UTF8 character, the behaviour
 264 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 265 it is assumed that the caller will raise a warning, and this function
 266 will silently just set C<retlen> to C<-1> and return zero.  If the
 267 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 268 malformations will be given, C<retlen> will be set to the expected
 269 length of the UTF-8 character in bytes, and zero will be returned.
 270
 271 The C<flags> can also contain various flags to allow deviations from
 272 the strict UTF-8 encoding (see F<utf8.h>).
 273
 274 Most code should use utf8_to_uvchr() rather than call this directly.
 275
 276 =cut
 277 */
 278
 279 UV
 280 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 281 {
 282     U8 *s0 = s;
 283     UV uv = *s, ouv = 0;
 284     STRLEN len = 1;
 285     bool dowarn = ckWARN_d(WARN_UTF8);
 286     UV startbyte = *s;
 287     STRLEN expectlen = 0;
 288     U32 warning = 0;
 289
 290 /* This list is a superset of the UTF8_ALLOW_XXX. */
 291
 292 #define UTF8_WARN_EMPTY                          1
 293 #define UTF8_WARN_CONTINUATION                   2
 294 #define UTF8_WARN_NON_CONTINUATION               3
 295 #define UTF8_WARN_FE_FF                          4
 296 #define UTF8_WARN_SHORT                          5
 297 #define UTF8_WARN_OVERFLOW                       6
 298 #define UTF8_WARN_SURROGATE                      7
 299 #define UTF8_WARN_BOM                            8
 300 #define UTF8_WARN_LONG                           9
 301 #define UTF8_WARN_FFFF                          10
 302
 303     if (curlen == 0 &&
 304         !(flags & UTF8_ALLOW_EMPTY)) {
 305         warning = UTF8_WARN_EMPTY;
 306         goto malformed;
 307     }
 308
 309     if (UTF8_IS_INVARIANT(uv)) {
 310         if (retlen)
 311             *retlen = 1;
 312         return (UV) (NATIVE_TO_UTF(*s));
 313     }
 314
 315     if (UTF8_IS_CONTINUATION(uv) &&
 316         !(flags & UTF8_ALLOW_CONTINUATION)) {
 317         warning = UTF8_WARN_CONTINUATION;
 318         goto malformed;
 319     }
 320
 321     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 322         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 323         warning = UTF8_WARN_NON_CONTINUATION;
 324         goto malformed;
 325     }
 326
 327 #ifdef EBCDIC
 328     uv = NATIVE_TO_UTF(uv);
 329 #else
 330     if ((uv == 0xfe || uv == 0xff) &&
 331         !(flags & UTF8_ALLOW_FE_FF)) {
 332         warning = UTF8_WARN_FE_FF;
 333         goto malformed;
 334     }
 335 #endif
 336
 337     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 338     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 339     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 340     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 341 #ifdef EBCDIC
 342     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 343     else                        { len =  7; uv &= 0x01; }
 344 #else
 345     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 346     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 347     else                        { len = 13; uv = 0; } /* whoa! */
 348 #endif
 349
 350     if (retlen)
 351         *retlen = len;
 352
 353     expectlen = len;
 354
 355     if ((curlen < expectlen) &&
 356         !(flags & UTF8_ALLOW_SHORT)) {
 357         warning = UTF8_WARN_SHORT;
 358         goto malformed;
 359     }
 360
 361     len--;
 362     s++;
 363     ouv = uv;
 364
 365     while (len--) {
 366         if (!UTF8_IS_CONTINUATION(*s) &&
 367             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 368             s--;
 369             warning = UTF8_WARN_NON_CONTINUATION;
 370             goto malformed;
 371         }
 372         else
 373             uv = UTF8_ACCUMULATE(uv, *s);
 374         if (!(uv > ouv)) {
 375             /* These cannot be allowed. */
 376             if (uv == ouv) {
 377                 if (!(flags & UTF8_ALLOW_LONG)) {
 378                     warning = UTF8_WARN_LONG;
 379                     goto malformed;
 380                 }
 381             }
 382             else { /* uv < ouv */
 383                 /* This cannot be allowed. */
 384                 warning = UTF8_WARN_OVERFLOW;
 385                 goto malformed;
 386             }
 387         }
 388         s++;
 389         ouv = uv;
 390     }
 391
 392     if (UNICODE_IS_SURROGATE(uv) &&
 393         !(flags & UTF8_ALLOW_SURROGATE)) {
 394         warning = UTF8_WARN_SURROGATE;
 395         goto malformed;
 396     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 397                !(flags & UTF8_ALLOW_BOM)) {
 398         warning = UTF8_WARN_BOM;
 399         goto malformed;
 400     } else if ((expectlen > UNISKIP(uv)) &&
 401                !(flags & UTF8_ALLOW_LONG)) {
 402         warning = UTF8_WARN_LONG;
 403         goto malformed;
 404     } else if (UNICODE_IS_ILLEGAL(uv) &&
 405                !(flags & UTF8_ALLOW_FFFF)) {
 406         warning = UTF8_WARN_FFFF;
 407         goto malformed;
 408     }
 409
 410     return uv;
 411
 412 malformed:
 413
 414     if (flags & UTF8_CHECK_ONLY) {
 415         if (retlen)
 416             *retlen = -1;
 417         return 0;
 418     }
 419
 420     if (dowarn) {
 421         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 422
 423         switch (warning) {
 424         case 0: /* Intentionally empty. */ break;
 425         case UTF8_WARN_EMPTY:
 426             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 427             break;
 428         case UTF8_WARN_CONTINUATION:
 429             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 430             break;
 431         case UTF8_WARN_NON_CONTINUATION:
 432             if (s == s0)
 433                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 434                            (UV)s[1], startbyte);
 435             else
 436                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 437                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 438
 439             break;
 440         case UTF8_WARN_FE_FF:
 441             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 442             break;
 443         case UTF8_WARN_SHORT:
 444             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 445                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 446             expectlen = curlen;         /* distance for caller to skip */
 447             break;
 448         case UTF8_WARN_OVERFLOW:
 449             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 450                            ouv, *s, startbyte);
 451             break;
 452         case UTF8_WARN_SURROGATE:
 453             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 454             break;
 455         case UTF8_WARN_BOM:
 456             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 457             break;
 458         case UTF8_WARN_LONG:
 459             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 460                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 461             break;
 462         case UTF8_WARN_FFFF:
 463             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 464             break;
 465         default:
 466             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 467             break;
 468         }
 469
 470         if (warning) {
 471             char *s = SvPVX(sv);
 472
 473             if (PL_op)
 474                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 475                             "%s in %s", s,  OP_DESC(PL_op));
 476             else
 477                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 478         }
 479     }
 480
 481     if (retlen)
 482         *retlen = expectlen ? expectlen : len;
 483
 484     return 0;
 485 }
 486
 487 /*
 488 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 489
 490 Returns the native character value of the first character in the string C<s>
 491 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 492 length, in bytes, of that character.
 493
 494 If C<s> does not point to a well-formed UTF8 character, zero is
 495 returned and retlen is set, if possible, to -1.
 496
 497 =cut
 498 */
 499
 500 UV
 501 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 502 {
 503     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 504 }
 505
 506 /*
 507 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 508
 509 Returns the Unicode code point of the first character in the string C<s>
 510 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 511 length, in bytes, of that character.
 512
 513 This function should only be used when returned UV is considered
 514 an index into the Unicode semantic tables (e.g. swashes).
 515
 516 If C<s> does not point to a well-formed UTF8 character, zero is
 517 returned and retlen is set, if possible, to -1.
 518
 519 =cut
 520 */
 521
 522 UV
 523 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 524 {
 525     /* Call the low level routine asking for checks */
 526     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 527 }
 528
 529 /*
 530 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 531
 532 Return the length of the UTF-8 char encoded string C<s> in characters.
 533 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 534 up past C<e>, croaks.
 535
 536 =cut
 537 */
 538
 539 STRLEN
 540 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 541 {
 542     STRLEN len = 0;
 543
 544     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 545      * the bitops (especially ~) can create illegal UTF-8.
 546      * In other words: in Perl UTF-8 is not just for Unicode. */
 547
 548     if (e < s) {
 549         if (ckWARN_d(WARN_UTF8)) {
 550             if (PL_op)
 551                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 552                             "%s in %s", unees, OP_DESC(PL_op));
 553             else
 554                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 555         }
 556         return 0;
 557     }
 558     while (s < e) {
 559         U8 t = UTF8SKIP(s);
 560
 561         if (e - s < t) {
 562             if (ckWARN_d(WARN_UTF8)) {
 563                 if (PL_op)
 564                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 565                                 unees, OP_DESC(PL_op));
 566                 else
 567                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 568             }
 569             return len;
 570         }
 571         s += t;
 572         len++;
 573     }
 574
 575     return len;
 576 }
 577
 578 /*
 579 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 580
 581 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 582 and C<b>.
 583
 584 WARNING: use only if you *know* that the pointers point inside the
 585 same UTF-8 buffer.
 586
 587 =cut
 588 */
 589
 590 IV
 591 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 592 {
 593     IV off = 0;
 594
 595     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 596      * the bitops (especially ~) can create illegal UTF-8.
 597      * In other words: in Perl UTF-8 is not just for Unicode. */
 598
 599     if (a < b) {
 600         while (a < b) {
 601             U8 c = UTF8SKIP(a);
 602
 603             if (b - a < c) {
 604                 if (ckWARN_d(WARN_UTF8)) {
 605                     if (PL_op)
 606                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 607                                     "%s in %s", unees, OP_DESC(PL_op));
 608                     else
 609                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 610                 }
 611                 return off;
 612             }
 613             a += c;
 614             off--;
 615         }
 616     }
 617     else {
 618         while (b < a) {
 619             U8 c = UTF8SKIP(b);
 620
 621             if (a - b < c) {
 622                 if (ckWARN_d(WARN_UTF8)) {
 623                     if (PL_op)
 624                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 625                                     "%s in %s", unees, OP_DESC(PL_op));
 626                     else
 627                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 628                 }
 629                 return off;
 630             }
 631             b += c;
 632             off++;
 633         }
 634     }
 635
 636     return off;
 637 }
 638
 639 /*
 640 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 641
 642 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 643 forward or backward.
 644
 645 WARNING: do not use the following unless you *know* C<off> is within
 646 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 647 on the first byte of character or just after the last byte of a character.
 648
 649 =cut
 650 */
 651
 652 U8 *
 653 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 654 {
 655     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 656      * the bitops (especially ~) can create illegal UTF-8.
 657      * In other words: in Perl UTF-8 is not just for Unicode. */
 658
 659     if (off >= 0) {
 660         while (off--)
 661             s += UTF8SKIP(s);
 662     }
 663     else {
 664         while (off++) {
 665             s--;
 666             while (UTF8_IS_CONTINUATION(*s))
 667                 s--;
 668         }
 669     }
 670     return s;
 671 }
 672
 673 /*
 674 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 675
 676 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 677 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 678 updates len to contain the new length.
 679 Returns zero on failure, setting C<len> to -1.
 680
 681 =cut
 682 */
 683
 684 U8 *
 685 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 686 {
 687     U8 *send;
 688     U8 *d;
 689     U8 *save = s;
 690
 691     /* ensure valid UTF8 and chars < 256 before updating string */
 692     for (send = s + *len; s < send; ) {
 693         U8 c = *s++;
 694
 695         if (!UTF8_IS_INVARIANT(c) &&
 696             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 697              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 698             *len = -1;
 699             return 0;
 700         }
 701     }
 702
 703     d = s = save;
 704     while (s < send) {
 705         STRLEN ulen;
 706         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 707         s += ulen;
 708     }
 709     *d = '\0';
 710     *len = d - save;
 711     return save;
 712 }
 713
 714 /*
 715 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 716
 717 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 718 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 719 the newly-created string, and updates C<len> to contain the new
 720 length.  Returns the original string if no conversion occurs, C<len>
 721 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 722 0 if C<s> is converted or contains all 7bit characters.
 723
 724 =cut
 725 */
 726
 727 U8 *
 728 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 729 {
 730     U8 *d;
 731     U8 *start = s;
 732     U8 *send;
 733     I32 count = 0;
 734
 735     if (!*is_utf8)
 736         return start;
 737
 738     /* ensure valid UTF8 and chars < 256 before converting string */
 739     for (send = s + *len; s < send;) {
 740         U8 c = *s++;
 741         if (!UTF8_IS_INVARIANT(c)) {
 742             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 743                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 744                 count++;
 745             else
 746                 return start;
 747         }
 748     }
 749
 750     *is_utf8 = 0;
 751
 752     Newz(801, d, (*len) - count + 1, U8);
 753     s = start; start = d;
 754     while (s < send) {
 755         U8 c = *s++;
 756         if (!UTF8_IS_INVARIANT(c)) {
 757             /* Then it is two-byte encoded */
 758             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 759             c = ASCII_TO_NATIVE(c);
 760         }
 761         *d++ = c;
 762     }
 763     *d = '\0';
 764     *len = d - start;
 765     return start;
 766 }
 767
 768 /*
 769 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 770
 771 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 772 Returns a pointer to the newly-created string, and sets C<len> to
 773 reflect the new length.
 774
 775 =cut
 776 */
 777
 778 U8*
 779 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 780 {
 781     U8 *send;
 782     U8 *d;
 783     U8 *dst;
 784     send = s + (*len);
 785
 786     Newz(801, d, (*len) * 2 + 1, U8);
 787     dst = d;
 788
 789     while (s < send) {
 790         UV uv = NATIVE_TO_ASCII(*s++);
 791         if (UNI_IS_INVARIANT(uv))
 792             *d++ = UTF_TO_NATIVE(uv);
 793         else {
 794             *d++ = UTF8_EIGHT_BIT_HI(uv);
 795             *d++ = UTF8_EIGHT_BIT_LO(uv);
 796         }
 797     }
 798     *d = '\0';
 799     *len = d-dst;
 800     return dst;
 801 }
 802
 803 /*
 804  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 805  *
 806  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 807  * We optimize for native, for obvious reasons. */
 808
 809 U8*
 810 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 811 {
 812     U8* pend;
 813     U8* dstart = d;
 814
 815     if (bytelen & 1)
 816         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 817
 818     pend = p + bytelen;
 819
 820     while (p < pend) {
 821         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 822         p += 2;
 823         if (uv < 0x80) {
 824             *d++ = uv;
 825             continue;
 826         }
 827         if (uv < 0x800) {
 828             *d++ = (( uv >>  6)         | 0xc0);
 829             *d++ = (( uv        & 0x3f) | 0x80);
 830             continue;
 831         }
 832         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 833             UV low = *p++;
 834             if (low < 0xdc00 || low >= 0xdfff)
 835                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 836             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 837         }
 838         if (uv < 0x10000) {
 839             *d++ = (( uv >> 12)         | 0xe0);
 840             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 841             *d++ = (( uv        & 0x3f) | 0x80);
 842             continue;
 843         }
 844         else {
 845             *d++ = (( uv >> 18)         | 0xf0);
 846             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 847             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 848             *d++ = (( uv        & 0x3f) | 0x80);
 849             continue;
 850         }
 851     }
 852     *newlen = d - dstart;
 853     return d;
 854 }
 855
 856 /* Note: this one is slightly destructive of the source. */
 857
 858 U8*
 859 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 860 {
 861     U8* s = (U8*)p;
 862     U8* send = s + bytelen;
 863     while (s < send) {
 864         U8 tmp = s[0];
 865         s[0] = s[1];
 866         s[1] = tmp;
 867         s += 2;
 868     }
 869     return utf16_to_utf8(p, d, bytelen, newlen);
 870 }
 871
 872 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 873
 874 bool
 875 Perl_is_uni_alnum(pTHX_ UV c)
 876 {
 877     U8 tmpbuf[UTF8_MAXLEN+1];
 878     uvchr_to_utf8(tmpbuf, c);
 879     return is_utf8_alnum(tmpbuf);
 880 }
 881
 882 bool
 883 Perl_is_uni_alnumc(pTHX_ UV c)
 884 {
 885     U8 tmpbuf[UTF8_MAXLEN+1];
 886     uvchr_to_utf8(tmpbuf, c);
 887     return is_utf8_alnumc(tmpbuf);
 888 }
 889
 890 bool
 891 Perl_is_uni_idfirst(pTHX_ UV c)
 892 {
 893     U8 tmpbuf[UTF8_MAXLEN+1];
 894     uvchr_to_utf8(tmpbuf, c);
 895     return is_utf8_idfirst(tmpbuf);
 896 }
 897
 898 bool
 899 Perl_is_uni_alpha(pTHX_ UV c)
 900 {
 901     U8 tmpbuf[UTF8_MAXLEN+1];
 902     uvchr_to_utf8(tmpbuf, c);
 903     return is_utf8_alpha(tmpbuf);
 904 }
 905
 906 bool
 907 Perl_is_uni_ascii(pTHX_ UV c)
 908 {
 909     U8 tmpbuf[UTF8_MAXLEN+1];
 910     uvchr_to_utf8(tmpbuf, c);
 911     return is_utf8_ascii(tmpbuf);
 912 }
 913
 914 bool
 915 Perl_is_uni_space(pTHX_ UV c)
 916 {
 917     U8 tmpbuf[UTF8_MAXLEN+1];
 918     uvchr_to_utf8(tmpbuf, c);
 919     return is_utf8_space(tmpbuf);
 920 }
 921
 922 bool
 923 Perl_is_uni_digit(pTHX_ UV c)
 924 {
 925     U8 tmpbuf[UTF8_MAXLEN+1];
 926     uvchr_to_utf8(tmpbuf, c);
 927     return is_utf8_digit(tmpbuf);
 928 }
 929
 930 bool
 931 Perl_is_uni_upper(pTHX_ UV c)
 932 {
 933     U8 tmpbuf[UTF8_MAXLEN+1];
 934     uvchr_to_utf8(tmpbuf, c);
 935     return is_utf8_upper(tmpbuf);
 936 }
 937
 938 bool
 939 Perl_is_uni_lower(pTHX_ UV c)
 940 {
 941     U8 tmpbuf[UTF8_MAXLEN+1];
 942     uvchr_to_utf8(tmpbuf, c);
 943     return is_utf8_lower(tmpbuf);
 944 }
 945
 946 bool
 947 Perl_is_uni_cntrl(pTHX_ UV c)
 948 {
 949     U8 tmpbuf[UTF8_MAXLEN+1];
 950     uvchr_to_utf8(tmpbuf, c);
 951     return is_utf8_cntrl(tmpbuf);
 952 }
 953
 954 bool
 955 Perl_is_uni_graph(pTHX_ UV c)
 956 {
 957     U8 tmpbuf[UTF8_MAXLEN+1];
 958     uvchr_to_utf8(tmpbuf, c);
 959     return is_utf8_graph(tmpbuf);
 960 }
 961
 962 bool
 963 Perl_is_uni_print(pTHX_ UV c)
 964 {
 965     U8 tmpbuf[UTF8_MAXLEN+1];
 966     uvchr_to_utf8(tmpbuf, c);
 967     return is_utf8_print(tmpbuf);
 968 }
 969
 970 bool
 971 Perl_is_uni_punct(pTHX_ UV c)
 972 {
 973     U8 tmpbuf[UTF8_MAXLEN+1];
 974     uvchr_to_utf8(tmpbuf, c);
 975     return is_utf8_punct(tmpbuf);
 976 }
 977
 978 bool
 979 Perl_is_uni_xdigit(pTHX_ UV c)
 980 {
 981     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 982     uvchr_to_utf8(tmpbuf, c);
 983     return is_utf8_xdigit(tmpbuf);
 984 }
 985
 986 UV
 987 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 988 {
 989     uvchr_to_utf8(p, c);
 990     return to_utf8_upper(p, p, lenp);
 991 }
 992
 993 UV
 994 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 995 {
 996     uvchr_to_utf8(p, c);
 997     return to_utf8_title(p, p, lenp);
 998 }
 999
1000 UV
1001 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1002 {
1003     uvchr_to_utf8(p, c);
1004     return to_utf8_lower(p, p, lenp);
1005 }
1006
1007 UV
1008 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1009 {
1010     uvchr_to_utf8(p, c);
1011     return to_utf8_fold(p, p, lenp);
1012 }
1013
1014 /* for now these all assume no locale info available for Unicode > 255 */
1015
1016 bool
1017 Perl_is_uni_alnum_lc(pTHX_ UV c)
1018 {
1019     return is_uni_alnum(c);     /* XXX no locale support yet */
1020 }
1021
1022 bool
1023 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1024 {
1025     return is_uni_alnumc(c);    /* XXX no locale support yet */
1026 }
1027
1028 bool
1029 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1030 {
1031     return is_uni_idfirst(c);   /* XXX no locale support yet */
1032 }
1033
1034 bool
1035 Perl_is_uni_alpha_lc(pTHX_ UV c)
1036 {
1037     return is_uni_alpha(c);     /* XXX no locale support yet */
1038 }
1039
1040 bool
1041 Perl_is_uni_ascii_lc(pTHX_ UV c)
1042 {
1043     return is_uni_ascii(c);     /* XXX no locale support yet */
1044 }
1045
1046 bool
1047 Perl_is_uni_space_lc(pTHX_ UV c)
1048 {
1049     return is_uni_space(c);     /* XXX no locale support yet */
1050 }
1051
1052 bool
1053 Perl_is_uni_digit_lc(pTHX_ UV c)
1054 {
1055     return is_uni_digit(c);     /* XXX no locale support yet */
1056 }
1057
1058 bool
1059 Perl_is_uni_upper_lc(pTHX_ UV c)
1060 {
1061     return is_uni_upper(c);     /* XXX no locale support yet */
1062 }
1063
1064 bool
1065 Perl_is_uni_lower_lc(pTHX_ UV c)
1066 {
1067     return is_uni_lower(c);     /* XXX no locale support yet */
1068 }
1069
1070 bool
1071 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1072 {
1073     return is_uni_cntrl(c);     /* XXX no locale support yet */
1074 }
1075
1076 bool
1077 Perl_is_uni_graph_lc(pTHX_ UV c)
1078 {
1079     return is_uni_graph(c);     /* XXX no locale support yet */
1080 }
1081
1082 bool
1083 Perl_is_uni_print_lc(pTHX_ UV c)
1084 {
1085     return is_uni_print(c);     /* XXX no locale support yet */
1086 }
1087
1088 bool
1089 Perl_is_uni_punct_lc(pTHX_ UV c)
1090 {
1091     return is_uni_punct(c);     /* XXX no locale support yet */
1092 }
1093
1094 bool
1095 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1096 {
1097     return is_uni_xdigit(c);    /* XXX no locale support yet */
1098 }
1099
1100 U32
1101 Perl_to_uni_upper_lc(pTHX_ U32 c)
1102 {
1103     /* XXX returns only the first character -- do not use XXX */
1104     /* XXX no locale support yet */
1105     STRLEN len;
1106     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1107     return (U32)to_uni_upper(c, tmpbuf, &len);
1108 }
1109
1110 U32
1111 Perl_to_uni_title_lc(pTHX_ U32 c)
1112 {
1113     /* XXX returns only the first character XXX -- do not use XXX */
1114     /* XXX no locale support yet */
1115     STRLEN len;
1116     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1117     return (U32)to_uni_title(c, tmpbuf, &len);
1118 }
1119
1120 U32
1121 Perl_to_uni_lower_lc(pTHX_ U32 c)
1122 {
1123     /* XXX returns only the first character -- do not use XXX */
1124     /* XXX no locale support yet */
1125     STRLEN len;
1126     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1127     return (U32)to_uni_lower(c, tmpbuf, &len);
1128 }
1129
1130 bool
1131 Perl_is_utf8_alnum(pTHX_ U8 *p)
1132 {
1133     if (!is_utf8_char(p))
1134         return FALSE;
1135     if (!PL_utf8_alnum)
1136         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1137          * descendant of isalnum(3), in other words, it doesn't
1138          * contain the '_'. --jhi */
1139         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1140     return swash_fetch(PL_utf8_alnum, p, TRUE);
1141 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1142 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1143     if (!PL_utf8_alnum)
1144         PL_utf8_alnum = swash_init("utf8", "",
1145             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1146     return swash_fetch(PL_utf8_alnum, p, TRUE);
1147 #endif
1148 }
1149
1150 bool
1151 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1152 {
1153     if (!is_utf8_char(p))
1154         return FALSE;
1155     if (!PL_utf8_alnum)
1156         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1157     return swash_fetch(PL_utf8_alnum, p, TRUE);
1158 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1159 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1160     if (!PL_utf8_alnum)
1161         PL_utf8_alnum = swash_init("utf8", "",
1162             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1163     return swash_fetch(PL_utf8_alnum, p, TRUE);
1164 #endif
1165 }
1166
1167 bool
1168 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1169 {
1170     return *p == '_' || is_utf8_alpha(p);
1171 }
1172
1173 bool
1174 Perl_is_utf8_alpha(pTHX_ U8 *p)
1175 {
1176     if (!is_utf8_char(p))
1177         return FALSE;
1178     if (!PL_utf8_alpha)
1179         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1180     return swash_fetch(PL_utf8_alpha, p, TRUE);
1181 }
1182
1183 bool
1184 Perl_is_utf8_ascii(pTHX_ U8 *p)
1185 {
1186     if (!is_utf8_char(p))
1187         return FALSE;
1188     if (!PL_utf8_ascii)
1189         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1190     return swash_fetch(PL_utf8_ascii, p, TRUE);
1191 }
1192
1193 bool
1194 Perl_is_utf8_space(pTHX_ U8 *p)
1195 {
1196     if (!is_utf8_char(p))
1197         return FALSE;
1198     if (!PL_utf8_space)
1199         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1200     return swash_fetch(PL_utf8_space, p, TRUE);
1201 }
1202
1203 bool
1204 Perl_is_utf8_digit(pTHX_ U8 *p)
1205 {
1206     if (!is_utf8_char(p))
1207         return FALSE;
1208     if (!PL_utf8_digit)
1209         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1210     return swash_fetch(PL_utf8_digit, p, TRUE);
1211 }
1212
1213 bool
1214 Perl_is_utf8_upper(pTHX_ U8 *p)
1215 {
1216     if (!is_utf8_char(p))
1217         return FALSE;
1218     if (!PL_utf8_upper)
1219         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1220     return swash_fetch(PL_utf8_upper, p, TRUE);
1221 }
1222
1223 bool
1224 Perl_is_utf8_lower(pTHX_ U8 *p)
1225 {
1226     if (!is_utf8_char(p))
1227         return FALSE;
1228     if (!PL_utf8_lower)
1229         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1230     return swash_fetch(PL_utf8_lower, p, TRUE);
1231 }
1232
1233 bool
1234 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1235 {
1236     if (!is_utf8_char(p))
1237         return FALSE;
1238     if (!PL_utf8_cntrl)
1239         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1240     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1241 }
1242
1243 bool
1244 Perl_is_utf8_graph(pTHX_ U8 *p)
1245 {
1246     if (!is_utf8_char(p))
1247         return FALSE;
1248     if (!PL_utf8_graph)
1249         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1250     return swash_fetch(PL_utf8_graph, p, TRUE);
1251 }
1252
1253 bool
1254 Perl_is_utf8_print(pTHX_ U8 *p)
1255 {
1256     if (!is_utf8_char(p))
1257         return FALSE;
1258     if (!PL_utf8_print)
1259         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1260     return swash_fetch(PL_utf8_print, p, TRUE);
1261 }
1262
1263 bool
1264 Perl_is_utf8_punct(pTHX_ U8 *p)
1265 {
1266     if (!is_utf8_char(p))
1267         return FALSE;
1268     if (!PL_utf8_punct)
1269         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1270     return swash_fetch(PL_utf8_punct, p, TRUE);
1271 }
1272
1273 bool
1274 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1275 {
1276     if (!is_utf8_char(p))
1277         return FALSE;
1278     if (!PL_utf8_xdigit)
1279         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1280     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1281 }
1282
1283 bool
1284 Perl_is_utf8_mark(pTHX_ U8 *p)
1285 {
1286     if (!is_utf8_char(p))
1287         return FALSE;
1288     if (!PL_utf8_mark)
1289         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1290     return swash_fetch(PL_utf8_mark, p, TRUE);
1291 }
1292
1293 /*
1294 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1295
1296 The "p" contains the pointer to the UTF-8 string encoding
1297 the character that is being converted.
1298
1299 The "ustrp" is a pointer to the character buffer to put the
1300 conversion result to.  The "lenp" is a pointer to the length
1301 of the result.
1302
1303 The "swashp" is a pointer to the swash to use.
1304
1305 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1306 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1307 but not always, a multicharacter mapping), is tried first.
1308
1309 The "special" is a string like "utf8::ToSpecLower", which means the
1310 hash %utf8::ToSpecLower.  The access to the hash is through
1311 Perl_to_utf8_case().
1312
1313 The "normal" is a string like "ToLower" which means the swash
1314 %utf8::ToLower.
1315
1316 =cut */
1317
1318 UV
1319 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1320 {
1321     UV uv0, uv1;
1322     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1323     STRLEN len = 0;
1324
1325     uv0 = utf8_to_uvchr(p, 0);
1326     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1327      * are necessary in EBCDIC, they are redundant no-ops
1328      * in ASCII-ish platforms, and hopefully optimized away. */
1329     uv1 = NATIVE_TO_UNI(uv0);
1330     uvuni_to_utf8(tmpbuf, uv1);
1331
1332     if (!*swashp) /* load on-demand */
1333          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1334
1335     if (special) {
1336          /* It might be "special" (sometimes, but not always,
1337           * a multicharacter mapping) */
1338          HV *hv;
1339          SV *keysv;
1340          HE *he;
1341          SV *val;
1342
1343          if ((hv    = get_hv(special, FALSE)) &&
1344              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1345              (he    = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1346              (val   = HeVAL(he))) {
1347              char *s;
1348
1349               s = SvPV(val, len);
1350               if (len == 1)
1351                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1352               else {
1353 #ifdef EBCDIC
1354                    /* If we have EBCDIC we need to remap the characters
1355                     * since any characters in the low 256 are Unicode
1356                     * code points, not EBCDIC. */
1357                    U8 *t = (U8*)s, *tend = t + len, *d;
1358
1359                    d = tmpbuf;
1360                    if (SvUTF8(val)) {
1361                         STRLEN tlen = 0;
1362
1363                         while (t < tend) {
1364                              UV c = utf8_to_uvchr(t, &tlen);
1365                              if (tlen > 0) {
1366                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1367                                   t += tlen;
1368                              }
1369                              else
1370                                   break;
1371                         }
1372                    }
1373                    else {
1374                         while (t < tend) {
1375                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1376                              t++;
1377                         }
1378                    }
1379                    len = d - tmpbuf;
1380                    Copy(tmpbuf, ustrp, len, U8);
1381 #else
1382                    Copy(s, ustrp, len, U8);
1383 #endif
1384               }
1385          }
1386     }
1387
1388     if (!len && *swashp) {
1389          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1390
1391          if (uv2) {
1392               /* It was "normal" (a single character mapping). */
1393               UV uv3 = UNI_TO_NATIVE(uv2);
1394
1395               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1396          }
1397     }
1398
1399     if (!len) /* Neither: just copy. */
1400          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1401
1402     if (lenp)
1403          *lenp = len;
1404
1405     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1406 }
1407
1408 /*
1409 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1410
1411 Convert the UTF-8 encoded character at p to its uppercase version and
1412 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1413 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1414 uppercase version may be longer than the original character (up to two
1415 characters).
1416
1417 The first character of the uppercased version is returned
1418 (but note, as explained above, that there may be more.)
1419
1420 =cut */
1421
1422 UV
1423 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1424 {
1425     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1426                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1427 }
1428
1429 /*
1430 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1431
1432 Convert the UTF-8 encoded character at p to its titlecase version and
1433 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1434 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1435 titlecase version may be longer than the original character (up to two
1436 characters).
1437
1438 The first character of the titlecased version is returned
1439 (but note, as explained above, that there may be more.)
1440
1441 =cut */
1442
1443 UV
1444 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1445 {
1446     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1447                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1448 }
1449
1450 /*
1451 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1452
1453 Convert the UTF-8 encoded character at p to its lowercase version and
1454 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1455 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1456 lowercase version may be longer than the original character (up to two
1457 characters).
1458
1459 The first character of the lowercased version is returned
1460 (but note, as explained above, that there may be more.)
1461
1462 =cut */
1463
1464 UV
1465 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1466 {
1467     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1468                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1469 }
1470
1471 /*
1472 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1473
1474 Convert the UTF-8 encoded character at p to its foldcase version and
1475 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1476 that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1477 foldcase version may be longer than the original character (up to
1478 three characters).
1479
1480 The first character of the foldcased version is returned
1481 (but note, as explained above, that there may be more.)
1482
1483 =cut */
1484
1485 UV
1486 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1487 {
1488     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1489                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1490 }
1491
1492 /* a "swash" is a swatch hash */
1493
1494 SV*
1495 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1496 {
1497     SV* retval;
1498     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1499     dSP;
1500     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1501     SV* errsv_save;
1502
1503     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1504         ENTER;
1505         errsv_save = newSVsv(ERRSV);
1506         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1507         if (!SvTRUE(ERRSV))
1508             sv_setsv(ERRSV, errsv_save);
1509         SvREFCNT_dec(errsv_save);
1510         LEAVE;
1511     }
1512     SPAGAIN;
1513     PUSHSTACKi(PERLSI_MAGIC);
1514     PUSHMARK(SP);
1515     EXTEND(SP,5);
1516     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1517     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1518     PUSHs(listsv);
1519     PUSHs(sv_2mortal(newSViv(minbits)));
1520     PUSHs(sv_2mortal(newSViv(none)));
1521     PUTBACK;
1522     ENTER;
1523     SAVEI32(PL_hints);
1524     PL_hints = 0;
1525     save_re_context();
1526     if (PL_curcop == &PL_compiling)
1527         /* XXX ought to be handled by lex_start */
1528         sv_setpv(tokenbufsv, PL_tokenbuf);
1529     errsv_save = newSVsv(ERRSV);
1530     if (call_method("SWASHNEW", G_SCALAR))
1531         retval = newSVsv(*PL_stack_sp--);
1532     else
1533         retval = &PL_sv_undef;
1534     if (!SvTRUE(ERRSV))
1535         sv_setsv(ERRSV, errsv_save);
1536     SvREFCNT_dec(errsv_save);
1537     LEAVE;
1538     POPSTACK;
1539     if (PL_curcop == &PL_compiling) {
1540         STRLEN len;
1541         char* pv = SvPV(tokenbufsv, len);
1542
1543         Copy(pv, PL_tokenbuf, len+1, char);
1544         PL_curcop->op_private = PL_hints;
1545     }
1546     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1547         if (SvPOK(retval))
1548             Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1549                        SvPV_nolen(retval));
1550         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1551     }
1552     return retval;
1553 }
1554
1555
1556 /* This API is wrong for special case conversions since we may need to
1557  * return several Unicode characters for a single Unicode character
1558  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1559  * the lower-level routine, and it is similarly broken for returning
1560  * multiple values.  --jhi */
1561 UV
1562 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1563 {
1564     HV* hv = (HV*)SvRV(sv);
1565     U32 klen;
1566     U32 off;
1567     STRLEN slen;
1568     STRLEN needents;
1569     U8 *tmps = NULL;
1570     U32 bit;
1571     SV *retval;
1572     U8 tmputf8[2];
1573     UV c = NATIVE_TO_ASCII(*ptr);
1574
1575     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1576         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1577         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1578         ptr = tmputf8;
1579     }
1580     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1581      * then the "swatch" is a vec() for al the chars which start
1582      * with 0xAA..0xYY
1583      * So the key in the hash (klen) is length of encoded char -1
1584      */
1585     klen = UTF8SKIP(ptr) - 1;
1586     off  = ptr[klen];
1587
1588     if (klen == 0)
1589      {
1590       /* If char in invariant then swatch is for all the invariant chars
1591        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1592        */
1593       needents = UTF_CONTINUATION_MARK;
1594       off      = NATIVE_TO_UTF(ptr[klen]);
1595      }
1596     else
1597      {
1598       /* If char is encoded then swatch is for the prefix */
1599       needents = (1 << UTF_ACCUMULATION_SHIFT);
1600       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1601      }
1602
1603     /*
1604      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1605      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1606      * it's nothing to sniff at.)  Pity we usually come through at least
1607      * two function calls to get here...
1608      *
1609      * NB: this code assumes that swatches are never modified, once generated!
1610      */
1611
1612     if (hv   == PL_last_swash_hv &&
1613         klen == PL_last_swash_klen &&
1614         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1615     {
1616         tmps = PL_last_swash_tmps;
1617         slen = PL_last_swash_slen;
1618     }
1619     else {
1620         /* Try our second-level swatch cache, kept in a hash. */
1621         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1622
1623         /* If not cached, generate it via utf8::SWASHGET */
1624         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1625             dSP;
1626             /* We use utf8n_to_uvuni() as we want an index into
1627                Unicode tables, not a native character number.
1628              */
1629             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1630             SV *errsv_save;
1631             ENTER;
1632             SAVETMPS;
1633             save_re_context();
1634             PUSHSTACKi(PERLSI_MAGIC);
1635             PUSHMARK(SP);
1636             EXTEND(SP,3);
1637             PUSHs((SV*)sv);
1638             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1639             PUSHs(sv_2mortal(newSViv((klen) ?
1640                                      (code_point & ~(needents - 1)) : 0)));
1641             PUSHs(sv_2mortal(newSViv(needents)));
1642             PUTBACK;
1643             errsv_save = newSVsv(ERRSV);
1644             if (call_method("SWASHGET", G_SCALAR))
1645                 retval = newSVsv(*PL_stack_sp--);
1646             else
1647                 retval = &PL_sv_undef;
1648             if (!SvTRUE(ERRSV))
1649                 sv_setsv(ERRSV, errsv_save);
1650             SvREFCNT_dec(errsv_save);
1651             POPSTACK;
1652             FREETMPS;
1653             LEAVE;
1654             if (PL_curcop == &PL_compiling)
1655                 PL_curcop->op_private = PL_hints;
1656
1657             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1658
1659             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1660                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1661         }
1662
1663         PL_last_swash_hv = hv;
1664         PL_last_swash_klen = klen;
1665         PL_last_swash_tmps = tmps;
1666         PL_last_swash_slen = slen;
1667         if (klen)
1668             Copy(ptr, PL_last_swash_key, klen, U8);
1669     }
1670
1671     switch ((int)((slen << 3) / needents)) {
1672     case 1:
1673         bit = 1 << (off & 7);
1674         off >>= 3;
1675         return (tmps[off] & bit) != 0;
1676     case 8:
1677         return tmps[off];
1678     case 16:
1679         off <<= 1;
1680         return (tmps[off] << 8) + tmps[off + 1] ;
1681     case 32:
1682         off <<= 2;
1683         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1684     }
1685     Perl_croak(aTHX_ "panic: swash_fetch");
1686     return 0;
1687 }
1688
1689
1690 /*
1691 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1692
1693 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1694 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1695 bytes available. The return value is the pointer to the byte after the
1696 end of the new character. In other words,
1697
1698     d = uvchr_to_utf8(d, uv);
1699
1700 is the recommended wide native character-aware way of saying
1701
1702     *(d++) = uv;
1703
1704 =cut
1705 */
1706
1707 /* On ASCII machines this is normally a macro but we want a
1708    real function in case XS code wants it
1709 */
1710 #undef Perl_uvchr_to_utf8
1711 U8 *
1712 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1713 {
1714     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1715 }
1716
1717 U8 *
1718 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1719 {
1720     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1721 }
1722
1723 /*
1724 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1725
1726 Returns the native character value of the first character in the string C<s>
1727 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1728 length, in bytes, of that character.
1729
1730 Allows length and flags to be passed to low level routine.
1731
1732 =cut
1733 */
1734 /* On ASCII machines this is normally a macro but we want
1735    a real function in case XS code wants it
1736 */
1737 #undef Perl_utf8n_to_uvchr
1738 UV
1739 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1740 {
1741     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1742     return UNI_TO_NATIVE(uv);
1743 }
1744
1745 /*
1746 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1747
1748 Build to the scalar dsv a displayable version of the string spv,
1749 length len, the displayable version being at most pvlim bytes long
1750 (if longer, the rest is truncated and "..." will be appended).
1751
1752 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1753 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1754 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1755 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1756 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1757 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1758
1759 The pointer to the PV of the dsv is returned.
1760
1761 =cut */
1762 char *
1763 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1764 {
1765     int truncated = 0;
1766     char *s, *e;
1767
1768     sv_setpvn(dsv, "", 0);
1769     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1770          UV u;
1771          bool ok = FALSE;
1772
1773          if (pvlim && SvCUR(dsv) >= pvlim) {
1774               truncated++;
1775               break;
1776          }
1777          u = utf8_to_uvchr((U8*)s, 0);
1778          if (u < 256) {
1779              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1780                  switch (u & 0xFF) {
1781                  case '\n':
1782                      Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1783                  case '\r':
1784                      Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1785                  case '\t':
1786                      Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1787                  case '\f':
1788                      Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1789                  case '\a':
1790                      Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1791                  case '\\':
1792                      Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1793                  default: break;
1794                  }
1795              }
1796              /* isPRINT() is the locale-blind version. */
1797              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1798                  Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1799                  ok = TRUE;
1800              }
1801          }
1802          if (!ok)
1803              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1804     }
1805     if (truncated)
1806          sv_catpvn(dsv, "...", 3);
1807
1808     return SvPVX(dsv);
1809 }
1810
1811 /*
1812 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1813
1814 Build to the scalar dsv a displayable version of the scalar sv,
1815 the displayable version being at most pvlim bytes long
1816 (if longer, the rest is truncated and "..." will be appended).
1817
1818 The flags argument is as in pv_uni_display().
1819
1820 The pointer to the PV of the dsv is returned.
1821
1822 =cut */
1823 char *
1824 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1825 {
1826      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1827                                 pvlim, flags);
1828 }
1829
1830 /*
1831 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1832
1833 Return true if the strings s1 and s2 differ case-insensitively, false
1834 if not (if they are equal case-insensitively).  If u1 is true, the
1835 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1836 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1837 are false, the respective string is assumed to be in native 8-bit
1838 encoding.
1839
1840 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1841 in there (they will point at the beginning of the I<next> character).
1842 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1843 pointers beyond which scanning will not continue under any
1844 circustances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1845 s2+l2 will be used as goal end pointers that will also stop the scan,
1846 and which qualify towards defining a successful match: all the scans
1847 that define an explicit length must reach their goal pointers for
1848 a match to succeed).
1849
1850 For case-insensitiveness, the "casefolding" of Unicode is used
1851 instead of upper/lowercasing both the characters, see
1852 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1853
1854 =cut */
1855 I32
1856 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1857 {
1858      register U8 *p1  = (U8*)s1;
1859      register U8 *p2  = (U8*)s2;
1860      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1861      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1862      STRLEN n1 = 0, n2 = 0;
1863      U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1864      U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1865      U8 natbuf[1+1];
1866      STRLEN foldlen1, foldlen2;
1867      bool match;
1868
1869      if (pe1)
1870           e1 = *(U8**)pe1;
1871      if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1872           f1 = (U8*)s1 + l1;
1873      if (pe2)
1874           e2 = *(U8**)pe2;
1875      if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1876           f2 = (U8*)s2 + l2;
1877
1878      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1879           return 1; /* mismatch; possible infinite loop or false positive */
1880
1881      if (!u1 || !u2)
1882           natbuf[1] = 0; /* Need to terminate the buffer. */
1883
1884      while ((e1 == 0 || p1 < e1) &&
1885             (f1 == 0 || p1 < f1) &&
1886             (e2 == 0 || p2 < e2) &&
1887             (f2 == 0 || p2 < f2)) {
1888           if (n1 == 0) {
1889                if (u1)
1890                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1891                else {
1892                     natbuf[0] = *p1;
1893                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1894                }
1895                q1 = foldbuf1;
1896                n1 = foldlen1;
1897           }
1898           if (n2 == 0) {
1899                if (u2)
1900                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1901                else {
1902                     natbuf[0] = *p2;
1903                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1904                }
1905                q2 = foldbuf2;
1906                n2 = foldlen2;
1907           }
1908           while (n1 && n2) {
1909                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1910                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1911                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1912                    return 1; /* mismatch */
1913                n1 -= UTF8SKIP(q1);
1914                q1 += UTF8SKIP(q1);
1915                n2 -= UTF8SKIP(q2);
1916                q2 += UTF8SKIP(q2);
1917           }
1918           if (n1 == 0)
1919                p1 += u1 ? UTF8SKIP(p1) : 1;
1920           if (n2 == 0)
1921                p2 += u2 ? UTF8SKIP(p2) : 1;
1922
1923      }
1924
1925      /* A match is defined by all the scans that specified
1926       * an explicit length reaching their final goals. */
1927      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1928
1929      if (match) {
1930           if (pe1)
1931                *pe1 = (char*)p1;
1932           if (pe2)
1933                *pe2 = (char*)p2;
1934      }
1935
1936      return match ? 0 : 1; /* 0 match, 1 mismatch */
1937 }
1938