utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34
  35 #ifndef EBCDIC
  36 /* Separate prototypes needed because in ASCII systems these
  37  * usually macros but they still are compiled as code, too. */
  38 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  39 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  40 #endif
  41
  42 static const char unees[] =
  43     "Malformed UTF-8 character (unexpected end of string)";
  44
  45 /*
  46 =head1 Unicode Support
  47
  48 This file contains various utility functions for manipulating UTF8-encoded
  49 strings. For the uninitiated, this is a method of representing arbitrary
  50 Unicode characters as a variable number of bytes, in such a way that
  51 characters in the ASCII range are unmodified, and a zero byte never appears
  52 within non-zero characters.
  53
  54 =cut
  55 */
  56
  57 /*
  58 =for apidoc is_ascii_string
  59
  60 Returns true if first C<len> bytes of the given string are ASCII (i.e. none
  61 of them even raise the question of UTF-8-ness).
  62
  63 See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  64
  65 =cut
  66 */
  67
  68 bool
  69 Perl_is_ascii_string(const U8 *s, STRLEN len)
  70 {
  71     const U8* const send = s + (len ? len : strlen((const char *)s));
  72     const U8* x = s;
  73
  74     PERL_ARGS_ASSERT_IS_ASCII_STRING;
  75
  76     for (; x < send; ++x) {
  77         if (!UTF8_IS_INVARIANT(*x))
  78             break;
  79     }
  80
  81     return x == send;
  82 }
  83
  84 /*
  85 =for apidoc uvuni_to_utf8_flags
  86
  87 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  88 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  89 bytes available. The return value is the pointer to the byte after the
  90 end of the new character. In other words,
  91
  92     d = uvuni_to_utf8_flags(d, uv, flags);
  93
  94 or, in most cases,
  95
  96     d = uvuni_to_utf8(d, uv);
  97
  98 (which is equivalent to)
  99
 100     d = uvuni_to_utf8_flags(d, uv, 0);
 101
 102 is the recommended Unicode-aware way of saying
 103
 104     *(d++) = uv;
 105
 106 =cut
 107 */
 108
 109 U8 *
 110 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 111 {
 112     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
 113
 114     if (ckWARN(WARN_UTF8)) {
 115          if (UNICODE_IS_SURROGATE(uv) &&
 116              !(flags & UNICODE_ALLOW_SURROGATE))
 117               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
 118          else if (
 119                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
 120                     !(flags & UNICODE_ALLOW_FDD0))
 121                    ||
 122                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
 123                     !(flags & UNICODE_ALLOW_FFFF))) &&
 124                   /* UNICODE_ALLOW_SUPER includes
 125                    * FFFEs and FFFFs beyond 0x10FFFF. */
 126                   ((uv <= PERL_UNICODE_MAX) ||
 127                    !(flags & UNICODE_ALLOW_SUPER))
 128                   )
 129               Perl_warner(aTHX_ packWARN(WARN_UTF8),
 130                       "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
 131     }
 132     if (UNI_IS_INVARIANT(uv)) {
 133         *d++ = (U8)UTF_TO_NATIVE(uv);
 134         return d;
 135     }
 136 #if defined(EBCDIC)
 137     else {
 138         STRLEN len  = UNISKIP(uv);
 139         U8 *p = d+len-1;
 140         while (p > d) {
 141             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 142             uv >>= UTF_ACCUMULATION_SHIFT;
 143         }
 144         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 145         return d+len;
 146     }
 147 #else /* Non loop style */
 148     if (uv < 0x800) {
 149         *d++ = (U8)(( uv >>  6)         | 0xc0);
 150         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 151         return d;
 152     }
 153     if (uv < 0x10000) {
 154         *d++ = (U8)(( uv >> 12)         | 0xe0);
 155         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 156         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 157         return d;
 158     }
 159     if (uv < 0x200000) {
 160         *d++ = (U8)(( uv >> 18)         | 0xf0);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166     if (uv < 0x4000000) {
 167         *d++ = (U8)(( uv >> 24)         | 0xf8);
 168         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 169         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 170         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 171         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 172         return d;
 173     }
 174     if (uv < 0x80000000) {
 175         *d++ = (U8)(( uv >> 30)         | 0xfc);
 176         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 177         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 178         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 179         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 180         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 181         return d;
 182     }
 183 #ifdef HAS_QUAD
 184     if (uv < UTF8_QUAD_MAX)
 185 #endif
 186     {
 187         *d++ =                            0xfe; /* Can't match U+FEFF! */
 188         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 189         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 190         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 191         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 192         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 193         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 194         return d;
 195     }
 196 #ifdef HAS_QUAD
 197     {
 198         *d++ =                            0xff;         /* Can't match U+FFFE! */
 199         *d++ =                            0x80;         /* 6 Reserved bits */
 200         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 201         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 202         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 203         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 204         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 205         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 206         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 207         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 208         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 209         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 210         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 211         return d;
 212     }
 213 #endif
 214 #endif /* Loop style */
 215 }
 216
 217 /*
 218
 219 Tests if some arbitrary number of bytes begins in a valid UTF-8
 220 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 221 UTF-8 character.  The actual number of bytes in the UTF-8 character
 222 will be returned if it is valid, otherwise 0.
 223
 224 This is the "slow" version as opposed to the "fast" version which is
 225 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 226 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 227 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 228 you should use the _slow().  In practice this means that the _slow()
 229 will be used very rarely, since the maximum Unicode code point (as of
 230 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 231 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 232 five bytes or more.
 233
 234 =cut */
 235 STATIC STRLEN
 236 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 237 {
 238     U8 u = *s;
 239     STRLEN slen;
 240     UV uv, ouv;
 241
 242     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 243
 244     if (UTF8_IS_INVARIANT(u))
 245         return 1;
 246
 247     if (!UTF8_IS_START(u))
 248         return 0;
 249
 250     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 251         return 0;
 252
 253     slen = len - 1;
 254     s++;
 255 #ifdef EBCDIC
 256     u = NATIVE_TO_UTF(u);
 257 #endif
 258     u &= UTF_START_MASK(len);
 259     uv  = u;
 260     ouv = uv;
 261     while (slen--) {
 262         if (!UTF8_IS_CONTINUATION(*s))
 263             return 0;
 264         uv = UTF8_ACCUMULATE(uv, *s);
 265         if (uv < ouv)
 266             return 0;
 267         ouv = uv;
 268         s++;
 269     }
 270
 271     if ((STRLEN)UNISKIP(uv) < len)
 272         return 0;
 273
 274     return len;
 275 }
 276
 277 /*
 278 =for apidoc is_utf8_char
 279
 280 Tests if some arbitrary number of bytes begins in a valid UTF-8
 281 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
 282 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
 283 character will be returned if it is valid, otherwise 0.
 284
 285 =cut */
 286 STRLEN
 287 Perl_is_utf8_char(const U8 *s)
 288 {
 289     const STRLEN len = UTF8SKIP(s);
 290
 291     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 292 #ifdef IS_UTF8_CHAR
 293     if (IS_UTF8_CHAR_FAST(len))
 294         return IS_UTF8_CHAR(s, len) ? len : 0;
 295 #endif /* #ifdef IS_UTF8_CHAR */
 296     return is_utf8_char_slow(s, len);
 297 }
 298
 299
 300 /*
 301 =for apidoc is_utf8_string
 302
 303 Returns true if first C<len> bytes of the given string form a valid
 304 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 305 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 306 because a valid ASCII string is a valid UTF-8 string.
 307
 308 See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
 309
 310 =cut
 311 */
 312
 313 bool
 314 Perl_is_utf8_string(const U8 *s, STRLEN len)
 315 {
 316     const U8* const send = s + (len ? len : strlen((const char *)s));
 317     const U8* x = s;
 318
 319     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 320
 321     while (x < send) {
 322         STRLEN c;
 323          /* Inline the easy bits of is_utf8_char() here for speed... */
 324          if (UTF8_IS_INVARIANT(*x))
 325               c = 1;
 326          else if (!UTF8_IS_START(*x))
 327              goto out;
 328          else {
 329               /* ... and call is_utf8_char() only if really needed. */
 330 #ifdef IS_UTF8_CHAR
 331              c = UTF8SKIP(x);
 332              if (IS_UTF8_CHAR_FAST(c)) {
 333                  if (!IS_UTF8_CHAR(x, c))
 334                      c = 0;
 335              }
 336              else
 337                 c = is_utf8_char_slow(x, c);
 338 #else
 339              c = is_utf8_char(x);
 340 #endif /* #ifdef IS_UTF8_CHAR */
 341               if (!c)
 342                   goto out;
 343          }
 344         x += c;
 345     }
 346
 347  out:
 348     if (x != send)
 349         return FALSE;
 350
 351     return TRUE;
 352 }
 353
 354 /*
 355 Implemented as a macro in utf8.h
 356
 357 =for apidoc is_utf8_string_loc
 358
 359 Like is_utf8_string() but stores the location of the failure (in the
 360 case of "utf8ness failure") or the location s+len (in the case of
 361 "utf8ness success") in the C<ep>.
 362
 363 See also is_utf8_string_loclen() and is_utf8_string().
 364
 365 =for apidoc is_utf8_string_loclen
 366
 367 Like is_utf8_string() but stores the location of the failure (in the
 368 case of "utf8ness failure") or the location s+len (in the case of
 369 "utf8ness success") in the C<ep>, and the number of UTF-8
 370 encoded characters in the C<el>.
 371
 372 See also is_utf8_string_loc() and is_utf8_string().
 373
 374 =cut
 375 */
 376
 377 bool
 378 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 379 {
 380     const U8* const send = s + (len ? len : strlen((const char *)s));
 381     const U8* x = s;
 382     STRLEN c;
 383     STRLEN outlen = 0;
 384
 385     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 386
 387     while (x < send) {
 388          /* Inline the easy bits of is_utf8_char() here for speed... */
 389          if (UTF8_IS_INVARIANT(*x))
 390              c = 1;
 391          else if (!UTF8_IS_START(*x))
 392              goto out;
 393          else {
 394              /* ... and call is_utf8_char() only if really needed. */
 395 #ifdef IS_UTF8_CHAR
 396              c = UTF8SKIP(x);
 397              if (IS_UTF8_CHAR_FAST(c)) {
 398                  if (!IS_UTF8_CHAR(x, c))
 399                      c = 0;
 400              } else
 401                  c = is_utf8_char_slow(x, c);
 402 #else
 403              c = is_utf8_char(x);
 404 #endif /* #ifdef IS_UTF8_CHAR */
 405              if (!c)
 406                  goto out;
 407          }
 408          x += c;
 409          outlen++;
 410     }
 411
 412  out:
 413     if (el)
 414         *el = outlen;
 415
 416     if (ep)
 417         *ep = x;
 418     return (x == send);
 419 }
 420
 421 /*
 422
 423 =for apidoc utf8n_to_uvuni
 424
 425 Bottom level UTF-8 decode routine.
 426 Returns the Unicode code point value of the first character in the string C<s>
 427 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 428 C<retlen> will be set to the length, in bytes, of that character.
 429
 430 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 431 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 432 it is assumed that the caller will raise a warning, and this function
 433 will silently just set C<retlen> to C<-1> and return zero.  If the
 434 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 435 malformations will be given, C<retlen> will be set to the expected
 436 length of the UTF-8 character in bytes, and zero will be returned.
 437
 438 The C<flags> can also contain various flags to allow deviations from
 439 the strict UTF-8 encoding (see F<utf8.h>).
 440
 441 Most code should use utf8_to_uvchr() rather than call this directly.
 442
 443 =cut
 444 */
 445
 446 UV
 447 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 448 {
 449     dVAR;
 450     const U8 * const s0 = s;
 451     UV uv = *s, ouv = 0;
 452     STRLEN len = 1;
 453     const bool dowarn = ckWARN_d(WARN_UTF8);
 454     const UV startbyte = *s;
 455     STRLEN expectlen = 0;
 456     U32 warning = 0;
 457     SV* sv;
 458
 459     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 460
 461 /* This list is a superset of the UTF8_ALLOW_XXX.  BUT it isn't, eg SUPER missing XXX */
 462
 463 #define UTF8_WARN_EMPTY                          1
 464 #define UTF8_WARN_CONTINUATION                   2
 465 #define UTF8_WARN_NON_CONTINUATION               3
 466 #define UTF8_WARN_FE_FF                          4
 467 #define UTF8_WARN_SHORT                          5
 468 #define UTF8_WARN_OVERFLOW                       6
 469 #define UTF8_WARN_SURROGATE                      7
 470 #define UTF8_WARN_LONG                           8
 471 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 472
 473     if (curlen == 0 &&
 474         !(flags & UTF8_ALLOW_EMPTY)) {
 475         warning = UTF8_WARN_EMPTY;
 476         goto malformed;
 477     }
 478
 479     if (UTF8_IS_INVARIANT(uv)) {
 480         if (retlen)
 481             *retlen = 1;
 482         return (UV) (NATIVE_TO_UTF(*s));
 483     }
 484
 485     if (UTF8_IS_CONTINUATION(uv) &&
 486         !(flags & UTF8_ALLOW_CONTINUATION)) {
 487         warning = UTF8_WARN_CONTINUATION;
 488         goto malformed;
 489     }
 490
 491     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 492         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 493         warning = UTF8_WARN_NON_CONTINUATION;
 494         goto malformed;
 495     }
 496
 497 #ifdef EBCDIC
 498     uv = NATIVE_TO_UTF(uv);
 499 #else
 500     if ((uv == 0xfe || uv == 0xff) &&
 501         !(flags & UTF8_ALLOW_FE_FF)) {
 502         warning = UTF8_WARN_FE_FF;
 503         goto malformed;
 504     }
 505 #endif
 506
 507     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 508     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 509     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 510     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 511 #ifdef EBCDIC
 512     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 513     else                        { len =  7; uv &= 0x01; }
 514 #else
 515     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 516     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 517     else                        { len = 13; uv = 0; } /* whoa! */
 518 #endif
 519
 520     if (retlen)
 521         *retlen = len;
 522
 523     expectlen = len;
 524
 525     if ((curlen < expectlen) &&
 526         !(flags & UTF8_ALLOW_SHORT)) {
 527         warning = UTF8_WARN_SHORT;
 528         goto malformed;
 529     }
 530
 531     len--;
 532     s++;
 533     ouv = uv;
 534
 535     while (len--) {
 536         if (!UTF8_IS_CONTINUATION(*s) &&
 537             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 538             s--;
 539             warning = UTF8_WARN_NON_CONTINUATION;
 540             goto malformed;
 541         }
 542         else
 543             uv = UTF8_ACCUMULATE(uv, *s);
 544         if (!(uv > ouv)) {
 545             /* These cannot be allowed. */
 546             if (uv == ouv) {
 547                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 548                     warning = UTF8_WARN_LONG;
 549                     goto malformed;
 550                 }
 551             }
 552             else { /* uv < ouv */
 553                 /* This cannot be allowed. */
 554                 warning = UTF8_WARN_OVERFLOW;
 555                 goto malformed;
 556             }
 557         }
 558         s++;
 559         ouv = uv;
 560     }
 561
 562     if (UNICODE_IS_SURROGATE(uv) &&
 563         !(flags & UTF8_ALLOW_SURROGATE)) {
 564         warning = UTF8_WARN_SURROGATE;
 565         goto malformed;
 566     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 567                !(flags & UTF8_ALLOW_LONG)) {
 568         warning = UTF8_WARN_LONG;
 569         goto malformed;
 570     } else if (UNICODE_IS_ILLEGAL(uv) &&
 571                !(flags & UTF8_ALLOW_FFFF)) {
 572         warning = UTF8_WARN_FFFF;
 573         goto malformed;
 574     }
 575
 576     return uv;
 577
 578 malformed:
 579
 580     if (flags & UTF8_CHECK_ONLY) {
 581         if (retlen)
 582             *retlen = ((STRLEN) -1);
 583         return 0;
 584     }
 585
 586     if (dowarn) {
 587         if (warning == UTF8_WARN_FFFF) {
 588             sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP);
 589             Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv);
 590         }
 591         else {
 592             sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
 593
 594             switch (warning) {
 595                 case 0: /* Intentionally empty. */ break;
 596                 case UTF8_WARN_EMPTY:
 597                     sv_catpvs(sv, "(empty string)");
 598                     break;
 599                 case UTF8_WARN_CONTINUATION:
 600                     Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 601                     break;
 602                 case UTF8_WARN_NON_CONTINUATION:
 603                     if (s == s0)
 604                         Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 605                                    (UV)s[1], startbyte);
 606                     else {
 607                         const int len = (int)(s-s0);
 608                         Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 609                                    (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 610                     }
 611
 612                     break;
 613                 case UTF8_WARN_FE_FF:
 614                     Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 615                     break;
 616                 case UTF8_WARN_SHORT:
 617                     Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 618                                    (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 619                     expectlen = curlen;         /* distance for caller to skip */
 620                     break;
 621                 case UTF8_WARN_OVERFLOW:
 622                     Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 623                                    ouv, *s, startbyte);
 624                     break;
 625                 case UTF8_WARN_SURROGATE:
 626                     Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 627                     break;
 628                 case UTF8_WARN_LONG:
 629                     Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 630                                    (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 631                     break;
 632                 default:
 633                     sv_catpvs(sv, "(unknown reason)");
 634                     break;
 635             }
 636         }
 637
 638         if (warning) {
 639             const char * const s = SvPVX_const(sv);
 640
 641             if (PL_op)
 642                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 643                             "%s in %s", s,  OP_DESC(PL_op));
 644             else
 645                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 646         }
 647     }
 648
 649     if (retlen)
 650         *retlen = expectlen ? expectlen : len;
 651
 652     return 0;
 653 }
 654
 655 /*
 656 =for apidoc utf8_to_uvchr
 657
 658 Returns the native character value of the first character in the string C<s>
 659 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 660 length, in bytes, of that character.
 661
 662 If C<s> does not point to a well-formed UTF-8 character, zero is
 663 returned and retlen is set, if possible, to -1.
 664
 665 =cut
 666 */
 667
 668 UV
 669 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 670 {
 671     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 672
 673     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 674                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 675 }
 676
 677 /*
 678 =for apidoc utf8_to_uvuni
 679
 680 Returns the Unicode code point of the first character in the string C<s>
 681 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 682 length, in bytes, of that character.
 683
 684 This function should only be used when the returned UV is considered
 685 an index into the Unicode semantic tables (e.g. swashes).
 686
 687 If C<s> does not point to a well-formed UTF-8 character, zero is
 688 returned and retlen is set, if possible, to -1.
 689
 690 =cut
 691 */
 692
 693 UV
 694 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 695 {
 696     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 697
 698     /* Call the low level routine asking for checks */
 699     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 700                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 701 }
 702
 703 /*
 704 =for apidoc utf8_length
 705
 706 Return the length of the UTF-8 char encoded string C<s> in characters.
 707 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 708 up past C<e>, croaks.
 709
 710 =cut
 711 */
 712
 713 STRLEN
 714 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 715 {
 716     dVAR;
 717     STRLEN len = 0;
 718
 719     PERL_ARGS_ASSERT_UTF8_LENGTH;
 720
 721     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 722      * the bitops (especially ~) can create illegal UTF-8.
 723      * In other words: in Perl UTF-8 is not just for Unicode. */
 724
 725     if (e < s)
 726         goto warn_and_return;
 727     while (s < e) {
 728         if (!UTF8_IS_INVARIANT(*s))
 729             s += UTF8SKIP(s);
 730         else
 731             s++;
 732         len++;
 733     }
 734
 735     if (e != s) {
 736         len--;
 737         warn_and_return:
 738         if (PL_op)
 739             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
 740                              "%s in %s", unees, OP_DESC(PL_op));
 741         else
 742             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
 743     }
 744
 745     return len;
 746 }
 747
 748 /*
 749 =for apidoc utf8_distance
 750
 751 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 752 and C<b>.
 753
 754 WARNING: use only if you *know* that the pointers point inside the
 755 same UTF-8 buffer.
 756
 757 =cut
 758 */
 759
 760 IV
 761 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 762 {
 763     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 764
 765     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 766 }
 767
 768 /*
 769 =for apidoc utf8_hop
 770
 771 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 772 forward or backward.
 773
 774 WARNING: do not use the following unless you *know* C<off> is within
 775 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 776 on the first byte of character or just after the last byte of a character.
 777
 778 =cut
 779 */
 780
 781 U8 *
 782 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 783 {
 784     PERL_ARGS_ASSERT_UTF8_HOP;
 785
 786     PERL_UNUSED_CONTEXT;
 787     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 788      * the bitops (especially ~) can create illegal UTF-8.
 789      * In other words: in Perl UTF-8 is not just for Unicode. */
 790
 791     if (off >= 0) {
 792         while (off--)
 793             s += UTF8SKIP(s);
 794     }
 795     else {
 796         while (off++) {
 797             s--;
 798             while (UTF8_IS_CONTINUATION(*s))
 799                 s--;
 800         }
 801     }
 802     return (U8 *)s;
 803 }
 804
 805 /*
 806 =for apidoc utf8_to_bytes
 807
 808 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 809 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 810 updates len to contain the new length.
 811 Returns zero on failure, setting C<len> to -1.
 812
 813 If you need a copy of the string, see C<bytes_from_utf8>.
 814
 815 =cut
 816 */
 817
 818 U8 *
 819 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 820 {
 821     U8 * const save = s;
 822     U8 * const send = s + *len;
 823     U8 *d;
 824
 825     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
 826
 827     /* ensure valid UTF-8 and chars < 256 before updating string */
 828     while (s < send) {
 829         U8 c = *s++;
 830
 831         if (!UTF8_IS_INVARIANT(c) &&
 832             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 833              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 834             *len = ((STRLEN) -1);
 835             return 0;
 836         }
 837     }
 838
 839     d = s = save;
 840     while (s < send) {
 841         STRLEN ulen;
 842         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 843         s += ulen;
 844     }
 845     *d = '\0';
 846     *len = d - save;
 847     return save;
 848 }
 849
 850 /*
 851 =for apidoc bytes_from_utf8
 852
 853 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 854 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 855 the newly-created string, and updates C<len> to contain the new
 856 length.  Returns the original string if no conversion occurs, C<len>
 857 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 858 0 if C<s> is converted or consisted entirely of characters that are invariant
 859 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
 860
 861 =cut
 862 */
 863
 864 U8 *
 865 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 866 {
 867     U8 *d;
 868     const U8 *start = s;
 869     const U8 *send;
 870     I32 count = 0;
 871
 872     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
 873
 874     PERL_UNUSED_CONTEXT;
 875     if (!*is_utf8)
 876         return (U8 *)start;
 877
 878     /* ensure valid UTF-8 and chars < 256 before converting string */
 879     for (send = s + *len; s < send;) {
 880         U8 c = *s++;
 881         if (!UTF8_IS_INVARIANT(c)) {
 882             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 883                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 884                 count++;
 885             else
 886                 return (U8 *)start;
 887         }
 888     }
 889
 890     *is_utf8 = FALSE;
 891
 892     Newx(d, (*len) - count + 1, U8);
 893     s = start; start = d;
 894     while (s < send) {
 895         U8 c = *s++;
 896         if (!UTF8_IS_INVARIANT(c)) {
 897             /* Then it is two-byte encoded */
 898             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 899             c = ASCII_TO_NATIVE(c);
 900         }
 901         *d++ = c;
 902     }
 903     *d = '\0';
 904     *len = d - start;
 905     return (U8 *)start;
 906 }
 907
 908 /*
 909 =for apidoc bytes_to_utf8
 910
 911 Converts a string C<s> of length C<len> from the native encoding into UTF-8.
 912 Returns a pointer to the newly-created string, and sets C<len> to
 913 reflect the new length.
 914
 915 A NUL character will be written after the end of the string.
 916
 917 If you want to convert to UTF-8 from encodings other than
 918 the native (Latin1 or EBCDIC),
 919 see sv_recode_to_utf8().
 920
 921 =cut
 922 */
 923
 924 U8*
 925 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 926 {
 927     const U8 * const send = s + (*len);
 928     U8 *d;
 929     U8 *dst;
 930
 931     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
 932     PERL_UNUSED_CONTEXT;
 933
 934     Newx(d, (*len) * 2 + 1, U8);
 935     dst = d;
 936
 937     while (s < send) {
 938         const UV uv = NATIVE_TO_ASCII(*s++);
 939         if (UNI_IS_INVARIANT(uv))
 940             *d++ = (U8)UTF_TO_NATIVE(uv);
 941         else {
 942             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 943             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 944         }
 945     }
 946     *d = '\0';
 947     *len = d-dst;
 948     return dst;
 949 }
 950
 951 /*
 952  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 953  *
 954  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 955  * We optimize for native, for obvious reasons. */
 956
 957 U8*
 958 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 959 {
 960     U8* pend;
 961     U8* dstart = d;
 962
 963     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
 964
 965     if (bytelen & 1)
 966         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
 967
 968     pend = p + bytelen;
 969
 970     while (p < pend) {
 971         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 972         p += 2;
 973         if (uv < 0x80) {
 974 #ifdef EBCDIC
 975             *d++ = UNI_TO_NATIVE(uv);
 976 #else
 977             *d++ = (U8)uv;
 978 #endif
 979             continue;
 980         }
 981         if (uv < 0x800) {
 982             *d++ = (U8)(( uv >>  6)         | 0xc0);
 983             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 984             continue;
 985         }
 986         if (uv >= 0xd800 && uv <= 0xdbff) {     /* surrogates */
 987             if (p >= pend) {
 988                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 989             } else {
 990                 UV low = (p[0] << 8) + p[1];
 991                 p += 2;
 992                 if (low < 0xdc00 || low > 0xdfff)
 993                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 994                 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 995             }
 996         } else if (uv >= 0xdc00 && uv <= 0xdfff) {
 997             Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 998         }
 999         if (uv < 0x10000) {
1000             *d++ = (U8)(( uv >> 12)         | 0xe0);
1001             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1002             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1003             continue;
1004         }
1005         else {
1006             *d++ = (U8)(( uv >> 18)         | 0xf0);
1007             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1008             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1009             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1010             continue;
1011         }
1012     }
1013     *newlen = d - dstart;
1014     return d;
1015 }
1016
1017 /* Note: this one is slightly destructive of the source. */
1018
1019 U8*
1020 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1021 {
1022     U8* s = (U8*)p;
1023     U8* const send = s + bytelen;
1024
1025     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1026
1027     if (bytelen & 1)
1028         Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1029                    (UV)bytelen);
1030
1031     while (s < send) {
1032         const U8 tmp = s[0];
1033         s[0] = s[1];
1034         s[1] = tmp;
1035         s += 2;
1036     }
1037     return utf16_to_utf8(p, d, bytelen, newlen);
1038 }
1039
1040 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1041
1042 bool
1043 Perl_is_uni_alnum(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_alnum(tmpbuf);
1048 }
1049
1050 bool
1051 Perl_is_uni_idfirst(pTHX_ UV c)
1052 {
1053     U8 tmpbuf[UTF8_MAXBYTES+1];
1054     uvchr_to_utf8(tmpbuf, c);
1055     return is_utf8_idfirst(tmpbuf);
1056 }
1057
1058 bool
1059 Perl_is_uni_alpha(pTHX_ UV c)
1060 {
1061     U8 tmpbuf[UTF8_MAXBYTES+1];
1062     uvchr_to_utf8(tmpbuf, c);
1063     return is_utf8_alpha(tmpbuf);
1064 }
1065
1066 bool
1067 Perl_is_uni_ascii(pTHX_ UV c)
1068 {
1069     U8 tmpbuf[UTF8_MAXBYTES+1];
1070     uvchr_to_utf8(tmpbuf, c);
1071     return is_utf8_ascii(tmpbuf);
1072 }
1073
1074 bool
1075 Perl_is_uni_space(pTHX_ UV c)
1076 {
1077     U8 tmpbuf[UTF8_MAXBYTES+1];
1078     uvchr_to_utf8(tmpbuf, c);
1079     return is_utf8_space(tmpbuf);
1080 }
1081
1082 bool
1083 Perl_is_uni_digit(pTHX_ UV c)
1084 {
1085     U8 tmpbuf[UTF8_MAXBYTES+1];
1086     uvchr_to_utf8(tmpbuf, c);
1087     return is_utf8_digit(tmpbuf);
1088 }
1089
1090 bool
1091 Perl_is_uni_upper(pTHX_ UV c)
1092 {
1093     U8 tmpbuf[UTF8_MAXBYTES+1];
1094     uvchr_to_utf8(tmpbuf, c);
1095     return is_utf8_upper(tmpbuf);
1096 }
1097
1098 bool
1099 Perl_is_uni_lower(pTHX_ UV c)
1100 {
1101     U8 tmpbuf[UTF8_MAXBYTES+1];
1102     uvchr_to_utf8(tmpbuf, c);
1103     return is_utf8_lower(tmpbuf);
1104 }
1105
1106 bool
1107 Perl_is_uni_cntrl(pTHX_ UV c)
1108 {
1109     U8 tmpbuf[UTF8_MAXBYTES+1];
1110     uvchr_to_utf8(tmpbuf, c);
1111     return is_utf8_cntrl(tmpbuf);
1112 }
1113
1114 bool
1115 Perl_is_uni_graph(pTHX_ UV c)
1116 {
1117     U8 tmpbuf[UTF8_MAXBYTES+1];
1118     uvchr_to_utf8(tmpbuf, c);
1119     return is_utf8_graph(tmpbuf);
1120 }
1121
1122 bool
1123 Perl_is_uni_print(pTHX_ UV c)
1124 {
1125     U8 tmpbuf[UTF8_MAXBYTES+1];
1126     uvchr_to_utf8(tmpbuf, c);
1127     return is_utf8_print(tmpbuf);
1128 }
1129
1130 bool
1131 Perl_is_uni_punct(pTHX_ UV c)
1132 {
1133     U8 tmpbuf[UTF8_MAXBYTES+1];
1134     uvchr_to_utf8(tmpbuf, c);
1135     return is_utf8_punct(tmpbuf);
1136 }
1137
1138 bool
1139 Perl_is_uni_xdigit(pTHX_ UV c)
1140 {
1141     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1142     uvchr_to_utf8(tmpbuf, c);
1143     return is_utf8_xdigit(tmpbuf);
1144 }
1145
1146 UV
1147 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1148 {
1149     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1150
1151     uvchr_to_utf8(p, c);
1152     return to_utf8_upper(p, p, lenp);
1153 }
1154
1155 UV
1156 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1157 {
1158     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1159
1160     uvchr_to_utf8(p, c);
1161     return to_utf8_title(p, p, lenp);
1162 }
1163
1164 UV
1165 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1166 {
1167     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1168
1169     uvchr_to_utf8(p, c);
1170     return to_utf8_lower(p, p, lenp);
1171 }
1172
1173 UV
1174 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1175 {
1176     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1177
1178     uvchr_to_utf8(p, c);
1179     return to_utf8_fold(p, p, lenp);
1180 }
1181
1182 /* for now these all assume no locale info available for Unicode > 255 */
1183
1184 bool
1185 Perl_is_uni_alnum_lc(pTHX_ UV c)
1186 {
1187     return is_uni_alnum(c);     /* XXX no locale support yet */
1188 }
1189
1190 bool
1191 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1192 {
1193     return is_uni_idfirst(c);   /* XXX no locale support yet */
1194 }
1195
1196 bool
1197 Perl_is_uni_alpha_lc(pTHX_ UV c)
1198 {
1199     return is_uni_alpha(c);     /* XXX no locale support yet */
1200 }
1201
1202 bool
1203 Perl_is_uni_ascii_lc(pTHX_ UV c)
1204 {
1205     return is_uni_ascii(c);     /* XXX no locale support yet */
1206 }
1207
1208 bool
1209 Perl_is_uni_space_lc(pTHX_ UV c)
1210 {
1211     return is_uni_space(c);     /* XXX no locale support yet */
1212 }
1213
1214 bool
1215 Perl_is_uni_digit_lc(pTHX_ UV c)
1216 {
1217     return is_uni_digit(c);     /* XXX no locale support yet */
1218 }
1219
1220 bool
1221 Perl_is_uni_upper_lc(pTHX_ UV c)
1222 {
1223     return is_uni_upper(c);     /* XXX no locale support yet */
1224 }
1225
1226 bool
1227 Perl_is_uni_lower_lc(pTHX_ UV c)
1228 {
1229     return is_uni_lower(c);     /* XXX no locale support yet */
1230 }
1231
1232 bool
1233 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1234 {
1235     return is_uni_cntrl(c);     /* XXX no locale support yet */
1236 }
1237
1238 bool
1239 Perl_is_uni_graph_lc(pTHX_ UV c)
1240 {
1241     return is_uni_graph(c);     /* XXX no locale support yet */
1242 }
1243
1244 bool
1245 Perl_is_uni_print_lc(pTHX_ UV c)
1246 {
1247     return is_uni_print(c);     /* XXX no locale support yet */
1248 }
1249
1250 bool
1251 Perl_is_uni_punct_lc(pTHX_ UV c)
1252 {
1253     return is_uni_punct(c);     /* XXX no locale support yet */
1254 }
1255
1256 bool
1257 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1258 {
1259     return is_uni_xdigit(c);    /* XXX no locale support yet */
1260 }
1261
1262 U32
1263 Perl_to_uni_upper_lc(pTHX_ U32 c)
1264 {
1265     /* XXX returns only the first character -- do not use XXX */
1266     /* XXX no locale support yet */
1267     STRLEN len;
1268     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1269     return (U32)to_uni_upper(c, tmpbuf, &len);
1270 }
1271
1272 U32
1273 Perl_to_uni_title_lc(pTHX_ U32 c)
1274 {
1275     /* XXX returns only the first character XXX -- do not use XXX */
1276     /* XXX no locale support yet */
1277     STRLEN len;
1278     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1279     return (U32)to_uni_title(c, tmpbuf, &len);
1280 }
1281
1282 U32
1283 Perl_to_uni_lower_lc(pTHX_ U32 c)
1284 {
1285     /* XXX returns only the first character -- do not use XXX */
1286     /* XXX no locale support yet */
1287     STRLEN len;
1288     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1289     return (U32)to_uni_lower(c, tmpbuf, &len);
1290 }
1291
1292 static bool
1293 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1294                  const char *const swashname)
1295 {
1296     dVAR;
1297
1298     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1299
1300     if (!is_utf8_char(p))
1301         return FALSE;
1302     if (!*swash)
1303         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1304     return swash_fetch(*swash, p, TRUE) != 0;
1305 }
1306
1307 bool
1308 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1309 {
1310     dVAR;
1311
1312     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1313
1314     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1315      * descendant of isalnum(3), in other words, it doesn't
1316      * contain the '_'. --jhi */
1317     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1318 }
1319
1320 bool
1321 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1322 {
1323     dVAR;
1324
1325     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1326
1327     if (*p == '_')
1328         return TRUE;
1329     /* is_utf8_idstart would be more logical. */
1330     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1331 }
1332
1333 bool
1334 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1335 {
1336     dVAR;
1337
1338     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1339
1340     if (*p == '_')
1341         return TRUE;
1342     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1343 }
1344
1345 bool
1346 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1347 {
1348     dVAR;
1349
1350     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1351
1352     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1353 }
1354
1355 bool
1356 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1357 {
1358     dVAR;
1359
1360     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1361
1362     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1363 }
1364
1365 bool
1366 Perl_is_utf8_space(pTHX_ const U8 *p)
1367 {
1368     dVAR;
1369
1370     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1371
1372     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1373 }
1374
1375 bool
1376 Perl_is_utf8_perl_space(pTHX_ const U8 *p)
1377 {
1378     dVAR;
1379
1380     PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
1381
1382     return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
1383 }
1384
1385 bool
1386 Perl_is_utf8_perl_word(pTHX_ const U8 *p)
1387 {
1388     dVAR;
1389
1390     PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
1391
1392     return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
1393 }
1394
1395 bool
1396 Perl_is_utf8_digit(pTHX_ const U8 *p)
1397 {
1398     dVAR;
1399
1400     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1401
1402     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1403 }
1404
1405 bool
1406 Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
1407 {
1408     dVAR;
1409
1410     PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
1411
1412     return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
1413 }
1414
1415 bool
1416 Perl_is_utf8_upper(pTHX_ const U8 *p)
1417 {
1418     dVAR;
1419
1420     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1421
1422     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1423 }
1424
1425 bool
1426 Perl_is_utf8_lower(pTHX_ const U8 *p)
1427 {
1428     dVAR;
1429
1430     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1431
1432     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1433 }
1434
1435 bool
1436 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1437 {
1438     dVAR;
1439
1440     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1441
1442     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1443 }
1444
1445 bool
1446 Perl_is_utf8_graph(pTHX_ const U8 *p)
1447 {
1448     dVAR;
1449
1450     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1451
1452     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1453 }
1454
1455 bool
1456 Perl_is_utf8_print(pTHX_ const U8 *p)
1457 {
1458     dVAR;
1459
1460     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1461
1462     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1463 }
1464
1465 bool
1466 Perl_is_utf8_punct(pTHX_ const U8 *p)
1467 {
1468     dVAR;
1469
1470     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1471
1472     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1473 }
1474
1475 bool
1476 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1477 {
1478     dVAR;
1479
1480     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1481
1482     return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
1483 }
1484
1485 bool
1486 Perl_is_utf8_mark(pTHX_ const U8 *p)
1487 {
1488     dVAR;
1489
1490     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1491
1492     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1493 }
1494
1495 bool
1496 Perl_is_utf8_X_begin(pTHX_ const U8 *p)
1497 {
1498     dVAR;
1499
1500     PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
1501
1502     return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
1503 }
1504
1505 bool
1506 Perl_is_utf8_X_extend(pTHX_ const U8 *p)
1507 {
1508     dVAR;
1509
1510     PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
1511
1512     return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
1513 }
1514
1515 bool
1516 Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
1517 {
1518     dVAR;
1519
1520     PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
1521
1522     return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
1523 }
1524
1525 bool
1526 Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
1527 {
1528     dVAR;
1529
1530     PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
1531
1532     return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
1533 }
1534
1535 bool
1536 Perl_is_utf8_X_L(pTHX_ const U8 *p)
1537 {
1538     dVAR;
1539
1540     PERL_ARGS_ASSERT_IS_UTF8_X_L;
1541
1542     return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
1543 }
1544
1545 bool
1546 Perl_is_utf8_X_LV(pTHX_ const U8 *p)
1547 {
1548     dVAR;
1549
1550     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
1551
1552     return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
1553 }
1554
1555 bool
1556 Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
1557 {
1558     dVAR;
1559
1560     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
1561
1562     return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
1563 }
1564
1565 bool
1566 Perl_is_utf8_X_T(pTHX_ const U8 *p)
1567 {
1568     dVAR;
1569
1570     PERL_ARGS_ASSERT_IS_UTF8_X_T;
1571
1572     return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
1573 }
1574
1575 bool
1576 Perl_is_utf8_X_V(pTHX_ const U8 *p)
1577 {
1578     dVAR;
1579
1580     PERL_ARGS_ASSERT_IS_UTF8_X_V;
1581
1582     return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
1583 }
1584
1585 bool
1586 Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
1587 {
1588     dVAR;
1589
1590     PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
1591
1592     return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
1593 }
1594
1595 /*
1596 =for apidoc to_utf8_case
1597
1598 The "p" contains the pointer to the UTF-8 string encoding
1599 the character that is being converted.
1600
1601 The "ustrp" is a pointer to the character buffer to put the
1602 conversion result to.  The "lenp" is a pointer to the length
1603 of the result.
1604
1605 The "swashp" is a pointer to the swash to use.
1606
1607 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1608 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1609 but not always, a multicharacter mapping), is tried first.
1610
1611 The "special" is a string like "utf8::ToSpecLower", which means the
1612 hash %utf8::ToSpecLower.  The access to the hash is through
1613 Perl_to_utf8_case().
1614
1615 The "normal" is a string like "ToLower" which means the swash
1616 %utf8::ToLower.
1617
1618 =cut */
1619
1620 UV
1621 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1622                         SV **swashp, const char *normal, const char *special)
1623 {
1624     dVAR;
1625     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1626     STRLEN len = 0;
1627     const UV uv0 = utf8_to_uvchr(p, NULL);
1628     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1629      * are necessary in EBCDIC, they are redundant no-ops
1630      * in ASCII-ish platforms, and hopefully optimized away. */
1631     const UV uv1 = NATIVE_TO_UNI(uv0);
1632
1633     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1634
1635     uvuni_to_utf8(tmpbuf, uv1);
1636
1637     if (!*swashp) /* load on-demand */
1638          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1639     /* This is the beginnings of a skeleton of code to read the info section
1640      * that is in all the swashes in case we ever want to do that, so one can
1641      * read things whose maps aren't code points, and whose default if missing
1642      * is not to the code point itself.  This was just to see if it actually
1643      * worked.  Details on what the possibilities are are in perluniprops.pod
1644         HV * const hv = get_hv("utf8::SwashInfo", 0);
1645         if (hv) {
1646          SV **svp;
1647          svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE);
1648              const char *s;
1649
1650               HV * const this_hash = SvRV(*svp);
1651                 svp = hv_fetch(this_hash, "type", strlen("type"), FALSE);
1652               s = SvPV_const(*svp, len);
1653         }
1654     }*/
1655
1656     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1657     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1658          /* It might be "special" (sometimes, but not always,
1659           * a multicharacter mapping) */
1660          HV * const hv = get_hv(special, 0);
1661          SV **svp;
1662
1663          if (hv &&
1664              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1665              (*svp)) {
1666              const char *s;
1667
1668               s = SvPV_const(*svp, len);
1669               if (len == 1)
1670                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1671               else {
1672 #ifdef EBCDIC
1673                    /* If we have EBCDIC we need to remap the characters
1674                     * since any characters in the low 256 are Unicode
1675                     * code points, not EBCDIC. */
1676                    U8 *t = (U8*)s, *tend = t + len, *d;
1677
1678                    d = tmpbuf;
1679                    if (SvUTF8(*svp)) {
1680                         STRLEN tlen = 0;
1681
1682                         while (t < tend) {
1683                              const UV c = utf8_to_uvchr(t, &tlen);
1684                              if (tlen > 0) {
1685                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1686                                   t += tlen;
1687                              }
1688                              else
1689                                   break;
1690                         }
1691                    }
1692                    else {
1693                         while (t < tend) {
1694                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1695                              t++;
1696                         }
1697                    }
1698                    len = d - tmpbuf;
1699                    Copy(tmpbuf, ustrp, len, U8);
1700 #else
1701                    Copy(s, ustrp, len, U8);
1702 #endif
1703               }
1704          }
1705     }
1706
1707     if (!len && *swashp) {
1708         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1709
1710          if (uv2) {
1711               /* It was "normal" (a single character mapping). */
1712               const UV uv3 = UNI_TO_NATIVE(uv2);
1713               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1714          }
1715     }
1716
1717     if (!len) /* Neither: just copy.  In other words, there was no mapping
1718                  defined, which means that the code point maps to itself */
1719          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1720
1721     if (lenp)
1722          *lenp = len;
1723
1724     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1725 }
1726
1727 /*
1728 =for apidoc to_utf8_upper
1729
1730 Convert the UTF-8 encoded character at p to its uppercase version and
1731 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1732 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1733 the uppercase version may be longer than the original character.
1734
1735 The first character of the uppercased version is returned
1736 (but note, as explained above, that there may be more.)
1737
1738 =cut */
1739
1740 UV
1741 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1742 {
1743     dVAR;
1744
1745     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1746
1747     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1748                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1749 }
1750
1751 /*
1752 =for apidoc to_utf8_title
1753
1754 Convert the UTF-8 encoded character at p to its titlecase version and
1755 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1756 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1757 titlecase version may be longer than the original character.
1758
1759 The first character of the titlecased version is returned
1760 (but note, as explained above, that there may be more.)
1761
1762 =cut */
1763
1764 UV
1765 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1766 {
1767     dVAR;
1768
1769     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1770
1771     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1772                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1773 }
1774
1775 /*
1776 =for apidoc to_utf8_lower
1777
1778 Convert the UTF-8 encoded character at p to its lowercase version and
1779 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1780 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1781 lowercase version may be longer than the original character.
1782
1783 The first character of the lowercased version is returned
1784 (but note, as explained above, that there may be more.)
1785
1786 =cut */
1787
1788 UV
1789 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1790 {
1791     dVAR;
1792
1793     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1794
1795     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1796                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1797 }
1798
1799 /*
1800 =for apidoc to_utf8_fold
1801
1802 Convert the UTF-8 encoded character at p to its foldcase version and
1803 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1804 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1805 foldcase version may be longer than the original character (up to
1806 three characters).
1807
1808 The first character of the foldcased version is returned
1809 (but note, as explained above, that there may be more.)
1810
1811 =cut */
1812
1813 UV
1814 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1815 {
1816     dVAR;
1817
1818     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1819
1820     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1821                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1822 }
1823
1824 /* Note:
1825  * A "swash" is a swatch hash.
1826  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1827  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1828  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1829  */
1830 SV*
1831 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1832 {
1833     dVAR;
1834     SV* retval;
1835     dSP;
1836     const size_t pkg_len = strlen(pkg);
1837     const size_t name_len = strlen(name);
1838     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1839     SV* errsv_save;
1840
1841     PERL_ARGS_ASSERT_SWASH_INIT;
1842
1843     PUSHSTACKi(PERLSI_MAGIC);
1844     ENTER;
1845     SAVEI32(PL_hints);
1846     PL_hints = 0;
1847     save_re_context();
1848     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1849         ENTER;
1850         errsv_save = newSVsv(ERRSV);
1851         /* It is assumed that callers of this routine are not passing in any
1852            user derived data.  */
1853         /* Need to do this after save_re_context() as it will set PL_tainted to
1854            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1855            Even line to create errsv_save can turn on PL_tainted.  */
1856         SAVEBOOL(PL_tainted);
1857         PL_tainted = 0;
1858         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1859                          NULL);
1860         if (!SvTRUE(ERRSV))
1861             sv_setsv(ERRSV, errsv_save);
1862         SvREFCNT_dec(errsv_save);
1863         LEAVE;
1864     }
1865     SPAGAIN;
1866     PUSHMARK(SP);
1867     EXTEND(SP,5);
1868     mPUSHp(pkg, pkg_len);
1869     mPUSHp(name, name_len);
1870     PUSHs(listsv);
1871     mPUSHi(minbits);
1872     mPUSHi(none);
1873     PUTBACK;
1874     errsv_save = newSVsv(ERRSV);
1875     if (call_method("SWASHNEW", G_SCALAR))
1876         retval = newSVsv(*PL_stack_sp--);
1877     else
1878         retval = &PL_sv_undef;
1879     if (!SvTRUE(ERRSV))
1880         sv_setsv(ERRSV, errsv_save);
1881     SvREFCNT_dec(errsv_save);
1882     LEAVE;
1883     POPSTACK;
1884     if (IN_PERL_COMPILETIME) {
1885         CopHINTS_set(PL_curcop, PL_hints);
1886     }
1887     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1888         if (SvPOK(retval))
1889             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1890                        SVfARG(retval));
1891         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1892     }
1893     return retval;
1894 }
1895
1896
1897 /* This API is wrong for special case conversions since we may need to
1898  * return several Unicode characters for a single Unicode character
1899  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1900  * the lower-level routine, and it is similarly broken for returning
1901  * multiple values.  --jhi */
1902 /* Now SWASHGET is recasted into S_swash_get in this file. */
1903
1904 /* Note:
1905  * Returns the value of property/mapping C<swash> for the first character
1906  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1907  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1908  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1909  */
1910 UV
1911 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1912 {
1913     dVAR;
1914     HV *const hv = MUTABLE_HV(SvRV(swash));
1915     U32 klen;
1916     U32 off;
1917     STRLEN slen;
1918     STRLEN needents;
1919     const U8 *tmps = NULL;
1920     U32 bit;
1921     SV *swatch;
1922     U8 tmputf8[2];
1923     const UV c = NATIVE_TO_ASCII(*ptr);
1924
1925     PERL_ARGS_ASSERT_SWASH_FETCH;
1926
1927     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1928         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1929         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1930         ptr = tmputf8;
1931     }
1932     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1933      * then the "swatch" is a vec() for all the chars which start
1934      * with 0xAA..0xYY
1935      * So the key in the hash (klen) is length of encoded char -1
1936      */
1937     klen = UTF8SKIP(ptr) - 1;
1938     off  = ptr[klen];
1939
1940     if (klen == 0) {
1941       /* If char is invariant then swatch is for all the invariant chars
1942        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1943        */
1944         needents = UTF_CONTINUATION_MARK;
1945         off      = NATIVE_TO_UTF(ptr[klen]);
1946     }
1947     else {
1948       /* If char is encoded then swatch is for the prefix */
1949         needents = (1 << UTF_ACCUMULATION_SHIFT);
1950         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1951     }
1952
1953     /*
1954      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1955      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1956      * it's nothing to sniff at.)  Pity we usually come through at least
1957      * two function calls to get here...
1958      *
1959      * NB: this code assumes that swatches are never modified, once generated!
1960      */
1961
1962     if (hv   == PL_last_swash_hv &&
1963         klen == PL_last_swash_klen &&
1964         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1965     {
1966         tmps = PL_last_swash_tmps;
1967         slen = PL_last_swash_slen;
1968     }
1969     else {
1970         /* Try our second-level swatch cache, kept in a hash. */
1971         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1972
1973         /* If not cached, generate it via swash_get */
1974         if (!svp || !SvPOK(*svp)
1975                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1976             /* We use utf8n_to_uvuni() as we want an index into
1977                Unicode tables, not a native character number.
1978              */
1979             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1980                                            ckWARN(WARN_UTF8) ?
1981                                            0 : UTF8_ALLOW_ANY);
1982             swatch = swash_get(swash,
1983                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1984                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1985                                 needents);
1986
1987             if (IN_PERL_COMPILETIME)
1988                 CopHINTS_set(PL_curcop, PL_hints);
1989
1990             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1991
1992             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1993                      || (slen << 3) < needents)
1994                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1995         }
1996
1997         PL_last_swash_hv = hv;
1998         assert(klen <= sizeof(PL_last_swash_key));
1999         PL_last_swash_klen = (U8)klen;
2000         /* FIXME change interpvar.h?  */
2001         PL_last_swash_tmps = (U8 *) tmps;
2002         PL_last_swash_slen = slen;
2003         if (klen)
2004             Copy(ptr, PL_last_swash_key, klen, U8);
2005     }
2006
2007     switch ((int)((slen << 3) / needents)) {
2008     case 1:
2009         bit = 1 << (off & 7);
2010         off >>= 3;
2011         return (tmps[off] & bit) != 0;
2012     case 8:
2013         return tmps[off];
2014     case 16:
2015         off <<= 1;
2016         return (tmps[off] << 8) + tmps[off + 1] ;
2017     case 32:
2018         off <<= 2;
2019         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
2020     }
2021     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
2022     NORETURN_FUNCTION_END;
2023 }
2024
2025 /* Note:
2026  * Returns a swatch (a bit vector string) for a code point sequence
2027  * that starts from the value C<start> and comprises the number C<span>.
2028  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
2029  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
2030  */
2031 STATIC SV*
2032 S_swash_get(pTHX_ SV* swash, UV start, UV span)
2033 {
2034     SV *swatch;
2035     U8 *l, *lend, *x, *xend, *s;
2036     STRLEN lcur, xcur, scur;
2037     HV *const hv = MUTABLE_HV(SvRV(swash));
2038     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
2039     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
2040     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
2041     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
2042     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
2043     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
2044     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
2045     const STRLEN bits  = SvUV(*bitssvp);
2046     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
2047     const UV     none  = SvUV(*nonesvp);
2048     const UV     end   = start + span;
2049
2050     PERL_ARGS_ASSERT_SWASH_GET;
2051
2052     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
2053         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
2054                                                  (UV)bits);
2055     }
2056
2057     /* create and initialize $swatch */
2058     scur   = octets ? (span * octets) : (span + 7) / 8;
2059     swatch = newSV(scur);
2060     SvPOK_on(swatch);
2061     s = (U8*)SvPVX(swatch);
2062     if (octets && none) {
2063         const U8* const e = s + scur;
2064         while (s < e) {
2065             if (bits == 8)
2066                 *s++ = (U8)(none & 0xff);
2067             else if (bits == 16) {
2068                 *s++ = (U8)((none >>  8) & 0xff);
2069                 *s++ = (U8)( none        & 0xff);
2070             }
2071             else if (bits == 32) {
2072                 *s++ = (U8)((none >> 24) & 0xff);
2073                 *s++ = (U8)((none >> 16) & 0xff);
2074                 *s++ = (U8)((none >>  8) & 0xff);
2075                 *s++ = (U8)( none        & 0xff);
2076             }
2077         }
2078         *s = '\0';
2079     }
2080     else {
2081         (void)memzero((U8*)s, scur + 1);
2082     }
2083     SvCUR_set(swatch, scur);
2084     s = (U8*)SvPVX(swatch);
2085
2086     /* read $swash->{LIST} */
2087     l = (U8*)SvPV(*listsvp, lcur);
2088     lend = l + lcur;
2089     while (l < lend) {
2090         UV min, max, val;
2091         STRLEN numlen;
2092         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
2093
2094         U8* const nl = (U8*)memchr(l, '\n', lend - l);
2095
2096         numlen = lend - l;
2097         min = grok_hex((char *)l, &numlen, &flags, NULL);
2098         if (numlen)
2099             l += numlen;
2100         else if (nl) {
2101             l = nl + 1; /* 1 is length of "\n" */
2102             continue;
2103         }
2104         else {
2105             l = lend; /* to LIST's end at which \n is not found */
2106             break;
2107         }
2108
2109         if (isBLANK(*l)) {
2110             ++l;
2111             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
2112             numlen = lend - l;
2113             max = grok_hex((char *)l, &numlen, &flags, NULL);
2114             if (numlen)
2115                 l += numlen;
2116             else
2117                 max = min;
2118
2119             if (octets) {
2120                 if (isBLANK(*l)) {
2121                     ++l;
2122                     flags = PERL_SCAN_SILENT_ILLDIGIT |
2123                             PERL_SCAN_DISALLOW_PREFIX;
2124                     numlen = lend - l;
2125                     val = grok_hex((char *)l, &numlen, &flags, NULL);
2126                     if (numlen)
2127                         l += numlen;
2128                     else
2129                         val = 0;
2130                 }
2131                 else {
2132                     val = 0;
2133                     if (typeto) {
2134                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2135                                          typestr, l);
2136                     }
2137                 }
2138             }
2139             else
2140                 val = 0; /* bits == 1, then val should be ignored */
2141         }
2142         else {
2143             max = min;
2144             if (octets) {
2145                 val = 0;
2146                 if (typeto) {
2147                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2148                 }
2149             }
2150             else
2151                 val = 0; /* bits == 1, then val should be ignored */
2152         }
2153
2154         if (nl)
2155             l = nl + 1;
2156         else
2157             l = lend;
2158
2159         if (max < start)
2160             continue;
2161
2162         if (octets) {
2163             UV key;
2164             if (min < start) {
2165                 if (!none || val < none) {
2166                     val += start - min;
2167                 }
2168                 min = start;
2169             }
2170             for (key = min; key <= max; key++) {
2171                 STRLEN offset;
2172                 if (key >= end)
2173                     goto go_out_list;
2174                 /* offset must be non-negative (start <= min <= key < end) */
2175                 offset = octets * (key - start);
2176                 if (bits == 8)
2177                     s[offset] = (U8)(val & 0xff);
2178                 else if (bits == 16) {
2179                     s[offset    ] = (U8)((val >>  8) & 0xff);
2180                     s[offset + 1] = (U8)( val        & 0xff);
2181                 }
2182                 else if (bits == 32) {
2183                     s[offset    ] = (U8)((val >> 24) & 0xff);
2184                     s[offset + 1] = (U8)((val >> 16) & 0xff);
2185                     s[offset + 2] = (U8)((val >>  8) & 0xff);
2186                     s[offset + 3] = (U8)( val        & 0xff);
2187                 }
2188
2189                 if (!none || val < none)
2190                     ++val;
2191             }
2192         }
2193         else { /* bits == 1, then val should be ignored */
2194             UV key;
2195             if (min < start)
2196                 min = start;
2197             for (key = min; key <= max; key++) {
2198                 const STRLEN offset = (STRLEN)(key - start);
2199                 if (key >= end)
2200                     goto go_out_list;
2201                 s[offset >> 3] |= 1 << (offset & 7);
2202             }
2203         }
2204     } /* while */
2205   go_out_list:
2206
2207     /* read $swash->{EXTRAS} */
2208     x = (U8*)SvPV(*extssvp, xcur);
2209     xend = x + xcur;
2210     while (x < xend) {
2211         STRLEN namelen;
2212         U8 *namestr;
2213         SV** othersvp;
2214         HV* otherhv;
2215         STRLEN otherbits;
2216         SV **otherbitssvp, *other;
2217         U8 *s, *o, *nl;
2218         STRLEN slen, olen;
2219
2220         const U8 opc = *x++;
2221         if (opc == '\n')
2222             continue;
2223
2224         nl = (U8*)memchr(x, '\n', xend - x);
2225
2226         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2227             if (nl) {
2228                 x = nl + 1; /* 1 is length of "\n" */
2229                 continue;
2230             }
2231             else {
2232                 x = xend; /* to EXTRAS' end at which \n is not found */
2233                 break;
2234             }
2235         }
2236
2237         namestr = x;
2238         if (nl) {
2239             namelen = nl - namestr;
2240             x = nl + 1;
2241         }
2242         else {
2243             namelen = xend - namestr;
2244             x = xend;
2245         }
2246
2247         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2248         otherhv = MUTABLE_HV(SvRV(*othersvp));
2249         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2250         otherbits = (STRLEN)SvUV(*otherbitssvp);
2251         if (bits < otherbits)
2252             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2253
2254         /* The "other" swatch must be destroyed after. */
2255         other = swash_get(*othersvp, start, span);
2256         o = (U8*)SvPV(other, olen);
2257
2258         if (!olen)
2259             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2260
2261         s = (U8*)SvPV(swatch, slen);
2262         if (bits == 1 && otherbits == 1) {
2263             if (slen != olen)
2264                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2265
2266             switch (opc) {
2267             case '+':
2268                 while (slen--)
2269                     *s++ |= *o++;
2270                 break;
2271             case '!':
2272                 while (slen--)
2273                     *s++ |= ~*o++;
2274                 break;
2275             case '-':
2276                 while (slen--)
2277                     *s++ &= ~*o++;
2278                 break;
2279             case '&':
2280                 while (slen--)
2281                     *s++ &= *o++;
2282                 break;
2283             default:
2284                 break;
2285             }
2286         }
2287         else {
2288             STRLEN otheroctets = otherbits >> 3;
2289             STRLEN offset = 0;
2290             U8* const send = s + slen;
2291
2292             while (s < send) {
2293                 UV otherval = 0;
2294
2295                 if (otherbits == 1) {
2296                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2297                     ++offset;
2298                 }
2299                 else {
2300                     STRLEN vlen = otheroctets;
2301                     otherval = *o++;
2302                     while (--vlen) {
2303                         otherval <<= 8;
2304                         otherval |= *o++;
2305                     }
2306                 }
2307
2308                 if (opc == '+' && otherval)
2309                     NOOP;   /* replace with otherval */
2310                 else if (opc == '!' && !otherval)
2311                     otherval = 1;
2312                 else if (opc == '-' && otherval)
2313                     otherval = 0;
2314                 else if (opc == '&' && !otherval)
2315                     otherval = 0;
2316                 else {
2317                     s += octets; /* no replacement */
2318                     continue;
2319                 }
2320
2321                 if (bits == 8)
2322                     *s++ = (U8)( otherval & 0xff);
2323                 else if (bits == 16) {
2324                     *s++ = (U8)((otherval >>  8) & 0xff);
2325                     *s++ = (U8)( otherval        & 0xff);
2326                 }
2327                 else if (bits == 32) {
2328                     *s++ = (U8)((otherval >> 24) & 0xff);
2329                     *s++ = (U8)((otherval >> 16) & 0xff);
2330                     *s++ = (U8)((otherval >>  8) & 0xff);
2331                     *s++ = (U8)( otherval        & 0xff);
2332                 }
2333             }
2334         }
2335         sv_free(other); /* through with it! */
2336     } /* while */
2337     return swatch;
2338 }
2339
2340 /*
2341 =for apidoc uvchr_to_utf8
2342
2343 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2344 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2345 bytes available. The return value is the pointer to the byte after the
2346 end of the new character. In other words,
2347
2348     d = uvchr_to_utf8(d, uv);
2349
2350 is the recommended wide native character-aware way of saying
2351
2352     *(d++) = uv;
2353
2354 =cut
2355 */
2356
2357 /* On ASCII machines this is normally a macro but we want a
2358    real function in case XS code wants it
2359 */
2360 U8 *
2361 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2362 {
2363     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2364
2365     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2366 }
2367
2368 U8 *
2369 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2370 {
2371     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2372
2373     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2374 }
2375
2376 /*
2377 =for apidoc utf8n_to_uvchr
2378 flags
2379
2380 Returns the native character value of the first character in the string
2381 C<s>
2382 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2383 length, in bytes, of that character.
2384
2385 Allows length and flags to be passed to low level routine.
2386
2387 =cut
2388 */
2389 /* On ASCII machines this is normally a macro but we want
2390    a real function in case XS code wants it
2391 */
2392 UV
2393 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2394 U32 flags)
2395 {
2396     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2397
2398     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2399
2400     return UNI_TO_NATIVE(uv);
2401 }
2402
2403 /*
2404 =for apidoc pv_uni_display
2405
2406 Build to the scalar dsv a displayable version of the string spv,
2407 length len, the displayable version being at most pvlim bytes long
2408 (if longer, the rest is truncated and "..." will be appended).
2409
2410 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2411 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2412 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2413 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2414 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2415 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2416
2417 The pointer to the PV of the dsv is returned.
2418
2419 =cut */
2420 char *
2421 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2422 {
2423     int truncated = 0;
2424     const char *s, *e;
2425
2426     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2427
2428     sv_setpvs(dsv, "");
2429     SvUTF8_off(dsv);
2430     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2431          UV u;
2432           /* This serves double duty as a flag and a character to print after
2433              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2434           */
2435          char ok = 0;
2436
2437          if (pvlim && SvCUR(dsv) >= pvlim) {
2438               truncated++;
2439               break;
2440          }
2441          u = utf8_to_uvchr((U8*)s, 0);
2442          if (u < 256) {
2443              const unsigned char c = (unsigned char)u & 0xFF;
2444              if (flags & UNI_DISPLAY_BACKSLASH) {
2445                  switch (c) {
2446                  case '\n':
2447                      ok = 'n'; break;
2448                  case '\r':
2449                      ok = 'r'; break;
2450                  case '\t':
2451                      ok = 't'; break;
2452                  case '\f':
2453                      ok = 'f'; break;
2454                  case '\a':
2455                      ok = 'a'; break;
2456                  case '\\':
2457                      ok = '\\'; break;
2458                  default: break;
2459                  }
2460                  if (ok) {
2461                      const char string = ok;
2462                      sv_catpvs(dsv, "\\");
2463                      sv_catpvn(dsv, &string, 1);
2464                  }
2465              }
2466              /* isPRINT() is the locale-blind version. */
2467              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2468                  const char string = c;
2469                  sv_catpvn(dsv, &string, 1);
2470                  ok = 1;
2471              }
2472          }
2473          if (!ok)
2474              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2475     }
2476     if (truncated)
2477          sv_catpvs(dsv, "...");
2478
2479     return SvPVX(dsv);
2480 }
2481
2482 /*
2483 =for apidoc sv_uni_display
2484
2485 Build to the scalar dsv a displayable version of the scalar sv,
2486 the displayable version being at most pvlim bytes long
2487 (if longer, the rest is truncated and "..." will be appended).
2488
2489 The flags argument is as in pv_uni_display().
2490
2491 The pointer to the PV of the dsv is returned.
2492
2493 =cut
2494 */
2495 char *
2496 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2497 {
2498     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2499
2500      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2501                                 SvCUR(ssv), pvlim, flags);
2502 }
2503
2504 /*
2505 =for apidoc ibcmp_utf8
2506
2507 Return true if the strings s1 and s2 differ case-insensitively, false
2508 if not (if they are equal case-insensitively).  If u1 is true, the
2509 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2510 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2511 are false, the respective string is assumed to be in native 8-bit
2512 encoding.
2513
2514 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2515 in there (they will point at the beginning of the I<next> character).
2516 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2517 pointers beyond which scanning will not continue under any
2518 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2519 s2+l2 will be used as goal end pointers that will also stop the scan,
2520 and which qualify towards defining a successful match: all the scans
2521 that define an explicit length must reach their goal pointers for
2522 a match to succeed).
2523
2524 For case-insensitiveness, the "casefolding" of Unicode is used
2525 instead of upper/lowercasing both the characters, see
2526 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2527
2528 =cut */
2529 I32
2530 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2531 {
2532      dVAR;
2533      register const U8 *p1  = (const U8*)s1;
2534      register const U8 *p2  = (const U8*)s2;
2535      register const U8 *f1 = NULL;
2536      register const U8 *f2 = NULL;
2537      register U8 *e1 = NULL;
2538      register U8 *q1 = NULL;
2539      register U8 *e2 = NULL;
2540      register U8 *q2 = NULL;
2541      STRLEN n1 = 0, n2 = 0;
2542      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2543      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2544      U8 natbuf[1+1];
2545      STRLEN foldlen1, foldlen2;
2546      bool match;
2547
2548      PERL_ARGS_ASSERT_IBCMP_UTF8;
2549
2550      if (pe1)
2551           e1 = *(U8**)pe1;
2552      /* assert(e1 || l1); */
2553      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2554           f1 = (const U8*)s1 + l1;
2555      if (pe2)
2556           e2 = *(U8**)pe2;
2557      /* assert(e2 || l2); */
2558      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2559           f2 = (const U8*)s2 + l2;
2560
2561      /* This shouldn't happen. However, putting an assert() there makes some
2562       * tests fail. */
2563      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2564      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2565           return 1; /* mismatch; possible infinite loop or false positive */
2566
2567      if (!u1 || !u2)
2568           natbuf[1] = 0; /* Need to terminate the buffer. */
2569
2570      while ((e1 == 0 || p1 < e1) &&
2571             (f1 == 0 || p1 < f1) &&
2572             (e2 == 0 || p2 < e2) &&
2573             (f2 == 0 || p2 < f2)) {
2574           if (n1 == 0) {
2575                if (u1)
2576                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2577                else {
2578                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2579                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2580                }
2581                q1 = foldbuf1;
2582                n1 = foldlen1;
2583           }
2584           if (n2 == 0) {
2585                if (u2)
2586                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2587                else {
2588                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2589                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2590                }
2591                q2 = foldbuf2;
2592                n2 = foldlen2;
2593           }
2594           while (n1 && n2) {
2595                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2596                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2597                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2598                    return 1; /* mismatch */
2599                n1 -= UTF8SKIP(q1);
2600                q1 += UTF8SKIP(q1);
2601                n2 -= UTF8SKIP(q2);
2602                q2 += UTF8SKIP(q2);
2603           }
2604           if (n1 == 0)
2605                p1 += u1 ? UTF8SKIP(p1) : 1;
2606           if (n2 == 0)
2607                p2 += u2 ? UTF8SKIP(p2) : 1;
2608
2609      }
2610
2611      /* A match is defined by all the scans that specified
2612       * an explicit length reaching their final goals. */
2613      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2614
2615      if (match) {
2616           if (pe1)
2617                *pe1 = (char*)p1;
2618           if (pe2)
2619                *pe2 = (char*)p2;
2620      }
2621
2622      return match ? 0 : 1; /* 0 match, 1 mismatch */
2623 }
2624
2625 /*
2626  * Local variables:
2627  * c-indentation-style: bsd
2628  * c-basic-offset: 4
2629  * indent-tabs-mode: t
2630  * End:
2631  *
2632  * ex: set ts=8 sts=4 sw=4 noet:
2633  */