- rename utf8/uv functions to indicate what sort of uv they provide (uvuni/uvchr)
- use utf8n_xxxx (c.f. pvn) for forms which take length.
- back out vN.N and $^V exceptions to e2a/a2e
- make "locale" isxxx macros be uvchr (may be redundant?)
Not clear yet that toUPPER_uni et. al. return being handled correctly.
The tr// and rexexp stuff still needs an audit, assumption is they are working
in Unicode space.
Need to provide v5.6 names for XS modules (decide is uni or chr ?).
p4raw-id: //depot/perlio@9096
UV c;
/* Need to check this, otherwise 128..255 won't match */
- c = utf8_to_uv(s, send - s, &ulen, 0);
+ c = utf8n_to_uvchr(s, send - s, &ulen, 0);
if (c < 0x100 && (ch = tbl[c]) >= 0) {
matches++;
- d = uv_to_utf8(d, ch);
+ d = uvchr_to_utf8(d, ch);
s += ulen;
}
else { /* No match -> copy */
while (s < send) {
UV c;
STRLEN ulen;
- c = utf8_to_uv(s, send - s, &ulen, 0);
+ c = utf8n_to_uvchr(s, send - s, &ulen, 0);
if (c < 0x100) {
if (tbl[c] >= 0)
matches++;
UV pch = 0xfeedface;
while (s < send) {
STRLEN len;
- UV comp = utf8_to_uv_simple(s, &len);
+ UV comp = utf8_to_uvchr(s, &len);
if (comp > 0xff) {
if (!complement) {
ch = (comp - 0x100 < rlen) ?
tbl[comp+1] : tbl[0x100+rlen];
if (ch != pch) {
- d = uv_to_utf8(d, ch);
+ d = uvchr_to_utf8(d, ch);
pch = ch;
}
s += len;
else if ((ch = tbl[comp]) >= 0) {
matches++;
if (ch != pch) {
- d = uv_to_utf8(d, ch);
+ d = uvchr_to_utf8(d, ch);
pch = ch;
}
s += len;
else {
while (s < send) {
STRLEN len;
- UV comp = utf8_to_uv_simple(s, &len);
+ UV comp = utf8_to_uvchr(s, &len);
if (comp > 0xff) {
if (!complement) {
Copy(s, d, len, U8);
else {
matches++;
if (!del) {
- if (comp - 0x100 < rlen)
- d = uv_to_utf8(d, tbl[comp+1]);
+ if (comp - 0x100 < rlen)
+ d = uvchr_to_utf8(d, tbl[comp+1]);
else
- d = uv_to_utf8(d, tbl[0x100+rlen]);
+ d = uvchr_to_utf8(d, tbl[0x100+rlen]);
}
}
}
else if ((ch = tbl[comp]) >= 0) {
- d = uv_to_utf8(d, ch);
+ d = uvchr_to_utf8(d, ch);
matches++;
}
else if (ch == -1) { /* -1 is unmapped character */
if ((uv = swash_fetch(rv, s)) < none) {
s += UTF8SKIP(s);
matches++;
- d = uv_to_utf8(d, uv);
+ d = uvchr_to_utf8(d, uv);
}
else if (uv == none) {
int i = UTF8SKIP(s);
int i = UTF8SKIP(s);
s += i;
matches++;
- d = uv_to_utf8(d, final);
+ d = uvchr_to_utf8(d, final);
}
else
s += UTF8SKIP(s);
}
SvSETMAGIC(sv);
SvUTF8_on(sv);
+ /* Downgrading just 'cos it will is suspect - NI-S */
if (!isutf8 && !(PL_hints & HINT_UTF8))
sv_utf8_downgrade(sv, TRUE);
UV puv = 0xfeedface;
while (s < send) {
uv = swash_fetch(rv, s);
-
+
if (d > dend) {
STRLEN clen = d - dstart;
STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
matches++;
s += UTF8SKIP(s);
if (uv != puv) {
- d = uv_to_utf8(d, uv);
+ d = uvchr_to_utf8(d, uv);
puv = uv;
}
continue;
matches++;
s += UTF8SKIP(s);
if (uv != puv) {
- d = uv_to_utf8(d, final);
+ d = uvchr_to_utf8(d, final);
puv = final;
}
continue;
if (uv < none) {
matches++;
s += UTF8SKIP(s);
- d = uv_to_utf8(d, uv);
+ d = uvchr_to_utf8(d, uv);
continue;
}
else if (uv == none) { /* "none" is unmapped character */
else if (uv == extra && !del) {
matches++;
s += UTF8SKIP(s);
- d = uv_to_utf8(d, final);
+ d = uvchr_to_utf8(d, final);
continue;
}
matches++; /* "none+1" is delete character */
s = send - 1;
while (s > start && UTF8_IS_CONTINUATION(*s))
s--;
- if (utf8_to_uv_simple((U8*)s, 0)) {
+ if (utf8_to_uvchr((U8*)s, 0)) {
sv_setpvn(astr, s, send - s);
*s = '\0';
SvCUR_set(sv, s - start);
switch (optype) {
case OP_BIT_AND:
while (lulen && rulen) {
- luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
+ luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
+ ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
rc += ulen;
rulen -= ulen;
duc = luc & ruc;
- dc = (char*)uv_to_utf8((U8*)dc, duc);
+ dc = (char*)uvchr_to_utf8((U8*)dc, duc);
}
if (sv == left || sv == right)
(void)sv_usepvn(sv, dcsave, needlen);
break;
case OP_BIT_XOR:
while (lulen && rulen) {
- luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
+ luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
+ ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
rc += ulen;
rulen -= ulen;
duc = luc ^ ruc;
- dc = (char*)uv_to_utf8((U8*)dc, duc);
+ dc = (char*)uvchr_to_utf8((U8*)dc, duc);
}
goto mop_up_utf;
case OP_BIT_OR:
while (lulen && rulen) {
- luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
+ luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
+ ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
rc += ulen;
rulen -= ulen;
duc = luc | ruc;
- dc = (char*)uv_to_utf8((U8*)dc, duc);
+ dc = (char*)uvchr_to_utf8((U8*)dc, duc);
}
mop_up_utf:
if (sv == left || sv == right)
#define utf8_to_bytes Perl_utf8_to_bytes
#define bytes_from_utf8 Perl_bytes_from_utf8
#define bytes_to_utf8 Perl_bytes_to_utf8
-#define utf8_to_uv_simple Perl_utf8_to_uv_simple
-#define utf8_to_uv Perl_utf8_to_uv
-#define uv_to_utf8 Perl_uv_to_utf8
+#define utf8_to_uvchr Perl_utf8_to_uvchr
+#define utf8_to_uvuni Perl_utf8_to_uvuni
+#define utf8n_to_uvchr Perl_utf8n_to_uvchr
+#define utf8n_to_uvuni Perl_utf8n_to_uvuni
+#define uvchr_to_utf8 Perl_uvchr_to_utf8
+#define uvuni_to_utf8 Perl_uvuni_to_utf8
#define vivify_defelem Perl_vivify_defelem
#define vivify_ref Perl_vivify_ref
#define wait4pid Perl_wait4pid
#define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
#define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c)
#define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
-#define utf8_to_uv_simple(a,b) Perl_utf8_to_uv_simple(aTHX_ a,b)
-#define utf8_to_uv(a,b,c,d) Perl_utf8_to_uv(aTHX_ a,b,c,d)
-#define uv_to_utf8(a,b) Perl_uv_to_utf8(aTHX_ a,b)
+#define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b)
+#define utf8_to_uvuni(a,b) Perl_utf8_to_uvuni(aTHX_ a,b)
+#define utf8n_to_uvchr(a,b,c,d) Perl_utf8n_to_uvchr(aTHX_ a,b,c,d)
+#define utf8n_to_uvuni(a,b,c,d) Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
+#define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX_ a,b)
+#define uvuni_to_utf8(a,b) Perl_uvuni_to_utf8(aTHX_ a,b)
#define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a)
#define vivify_ref(a,b) Perl_vivify_ref(aTHX_ a,b)
#define wait4pid(a,b,c) Perl_wait4pid(aTHX_ a,b,c)
#define bytes_from_utf8 Perl_bytes_from_utf8
#define Perl_bytes_to_utf8 CPerlObj::Perl_bytes_to_utf8
#define bytes_to_utf8 Perl_bytes_to_utf8
-#define Perl_utf8_to_uv_simple CPerlObj::Perl_utf8_to_uv_simple
-#define utf8_to_uv_simple Perl_utf8_to_uv_simple
-#define Perl_utf8_to_uv CPerlObj::Perl_utf8_to_uv
-#define utf8_to_uv Perl_utf8_to_uv
-#define Perl_uv_to_utf8 CPerlObj::Perl_uv_to_utf8
-#define uv_to_utf8 Perl_uv_to_utf8
+#define Perl_utf8_to_uvchr CPerlObj::Perl_utf8_to_uvchr
+#define utf8_to_uvchr Perl_utf8_to_uvchr
+#define Perl_utf8_to_uvuni CPerlObj::Perl_utf8_to_uvuni
+#define utf8_to_uvuni Perl_utf8_to_uvuni
+#define Perl_utf8n_to_uvchr CPerlObj::Perl_utf8n_to_uvchr
+#define utf8n_to_uvchr Perl_utf8n_to_uvchr
+#define Perl_utf8n_to_uvuni CPerlObj::Perl_utf8n_to_uvuni
+#define utf8n_to_uvuni Perl_utf8n_to_uvuni
+#define Perl_uvchr_to_utf8 CPerlObj::Perl_uvchr_to_utf8
+#define uvchr_to_utf8 Perl_uvchr_to_utf8
+#define Perl_uvuni_to_utf8 CPerlObj::Perl_uvuni_to_utf8
+#define uvuni_to_utf8 Perl_uvuni_to_utf8
#define Perl_vivify_defelem CPerlObj::Perl_vivify_defelem
#define vivify_defelem Perl_vivify_defelem
#define Perl_vivify_ref CPerlObj::Perl_vivify_ref
ApMd |U8* |utf8_to_bytes |U8 *s|STRLEN *len
ApMd |U8* |bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
ApMd |U8* |bytes_to_utf8 |U8 *s|STRLEN *len
-Apd |UV |utf8_to_uv_simple|U8 *s|STRLEN* retlen
-Adp |UV |utf8_to_uv |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
-Apd |U8* |uv_to_utf8 |U8 *d|UV uv
+Apd |UV |utf8_to_uvchr |U8 *s|STRLEN* retlen
+Apd |UV |utf8_to_uvuni |U8 *s|STRLEN* retlen
+Adp |UV |utf8n_to_uvchr |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
+Adp |UV |utf8n_to_uvuni |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
+Apd |U8* |uvchr_to_utf8 |U8 *d|UV uv
+Apd |U8* |uvuni_to_utf8 |U8 *d|UV uv
p |void |vivify_defelem |SV* sv
p |void |vivify_ref |SV* sv|U32 to_what
p |I32 |wait4pid |Pid_t pid|int* statusp|int flags
if (!check && ckWARN_d(WARN_UTF8))
{
STRLEN clen;
- UV ch = utf8_to_uv(s+slen,(SvCUR(src)-slen),&clen,0);
- Perl_warner(aTHX_ WARN_UTF8, "\"\\x{%"UVxf"}\" does not map to %s", ch, enc->name[0]);
+ UV ch = utf8n_to_uvuni(s+slen,(SvCUR(src)-slen),&clen,0);
+ Perl_warner(aTHX_ WARN_UTF8, "\"\\N{U+%"UVxf"}\" does not map to %s", ch, enc->name[0]);
/* FIXME: Skip over the character, copy in replacement and continue
* but that is messy so for now just fail.
*/
Perl_utf8_to_bytes
Perl_bytes_from_utf8
Perl_bytes_to_utf8
-Perl_utf8_to_uv_simple
-Perl_utf8_to_uv
-Perl_uv_to_utf8
+Perl_utf8_to_uvchr
+Perl_utf8_to_uvuni
+Perl_utf8n_to_uvchr
+Perl_utf8n_to_uvuni
+Perl_uvchr_to_utf8
+Perl_uvuni_to_utf8
Perl_warn
Perl_vwarn
Perl_warner
#define isPSXSPC_uni(c) (isSPACE_uni(c) ||(c) == '\f')
#define isBLANK_uni(c) isBLANK(c) /* could be wrong */
-#define isALNUM_LC_uni(c) (c < 256 ? isALNUM_LC(c) : is_uni_alnum_lc(c))
-#define isIDFIRST_LC_uni(c) (c < 256 ? isIDFIRST_LC(c) : is_uni_idfirst_lc(c))
-#define isALPHA_LC_uni(c) (c < 256 ? isALPHA_LC(c) : is_uni_alpha_lc(c))
-#define isSPACE_LC_uni(c) (c < 256 ? isSPACE_LC(c) : is_uni_space_lc(c))
-#define isDIGIT_LC_uni(c) (c < 256 ? isDIGIT_LC(c) : is_uni_digit_lc(c))
-#define isUPPER_LC_uni(c) (c < 256 ? isUPPER_LC(c) : is_uni_upper_lc(c))
-#define isLOWER_LC_uni(c) (c < 256 ? isLOWER_LC(c) : is_uni_lower_lc(c))
-#define isALNUMC_LC_uni(c) (c < 256 ? isALNUMC_LC(c) : is_uni_alnumc_lc(c))
-#define isCNTRL_LC_uni(c) (c < 256 ? isCNTRL_LC(c) : is_uni_cntrl_lc(c))
-#define isGRAPH_LC_uni(c) (c < 256 ? isGRAPH_LC(c) : is_uni_graph_lc(c))
-#define isPRINT_LC_uni(c) (c < 256 ? isPRINT_LC(c) : is_uni_print_lc(c))
-#define isPUNCT_LC_uni(c) (c < 256 ? isPUNCT_LC(c) : is_uni_punct_lc(c))
-#define toUPPER_LC_uni(c) (c < 256 ? toUPPER_LC(c) : to_uni_upper_lc(c))
-#define toTITLE_LC_uni(c) (c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c))
-#define toLOWER_LC_uni(c) (c < 256 ? toLOWER_LC(c) : to_uni_lower_lc(c))
+#define isALNUM_LC_uvchr(c) (c < 256 ? isALNUM_LC(c) : is_uni_alnum_lc(c))
+#define isIDFIRST_LC_uvchr(c) (c < 256 ? isIDFIRST_LC(c) : is_uni_idfirst_lc(c))
+#define isALPHA_LC_uvchr(c) (c < 256 ? isALPHA_LC(c) : is_uni_alpha_lc(c))
+#define isSPACE_LC_uvchr(c) (c < 256 ? isSPACE_LC(c) : is_uni_space_lc(c))
+#define isDIGIT_LC_uvchr(c) (c < 256 ? isDIGIT_LC(c) : is_uni_digit_lc(c))
+#define isUPPER_LC_uvchr(c) (c < 256 ? isUPPER_LC(c) : is_uni_upper_lc(c))
+#define isLOWER_LC_uvchr(c) (c < 256 ? isLOWER_LC(c) : is_uni_lower_lc(c))
+#define isALNUMC_LC_uvchr(c) (c < 256 ? isALNUMC_LC(c) : is_uni_alnumc_lc(c))
+#define isCNTRL_LC_uvchr(c) (c < 256 ? isCNTRL_LC(c) : is_uni_cntrl_lc(c))
+#define isGRAPH_LC_uvchr(c) (c < 256 ? isGRAPH_LC(c) : is_uni_graph_lc(c))
+#define isPRINT_LC_uvchr(c) (c < 256 ? isPRINT_LC(c) : is_uni_print_lc(c))
+#define isPUNCT_LC_uvchr(c) (c < 256 ? isPUNCT_LC(c) : is_uni_punct_lc(c))
+#define toUPPER_LC_uvchr(c) (c < 256 ? toUPPER_LC(c) : to_uni_upper_lc(c))
+#define toTITLE_LC_uvchr(c) (c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c))
+#define toLOWER_LC_uvchr(c) (c < 256 ? toLOWER_LC(c) : to_uni_lower_lc(c))
#define isPSXSPC_LC_uni(c) (isSPACE_LC_uni(c) ||(c) == '\f')
#define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */
#define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f')
#define isBLANK_utf8(c) isBLANK(c) /* could be wrong */
-#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
-#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0))
+#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(utf8_to_uvchr(p, 0))
+#define toUPPER_LC_utf8(p) toUPPER_LC_uvchr(utf8_to_uvchr(p, 0))
+#define toTITLE_LC_utf8(p) toTITLE_LC_uvchr(utf8_to_uvchr(p, 0))
+#define toLOWER_LC_utf8(p) toLOWER_LC_uvchr(utf8_to_uvchr(p, 0))
#define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f')
#define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */
#define Perl_bytes_to_utf8 pPerl->Perl_bytes_to_utf8
#undef bytes_to_utf8
#define bytes_to_utf8 Perl_bytes_to_utf8
-#undef Perl_utf8_to_uv_simple
-#define Perl_utf8_to_uv_simple pPerl->Perl_utf8_to_uv_simple
-#undef utf8_to_uv_simple
-#define utf8_to_uv_simple Perl_utf8_to_uv_simple
-#undef Perl_utf8_to_uv
-#define Perl_utf8_to_uv pPerl->Perl_utf8_to_uv
-#undef utf8_to_uv
-#define utf8_to_uv Perl_utf8_to_uv
-#undef Perl_uv_to_utf8
-#define Perl_uv_to_utf8 pPerl->Perl_uv_to_utf8
-#undef uv_to_utf8
-#define uv_to_utf8 Perl_uv_to_utf8
+#undef Perl_utf8_to_uvchr
+#define Perl_utf8_to_uvchr pPerl->Perl_utf8_to_uvchr
+#undef utf8_to_uvchr
+#define utf8_to_uvchr Perl_utf8_to_uvchr
+#undef Perl_utf8_to_uvuni
+#define Perl_utf8_to_uvuni pPerl->Perl_utf8_to_uvuni
+#undef utf8_to_uvuni
+#define utf8_to_uvuni Perl_utf8_to_uvuni
+#undef Perl_utf8n_to_uvchr
+#define Perl_utf8n_to_uvchr pPerl->Perl_utf8n_to_uvchr
+#undef utf8n_to_uvchr
+#define utf8n_to_uvchr Perl_utf8n_to_uvchr
+#undef Perl_utf8n_to_uvuni
+#define Perl_utf8n_to_uvuni pPerl->Perl_utf8n_to_uvuni
+#undef utf8n_to_uvuni
+#define utf8n_to_uvuni Perl_utf8n_to_uvuni
+#undef Perl_uvchr_to_utf8
+#define Perl_uvchr_to_utf8 pPerl->Perl_uvchr_to_utf8
+#undef uvchr_to_utf8
+#define uvchr_to_utf8 Perl_uvchr_to_utf8
+#undef Perl_uvuni_to_utf8
+#define Perl_uvuni_to_utf8 pPerl->Perl_uvuni_to_utf8
+#undef uvuni_to_utf8
+#define uvuni_to_utf8 Perl_uvuni_to_utf8
#undef Perl_warn
#define Perl_warn pPerl->Perl_warn
#undef warn
*ep = d;
return *sp;
}
-
+
/* "register" allocation */
PL_modcount++;
return o;
case OP_CONST:
- if (o->op_private & (OPpCONST_BARE) &&
+ if (o->op_private & (OPpCONST_BARE) &&
!(type == OP_GREPSTART || type == OP_ENTERSUB || type == OP_REFGEN)) {
SV *sv = ((SVOP*)o)->op_sv;
GV *gv;
OP* enter;
gv = gv_fetchpv(SvPV_nolen(sv), TRUE, SVt_PVCV);
- enter = newUNOP(OP_ENTERSUB,0,
- newUNOP(OP_RV2CV, 0,
+ enter = newUNOP(OP_ENTERSUB,0,
+ newUNOP(OP_RV2CV, 0,
newGVOP(OP_GV, 0, gv)
));
enter->op_private |= OPpLVAL_INTRO;
for (j = 0; j < i; j++) {
U8 *s = cp[j];
I32 cur = j < i - 1 ? cp[j+1] - s : tend - s;
- UV val = utf8_to_uv(s, cur, &ulen, 0);
+ /* CHECKME: Use unicode code points for ranges - needs more thought ... NI-S */
+ UV val = utf8n_to_uvuni(s, cur, &ulen, 0);
s += ulen;
diff = val - nextmin;
if (diff > 0) {
- t = uv_to_utf8(tmpbuf,nextmin);
+ t = uvuni_to_utf8(tmpbuf,nextmin);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
if (diff > 1) {
- t = uv_to_utf8(tmpbuf, val - 1);
+ t = uvuni_to_utf8(tmpbuf, val - 1);
sv_catpvn(transv, "\377", 1);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
}
}
if (s < tend && *s == 0xff)
- val = utf8_to_uv(s+1, cur - 1, &ulen, 0);
+ val = utf8n_to_uvuni(s+1, cur - 1, &ulen, 0);
if (val >= nextmin)
nextmin = val + 1;
}
- t = uv_to_utf8(tmpbuf,nextmin);
+ t = uvuni_to_utf8(tmpbuf,nextmin);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
- t = uv_to_utf8(tmpbuf, 0x7fffffff);
+ t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
sv_catpvn(transv, "\377", 1);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
t = (U8*)SvPVX(transv);
while (t < tend || tfirst <= tlast) {
/* see if we need more "t" chars */
if (tfirst > tlast) {
- tfirst = (I32)utf8_to_uv(t, tend - t, &ulen, 0);
+ tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0);
t += ulen;
if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */
t++;
- tlast = (I32)utf8_to_uv(t, tend - t, &ulen, 0);
+ tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0);
t += ulen;
}
else
/* now see if we need more "r" chars */
if (rfirst > rlast) {
if (r < rend) {
- rfirst = (I32)utf8_to_uv(r, rend - r, &ulen, 0);
+ rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0);
r += ulen;
if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */
r++;
- rlast = (I32)utf8_to_uv(r, rend - r, &ulen, 0);
+ rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0);
r += ulen;
}
else
case OP_MATCH:
case OP_SUBST:
o->op_seq = PL_op_seqmax++;
- while (cPMOP->op_pmreplstart &&
+ while (cPMOP->op_pmreplstart &&
cPMOP->op_pmreplstart->op_type == OP_NULL)
cPMOP->op_pmreplstart = cPMOP->op_pmreplstart->op_next;
peep(cPMOP->op_pmreplstart);
if (PERL_REVISION > 127 || PERL_VERSION > 127 || PERL_SUBVERSION > 127)
SvGROW(PL_patchlevel, UTF8_MAXLEN*3+1);
s = (U8*)SvPVX(PL_patchlevel);
- s = uv_to_utf8(s, (UV)(ASCII_TO_NATIVE(PERL_REVISION)));
- s = uv_to_utf8(s, (UV)(ASCII_TO_NATIVE(PERL_VERSION)));
- s = uv_to_utf8(s, (UV)(ASCII_TO_NATIVE(PERL_SUBVERSION)));
+ /* Build version strings using "native" characters */
+ s = uvchr_to_utf8(s, (UV)PERL_REVISION);
+ s = uvchr_to_utf8(s, (UV)PERL_VERSION);
+ s = uvchr_to_utf8(s, (UV)PERL_SUBVERSION);
*s = '\0';
SvCUR_set(PL_patchlevel, s - (U8*)SvPVX(PL_patchlevel));
SvPOK_on(PL_patchlevel);
#ifdef NEED_ENVIRON_DUP_FOR_MODIFY
{
char **env_base;
- for (env_base = env; *env; env++)
+ for (env_base = env; *env; env++)
dup_env_count++;
if ((dup_env_base = (char **)
safesysmalloc( sizeof(char *) * (dup_env_count+1) ))) {
return ((CPerlObj*)pPerl)->Perl_bytes_to_utf8(s, len);
}
-#undef Perl_utf8_to_uv_simple
+#undef Perl_utf8_to_uvchr
UV
-Perl_utf8_to_uv_simple(pTHXo_ U8 *s, STRLEN* retlen)
+Perl_utf8_to_uvchr(pTHXo_ U8 *s, STRLEN* retlen)
{
- return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_simple(s, retlen);
+ return ((CPerlObj*)pPerl)->Perl_utf8_to_uvchr(s, retlen);
}
-#undef Perl_utf8_to_uv
+#undef Perl_utf8_to_uvuni
UV
-Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags)
+Perl_utf8_to_uvuni(pTHXo_ U8 *s, STRLEN* retlen)
{
- return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, curlen, retlen, flags);
+ return ((CPerlObj*)pPerl)->Perl_utf8_to_uvuni(s, retlen);
}
-#undef Perl_uv_to_utf8
+#undef Perl_utf8n_to_uvchr
+UV
+Perl_utf8n_to_uvchr(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags)
+{
+ return ((CPerlObj*)pPerl)->Perl_utf8n_to_uvchr(s, curlen, retlen, flags);
+}
+
+#undef Perl_utf8n_to_uvuni
+UV
+Perl_utf8n_to_uvuni(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags)
+{
+ return ((CPerlObj*)pPerl)->Perl_utf8n_to_uvuni(s, curlen, retlen, flags);
+}
+
+#undef Perl_uvchr_to_utf8
+U8*
+Perl_uvchr_to_utf8(pTHXo_ U8 *d, UV uv)
+{
+ return ((CPerlObj*)pPerl)->Perl_uvchr_to_utf8(d, uv);
+}
+
+#undef Perl_uvuni_to_utf8
U8*
-Perl_uv_to_utf8(pTHXo_ U8 *d, UV uv)
+Perl_uvuni_to_utf8(pTHXo_ U8 *d, UV uv)
{
- return ((CPerlObj*)pPerl)->Perl_uv_to_utf8(d, uv);
+ return ((CPerlObj*)pPerl)->Perl_uvuni_to_utf8(d, uv);
}
#undef Perl_warn
UV result;
register UV buv;
bool buvok = SvUOK(TOPs);
-
+
if (buvok)
buv = SvUVX(TOPs);
else {
send = tmps + len;
while (tmps < send) {
- UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
+ UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
tmps += UTF8SKIP(tmps);
targlen += UNISKIP(~c);
nchar++;
if (nwide) {
Newz(0, result, targlen + 1, U8);
while (tmps < send) {
- UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
+ UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
tmps += UTF8SKIP(tmps);
- result = uv_to_utf8(result, ~c);
+ result = uvchr_to_utf8(result, ~c);
}
*result = '\0';
result -= targlen;
else {
Newz(0, result, nchar + 1, U8);
while (tmps < send) {
- U8 c = (U8)utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY);
+ U8 c = (U8)utf8n_to_uvchr(tmps, 0, &l, UTF8_ALLOW_ANY);
tmps += UTF8SKIP(tmps);
*result++ = ~c;
}
STRLEN len;
U8 *s = (U8*)SvPVx(argsv, len);
- XPUSHu(DO_UTF8(argsv) ? utf8_to_uv_simple(s, 0) : (*s & 0xff));
+ XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff));
RETURN;
}
if (value > 255 && !IN_BYTE) {
SvGROW(TARG, UNISKIP(value)+1);
- tmps = (char*)uv_to_utf8((U8*)SvPVX(TARG), value);
+ tmps = (char*)uvchr_to_utf8((U8*)SvPVX(TARG), value);
SvCUR_set(TARG, tmps - SvPVX(TARG));
*tmps = '\0';
(void)SvPOK_only(TARG);
STRLEN ulen;
U8 tmpbuf[UTF8_MAXLEN+1];
U8 *tend;
- UV uv = utf8_to_uv(s, slen, &ulen, 0);
+ UV uv;
if (PL_op->op_private & OPpLOCALE) {
TAINT;
SvTAINTED_on(sv);
- uv = toTITLE_LC_uni(uv);
+ uv = toTITLE_LC_uvchr(utf8n_to_uvchr(s, slen, &ulen, 0));
}
else
uv = toTITLE_utf8(s);
- tend = uv_to_utf8(tmpbuf, uv);
+ tend = uvchr_to_utf8(tmpbuf, uv);
if (!SvPADTMP(sv) || tend - tmpbuf != ulen || SvREADONLY(sv)) {
dTARGET;
STRLEN ulen;
U8 tmpbuf[UTF8_MAXLEN+1];
U8 *tend;
- UV uv = utf8_to_uv(s, slen, &ulen, 0);
+ UV uv;
if (PL_op->op_private & OPpLOCALE) {
TAINT;
SvTAINTED_on(sv);
- uv = toLOWER_LC_uni(uv);
+ uv = toLOWER_LC_uvchr(utf8n_to_uvchr(s, slen, &ulen, 0));
}
else
uv = toLOWER_utf8(s);
- tend = uv_to_utf8(tmpbuf, uv);
+ tend = uvchr_to_utf8(tmpbuf, uv);
if (!SvPADTMP(sv) || tend - tmpbuf != ulen || SvREADONLY(sv)) {
dTARGET;
TAINT;
SvTAINTED_on(TARG);
while (s < send) {
- d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, len, &ulen, 0)));
+ d = uvchr_to_utf8(d, toUPPER_LC_uvchr( utf8n_to_uvchr(s, len, &ulen, 0)));
s += ulen;
}
}
else {
while (s < send) {
- d = uv_to_utf8(d, toUPPER_utf8( s ));
+ d = uvchr_to_utf8(d, toUPPER_utf8( s ));
s += UTF8SKIP(s);
}
}
TAINT;
SvTAINTED_on(TARG);
while (s < send) {
- d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, len, &ulen, 0)));
+ d = uvchr_to_utf8(d, toLOWER_LC_uvchr( utf8n_to_uvchr(s, len, &ulen, 0)));
s += ulen;
}
}
else {
while (s < send) {
- d = uv_to_utf8(d, toLOWER_utf8(s));
+ d = uvchr_to_utf8(d, toLOWER_utf8(s));
s += UTF8SKIP(s);
}
}
continue;
}
else {
- if (!utf8_to_uv_simple(s, 0))
+ if (!utf8_to_uvchr(s, 0))
break;
up = (char*)s;
s += UTF8SKIP(s);
STRLEN llen;
STRLEN rlen;
register char *pat = SvPV(left, llen);
+#if 0
+ /* Packed side is assumed to be octets - so force downgrade if it
+ has been UTF-8 encoded by accident
+ */
+ register char *s = SvPVbyte(right, rlen);
+#else
register char *s = SvPV(right, rlen);
+#endif
char *strend = s + rlen;
char *strbeg = s;
register char *patend = pat + llen;
if (checksum) {
while (len-- > 0 && s < strend) {
STRLEN alen;
- auint = utf8_to_uv((U8*)s, strend - s, &alen, 0);
+ auint = utf8n_to_uvchr((U8*)s, strend - s, &alen, 0);
along = alen;
s += along;
if (checksum > 32)
EXTEND_MORTAL(len);
while (len-- > 0 && s < strend) {
STRLEN alen;
- auint = utf8_to_uv((U8*)s, strend - s, &alen, 0);
+ auint = utf8n_to_uvchr((U8*)s, strend - s, &alen, 0);
along = alen;
s += along;
sv = NEWSV(37, 0);
fromstr = NEXTFROM;
auint = SvUV(fromstr);
SvGROW(cat, SvCUR(cat) + UTF8_MAXLEN + 1);
- SvCUR_set(cat, (char*)uv_to_utf8((U8*)SvEND(cat),auint)
+ SvCUR_set(cat, (char*)uvchr_to_utf8((U8*)SvEND(cat),auint)
- SvPVX(cat));
}
*SvEND(cat) = '\0';
register char *m = cx->sb_m;
char *orig = cx->sb_orig;
register REGEXP *rx = cx->sb_rx;
-
+
rxres_restore(&cx->sb_rxres, rx);
if (cx->sb_iters++) {
U8 *s = (U8*)SvPVX(sv);
U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
if (s < end) {
- rev = utf8_to_uv(s, end - s, &len, 0);
+ rev = utf8n_to_uvchr(s, end - s, &len, 0);
s += len;
if (s < end) {
- ver = utf8_to_uv(s, end - s, &len, 0);
+ ver = utf8n_to_uvchr(s, end - s, &len, 0);
s += len;
if (s < end)
- sver = utf8_to_uv(s, end - s, &len, 0);
+ sver = utf8n_to_uvchr(s, end - s, &len, 0);
}
}
if (PERL_REVISION < rev
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV U8* Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8);
PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV Perl_utf8_to_uv_simple(pTHX_ U8 *s, STRLEN* retlen);
-PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
-PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
+PERL_CALLCONV UV Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN* retlen);
+PERL_CALLCONV UV Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN* retlen);
+PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
+PERL_CALLCONV UV Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
+PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
+PERL_CALLCONV U8* Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv);
PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv);
PERL_CALLCONV void Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
PERL_CALLCONV I32 Perl_wait4pid(pTHX_ Pid_t pid, int* statusp, int flags);
if (UTF) {
U8 *s = (U8*)STRING(scan);
l = utf8_length(s, s + l);
- uc = utf8_to_uv_simple(s, NULL);
+ uc = utf8_to_uvchr(s, NULL);
}
min += l;
if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
if (UTF) {
U8 *s = (U8 *)STRING(scan);
l = utf8_length(s, s + l);
- uc = utf8_to_uv_simple(s, NULL);
+ uc = utf8_to_uvchr(s, NULL);
}
min += l;
if (data && (flags & SCF_DO_SUBSTR))
default:
normal_default:
if (UTF8_IS_START(*p) && UTF) {
- ender = utf8_to_uv((U8*)p, RExC_end - p,
+ ender = utf8n_to_uvuni((U8*)p, RExC_end - p,
&numlen, 0);
p += numlen;
}
p = regwhite(p, RExC_end);
if (UTF && FOLD) {
if (LOC)
- ender = toLOWER_LC_uni(ender);
+ ender = toLOWER_LC_uvchr(UNI_TO_NATIVE(ender));
else
ender = toLOWER_uni(ender);
}
if (!range)
rangebegin = RExC_parse;
if (UTF) {
- value = utf8_to_uv((U8*)RExC_parse,
+ value = utf8n_to_uvuni((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, 0);
RExC_parse += numlen;
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
if (UTF) {
- value = utf8_to_uv((U8*)RExC_parse,
+ value = utf8n_to_uvuni((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, 0);
RExC_parse += numlen;
STATIC void
S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
{
- *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s);
+ *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvuni_to_utf8((U8*)s, uv) - (U8*)s);
}
/*
U8 s[UTF8_MAXLEN+1];
for (i = 0; i <= 256; i++) { /* just the first 256 */
- U8 *e = uv_to_utf8(s, i);
+ U8 *e = uvuni_to_utf8(s, i);
if (i < 256 && swash_fetch(sw, s)) {
if (rangestart == -1)
if (i <= rangestart + 3)
for (; rangestart < i; rangestart++) {
- for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+ for(e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
put_byte(sv, *p);
}
else {
- for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+ for (e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
put_byte(sv, *p);
sv_catpv(sv, "-");
- for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
+ for (e = uvuni_to_utf8(s, i - 1), p = s; p < e; p++)
put_byte(sv, *p);
}
rangestart = -1;
# define Perl_re_intuit_start my_re_intuit_start
/* *These* symbols are masked to allow static link. */
# define Perl_pregexec my_pregexec
-# define Perl_reginitcolors my_reginitcolors
+# define Perl_reginitcolors my_reginitcolors
# define Perl_regclass_swash my_regclass_swash
# define PERL_NO_GET_CONTEXT
-#endif
+#endif
/*SUPPRESS 112*/
/*
DEBUG_r(
PerlIO_printf(Perl_debug_log,
" restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
- (UV)paren, (IV)PL_regstartp[paren],
+ (UV)paren, (IV)PL_regstartp[paren],
(IV)(PL_reg_start_tmp[paren] - PL_bostr),
- (IV)PL_regendp[paren],
+ (IV)PL_regendp[paren],
(paren > *PL_reglastparen ? "(no)" : ""));
);
}
/* nosave: For optimizations. */
{
return
- regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
+ regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
nosave ? 0 : REXEC_COPY_STR);
}
PL_regprogram = prog->program;
#endif
PL_regnpar = prog->nparens;
- PL_regdata = prog->data;
- PL_reg_re = prog;
+ PL_regdata = prog->data;
+ PL_reg_re = prog;
}
-/*
+/*
* Need to implement the following flags for reg_anch:
*
* USE_INTUIT_NOML - Useful to call re_intuit_start() first
if (SvTAIL(check)) {
slen = SvCUR(check); /* >= 1 */
- if ( strend - s > slen || strend - s < slen - 1
+ if ( strend - s > slen || strend - s < slen - 1
|| (strend - s == slen && strend[-1] != '\n')) {
DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
goto fail_finish;
|| ( BmRARE(check) == '\n'
&& (BmPREVIOUS(check) == SvCUR(check) - 1)
&& SvTAIL(check) ))
- s = screaminstr(sv, check,
+ s = screaminstr(sv, check,
start_shift + (s - strbeg), end_shift, pp, 0);
else
goto fail_finish;
Thus we can arrive here only if check substr
is float. Redo checking for "other"=="fixed".
*/
- strpos = t + 1;
+ strpos = t + 1;
DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
goto do_other_anchored;
if (t + start_shift >= check_at) /* Contradicts floating=check */
goto retry_floating_check;
/* Recheck anchored substring, but not floating... */
- s = check_at;
+ s = check_at;
if (!check)
goto giveup;
DEBUG_r( PerlIO_printf(Perl_debug_log,
goto hop_and_restart;
}
DEBUG_r( if (t != s)
- PerlIO_printf(Perl_debug_log,
+ PerlIO_printf(Perl_debug_log,
"By STCLASS: moving %ld --> %ld\n",
(long)(t - i_strpos), (long)(s - i_strpos));
else
- PerlIO_printf(Perl_debug_log,
+ PerlIO_printf(Perl_debug_log,
"Does not contradict STCLASS...\n") );
}
giveup:
STRLEN len;
if (c1 == c2)
while (s <= e) {
- if ( utf8_to_uv_simple((U8*)s, &len) == c1
+ if ( utf8_to_uvchr((U8*)s, &len) == c1
&& regtry(prog, s) )
goto got_it;
s += len;
}
else
while (s <= e) {
- UV c = utf8_to_uv_simple((U8*)s, &len);
+ UV c = utf8_to_uvchr((U8*)s, &len);
if ( (c == c1 || c == c2) && regtry(prog, s) )
goto got_it;
s += len;
tmp = '\n';
else {
U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
-
- tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
+
+ tmp = (I32)utf8n_to_uvuni(r, s - (char*)r, 0, 0);
}
tmp = ((OP(c) == BOUND ?
- isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
+ isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
while (s < strend) {
if (tmp == !(OP(c) == BOUND ?
swash_fetch(PL_utf8_alnum, (U8*)s) :
tmp = '\n';
else {
U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
-
- tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
+
+ tmp = (I32)utf8n_to_uvuni(r, s - (char*)r, 0, 0);
}
tmp = ((OP(c) == NBOUND ?
- isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
+ isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
while (s < strend) {
if (tmp == !(OP(c) == NBOUND ?
swash_fetch(PL_utf8_alnum, (U8*)s) :
else {
if (prog->reganch & ROPT_UTF8 && do_utf8) {
U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
- PL_regprev = utf8_to_uv(s, (U8*)stringarg - s, NULL, 0);
+ PL_regprev = utf8n_to_uvuni(s, (U8*)stringarg - s, NULL, 0);
}
else
PL_regprev = (U32)stringarg[-1];
}
/* Messy cases: unanchored match. */
- if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
+ if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
/* we have /x+whatever/ */
/* it must be a one character string (XXXX Except UTF?) */
char ch = SvPVX(prog->anchored_substr)[0];
/*SUPPRESS 560*/
else if (do_utf8 == (UTF!=0) &&
(prog->anchored_substr != Nullsv
- || (prog->float_substr != Nullsv
+ || (prog->float_substr != Nullsv
&& prog->float_max_offset < strend - s))) {
- SV *must = prog->anchored_substr
+ SV *must = prog->anchored_substr
? prog->anchored_substr : prog->float_substr;
- I32 back_max =
+ I32 back_max =
prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
- I32 back_min =
+ I32 back_min =
prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
char *last = HOP3c(strend, /* Cannot start after this */
-(I32)(CHR_SVLEN(must)
dontbother = end_shift;
strend = HOPc(strend, -dontbother);
while ( (s <= last) &&
- ((flags & REXEC_SCREAM)
+ ((flags & REXEC_SCREAM)
? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
end_shift, &scream_pos, 0))
: (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
- (unsigned char*)strend, must,
+ (unsigned char*)strend, must,
PL_multiline ? FBMrf_MULTILINE : 0))) ) {
DEBUG_r( did_match = 1 );
if (HOPc(s, -back_max) > last1) {
if (memEQ(strend - len + 1, little, len - 1))
last = strend - len + 1;
else if (!PL_multiline)
- last = memEQ(strend - len, little, len)
+ last = memEQ(strend - len, little, len)
? strend - len : Nullch;
else
goto find_last;
} else {
find_last:
- if (len)
+ if (len)
last = rninstr(s, strend, little, little + len);
else
last = strend; /* matching `$' */
prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
}
}
-
+
return 1;
phooey:
DEFSV = PL_reg_sv;
}
- if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
+ if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
&& (mg = mg_find(PL_reg_sv, 'g')))) {
/* prepare for quick setting of pos */
sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
after the current position the third one.
We assume that pref0_len <= pref_len, otherwise we
decrease pref0_len. */
- int pref_len = (locinput - PL_bostr) > (5 + taill) - l
+ int pref_len = (locinput - PL_bostr) > (5 + taill) - l
? (5 + taill) - l : locinput - PL_bostr;
int pref0_len;
pref_len++;
pref0_len = pref_len - (locinput - PL_reg_starttry);
if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
- l = ( PL_regeol - locinput > (5 + taill) - pref_len
+ l = ( PL_regeol - locinput > (5 + taill) - pref_len
? (5 + taill) - pref_len : PL_regeol - locinput);
while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
l--;
if (pref0_len > pref_len)
pref0_len = pref_len;
regprop(prop, scan);
- PerlIO_printf(Perl_debug_log,
+ PerlIO_printf(Perl_debug_log,
"%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
- (IV)(locinput - PL_bostr),
- PL_colors[4], pref0_len,
+ (IV)(locinput - PL_bostr),
+ PL_colors[4], pref0_len,
locinput - pref_len, PL_colors[5],
- PL_colors[2], pref_len - pref0_len,
+ PL_colors[2], pref_len - pref0_len,
locinput - pref_len + pref0_len, PL_colors[3],
(docolor ? "" : "> <"),
PL_colors[0], l, locinput, PL_colors[1],
case BOL:
if (locinput == PL_bostr
? PL_regprev == '\n'
- : (PL_multiline &&
+ : (PL_multiline &&
(nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
{
/* regtill = regbol; */
while (s < e) {
if (l >= PL_regeol)
sayNO;
- if (*((U8*)s) != utf8_to_uv_simple((U8*)l, &len))
+ if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
sayNO;
s++;
l += len;
while (s < e) {
if (l >= PL_regeol)
sayNO;
- if (*((U8*)l) != utf8_to_uv_simple((U8*)s, &len))
+ if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
sayNO;
s += len;
l++;
if (l >= PL_regeol) {
sayNO;
}
- if ((UTF ? utf8_to_uv((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
+ if ((UTF ? utf8n_to_uvuni((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
(c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
sayNO;
s += UTF ? UTF8SKIP(s) : 1;
ln = PL_regprev;
else {
U8 *r = reghop((U8*)locinput, -1);
-
- ln = utf8_to_uv(r, s - (char*)r, 0, 0);
+
+ ln = utf8n_to_uvuni(r, s - (char*)r, 0, 0);
}
if (OP(scan) == BOUND || OP(scan) == NBOUND) {
ln = isALNUM_uni(ln);
n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
}
else {
- ln = isALNUM_LC_uni(ln);
+ ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
n = isALNUM_LC_utf8((U8*)locinput);
}
}
COP *ocurcop = PL_curcop;
SV **ocurpad = PL_curpad;
SV *ret;
-
+
n = ARG(scan);
PL_op = (OP_4tree*)PL_regdata->data[n];
DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
SPAGAIN;
ret = POPs;
PUTBACK;
-
+
PL_op = oop;
PL_curpad = ocurpad;
PL_curcop = ocurcop;
pm.op_pmflags = 0;
re = CALLREGCOMP(aTHX_ t, t + len, &pm);
- if (!(SvFLAGS(ret)
+ if (!(SvFLAGS(ret)
& (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
PL_regprecomp = oprecomp;
PL_regnpar = onpar;
}
DEBUG_r(
- PerlIO_printf(Perl_debug_log,
+ PerlIO_printf(Perl_debug_log,
"Entering embedded `%s%.60s%s%s'\n",
PL_colors[0],
re->precomp,
state.re = PL_reg_re;
PL_regcc = 0;
-
+
cp = regcppush(0); /* Save *all* the positions. */
REGCP_SET(lastcp);
cache_re(re);
1) After matching X, regnode for CURLYX is processed;
- 2) This regnode creates infoblock on the stack, and calls
+ 2) This regnode creates infoblock on the stack, and calls
regmatch() recursively with the starting point at WHILEM node;
3) Each hit of WHILEM node tries to match A and Z (in the order
and whatever it mentions via ->next, and additional attached trees
corresponding to temporarily unset infoblocks as in "5" above.
- In the following picture infoblocks for outer loop of
+ In the following picture infoblocks for outer loop of
(Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
infoblocks are drawn below the "reset" infoblock.
PL_reginput = locinput;
DEBUG_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %ld out of %ld..%ld cc=%lx\n",
+ PerlIO_printf(Perl_debug_log,
+ "%*s %ld out of %ld..%ld cc=%lx\n",
REPORT_CODE_OFF+PL_regindent*2, "",
- (long)n, (long)cc->min,
+ (long)n, (long)cc->min,
(long)cc->max, (long)cc)
);
PL_regcc = cc;
if (n >= cc->max) { /* Maximum greed exceeded? */
- if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
+ if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
&& !(PL_reg_flags & RF_warned)) {
PL_reg_flags |= RF_warned;
Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
REPORT_CODE_OFF+PL_regindent*2, "")
);
}
- if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
+ if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
&& !(PL_reg_flags & RF_warned)) {
PL_reg_flags |= RF_warned;
Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
sayNO;
}
/* NOT REACHED */
- case BRANCHJ:
+ case BRANCHJ:
next = scan + ARG(scan);
if (next == scan)
next = NULL;
inner = NEXTOPER(NEXTOPER(scan));
goto do_branch;
- case BRANCH:
+ case BRANCH:
inner = NEXTOPER(scan);
do_branch:
{
{
I32 l = 0;
CHECKPOINT lastcp;
-
+
/* We suppose that the next guy does not need
backtracking: in particular, it is of constant length,
and has no parenths to influence future backrefs. */
c2 = to_utf8_upper(s);
}
else {
- c2 = c1 = utf8_to_uv_simple(s, NULL);
+ c2 = c1 = utf8_to_uvchr(s, NULL);
}
}
}
while (locinput <= e && *locinput != c1)
locinput++;
} else {
- while (locinput <= e
+ while (locinput <= e
&& *locinput != c1
&& *locinput != c2)
locinput++;
if (c1 == c2) {
for (count = 0;
locinput <= e &&
- utf8_to_uv_simple((U8*)locinput, &len) != c1;
+ utf8_to_uvchr((U8*)locinput, &len) != c1;
count++)
locinput += len;
} else {
for (count = 0; locinput <= e; count++) {
- UV c = utf8_to_uv_simple((U8*)locinput, &len);
+ UV c = utf8_to_uvchr((U8*)locinput, &len);
if (c == c1 || c == c2)
break;
- locinput += len;
+ locinput += len;
}
}
}
- if (locinput > e)
+ if (locinput > e)
sayNO;
/* PL_reginput == old now */
if (locinput != old) {
UV c;
if (c1 != -1000) {
if (do_utf8)
- c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
+ c = utf8_to_uvchr((U8*)PL_reginput, NULL);
else
- c = UCHARAT(PL_reginput);
+ c = UCHARAT(PL_reginput);
}
/* If it could work, try it. */
if (c1 == -1000 || c == c1 || c == c2)
while (n >= ln) {
if (c1 != -1000) {
if (do_utf8)
- c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
+ c = utf8_to_uvchr((U8*)PL_reginput, NULL);
else
- c = UCHARAT(PL_reginput);
+ c = UCHARAT(PL_reginput);
}
/* If it could work, try it. */
if (c1 == -1000 || c == c1 || c == c2)
while (n >= ln) {
if (c1 != -1000) {
if (do_utf8)
- c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
+ c = utf8_to_uvchr((U8*)PL_reginput, NULL);
else
- c = UCHARAT(PL_reginput);
+ c = UCHARAT(PL_reginput);
}
/* If it could work, try it. */
if (c1 == -1000 || c == c1 || c == c2)
case SUSPEND:
n = 1;
PL_reginput = locinput;
- goto do_ifmatch;
+ goto do_ifmatch;
case UNLESSM:
n = 0;
if (scan->flags) {
PL_reginput = s;
}
else {
- if (locinput < PL_bostr + scan->flags)
+ if (locinput < PL_bostr + scan->flags)
goto say_yes;
PL_reginput = locinput - scan->flags;
goto do_ifmatch;
PL_reginput = s;
}
else {
- if (locinput < PL_bostr + scan->flags)
+ if (locinput < PL_bostr + scan->flags)
goto say_no;
PL_reginput = locinput - scan->flags;
goto do_ifmatch;
{
re_unwind_branch_t *uwb = &(uw->branch);
I32 lastparen = uwb->lastparen;
-
+
REGCP_UNWIND(uwb->lastcp);
for (n = *PL_reglastparen; n > lastparen; n--)
PL_regendp[n] = -1;
*PL_reglastparen = n;
scan = next = uwb->next;
- if ( !scan ||
- OP(scan) != (uwb->type == RE_UNWIND_BRANCH
+ if ( !scan ||
+ OP(scan) != (uwb->type == RE_UNWIND_BRANCH
? BRANCH : BRANCHJ) ) { /* Failure */
unwind = uwb->prev;
#ifdef DEBUGGING
c = scan - PL_reginput;
PL_reginput = scan;
- DEBUG_r(
+ DEBUG_r(
{
SV *prop = sv_newmortal();
regprop(prop, p);
- PerlIO_printf(Perl_debug_log,
- "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
+ PerlIO_printf(Perl_debug_log,
+ "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
});
-
+
return(c);
}
/*
- regrepeat_hard - repeatedly match something, report total lenth and length
- *
+ *
* The repeater is supposed to have constant length.
*/
}
if (!res)
PL_reginput = scan;
-
+
return count;
}
SV *rv = (SV*)PL_regdata->data[n];
AV *av = (AV*)SvRV((SV*)rv);
SV **a;
-
+
si = *av_fetch(av, 0, FALSE);
a = av_fetch(av, 1, FALSE);
-
+
if (a)
sw = *a;
else if (si && doinit) {
STRLEN len;
if (do_utf8)
- c = utf8_to_uv_simple(p, &len);
+ c = utf8_to_uvchr(p, &len);
else
c = *p;
match = TRUE;
else if (flags & ANYOF_FOLD) {
U8 tmpbuf[UTF8_MAXLEN+1];
-
+
if (flags & ANYOF_LOCALE) {
PL_reg_flags |= RF_tainted;
- uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
+ uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
}
else
- uv_to_utf8(tmpbuf, toLOWER_utf8(p));
+ uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
if (swash_fetch(sw, tmpbuf))
match = TRUE;
}
STATIC U8 *
S_reghop(pTHX_ U8 *s, I32 off)
-{
+{
return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
}
STATIC U8 *
S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
-{
+{
if (off >= 0) {
while (off-- && s < lim) {
/* XXX could check well-formedness here */
STATIC U8 *
S_reghopmaybe(pTHX_ U8 *s, I32 off)
-{
+{
return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
}
len = 0;
while (s < send) {
STRLEN n;
-
- if (utf8_to_uv(s, UTF8SKIP(s), &n, 0)) {
+ /* We can use low level directly here as we are not looking at the values */
+ if (utf8n_to_uvuni(s, UTF8SKIP(s), &n, 0)) {
s += n;
len++;
}
uv = args ? va_arg(*args, int) : SvIVx(argsv);
if ((uv > 255 || (uv > 127 && SvUTF8(sv))) && !IN_BYTE) {
eptr = (char*)utf8buf;
- elen = uv_to_utf8((U8*)eptr, uv) - utf8buf;
+ elen = uvchr_to_utf8((U8*)eptr, uv) - utf8buf;
is_utf = TRUE;
}
else {
if (!veclen)
continue;
if (vec_utf)
- iv = (IV)utf8_to_uv(vecstr, veclen, &ulen, 0);
+ iv = (IV)utf8n_to_uvchr(vecstr, veclen, &ulen, 0);
else {
iv = *vecstr;
ulen = 1;
}
- if (iv <256)
- iv = NATIVE_TO_ASCII(iv); /* v-strings are codepoints */
vecstr += ulen;
veclen -= ulen;
}
if (!veclen)
continue;
if (vec_utf)
- uv = utf8_to_uv(vecstr, veclen, &ulen, 0);
+ uv = utf8n_to_uvchr(vecstr, veclen, &ulen, 0);
else {
uv = *vecstr;
ulen = 1;
}
- if (uv <256)
- uv = NATIVE_TO_ASCII(uv); /* v-strings are codepoints */
vecstr += ulen;
veclen -= ulen;
}
STATIC void
S_tokereport(pTHX_ char *thing, char* s, I32 rv)
-{
+{
SV *report;
DEBUG_T({
report = newSVpv(thing, 0);
STRLEN skip;
UV n;
if (utf)
- n = utf8_to_uv((U8*)start, len, &skip, 0);
+ n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
else {
n = *(U8*)start;
skip = 1;
if (hicount) {
char *old_pvx = SvPVX(sv);
char *src, *dst;
-
+
d = SvGROW(sv,
SvLEN(sv) + hicount + 1) +
(d - old_pvx);
}
if (has_utf8 || uv > 255) {
- d = (char*)uv_to_utf8((U8*)d, uv);
+ d = (char*)uvchr_to_utf8((U8*)d, uv);
has_utf8 = TRUE;
if (PL_lex_inwhat == OP_TRANS &&
PL_sublex_info.sub_op) {
STRLEN len = (STRLEN) -1;
UV uv;
if (this_utf8) {
- uv = utf8_to_uv((U8*)s, send - s, &len, 0);
+ uv = utf8n_to_uvchr((U8*)s, send - s, &len, 0);
}
if (len == (STRLEN)-1) {
/* Illegal UTF8 (a high-bit byte), make it valid. */
char *old_pvx = SvPVX(sv);
/* need space for one extra char (NOTE: SvCUR() not set here) */
d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
- d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+ d = (char*)uvchr_to_utf8((U8*)d, (U8)*s++);
}
else {
while (len--)
"Integer overflow in decimal number");
}
}
- /* THIS IS EVIL */
- if (rev < 256)
- rev = ASCII_TO_NATIVE(rev);
-
- tmpend = uv_to_utf8(tmpbuf, rev);
+ /* Append native character for the rev point */
+ tmpend = uvchr_to_utf8(tmpbuf, rev);
if (rev > revmax)
revmax = rev;
sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
/* Unicode support */
/*
-=for apidoc A|U8*|uv_to_utf8|U8 *d|UV uv
+=for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
+end of the new character. In other words,
- d = uv_to_utf8(d, uv);
+ d = uvuni_to_utf8(d, uv);
is the recommended Unicode-aware way of saying
*/
U8 *
-Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
+Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
{
- if (uv < 0x100)
- uv = NATIVE_TO_ASCII(uv);
if (uv < 0x80) {
*d++ = uv;
return d;
}
/*
+=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
+
+Adds the UTF8 representation of the Native codepoint C<uv> to the end
+of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
+
+ d = uvchr_to_utf8(d, uv);
+
+is the recommended wide native character-aware way of saying
+
+ *(d++) = uv;
+
+=cut
+*/
+
+U8 *
+Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
+{
+ if (uv < 0x100)
+ uv = NATIVE_TO_ASCII(uv);
+ return Perl_uvuni_to_utf8(aTHX_ d, uv);
+}
+
+
+/*
=for apidoc A|STRLEN|is_utf8_char|U8 *s
Tests if some arbitrary number of bytes begins in a valid UTF-8
character. Note that an ASCII character is a valid UTF-8 character.
The actual number of bytes in the UTF-8 character will be returned if
it is valid, otherwise 0.
-
+
=cut */
STRLEN
Perl_is_utf8_char(pTHX_ U8 *s)
}
/*
-=for apidoc A|UV|utf8_to_uv|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
+=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
-Returns the character value of the first character in the string C<s>
+Bottom level UTF-8 decode routine.
+Returns the unicode code point value of the first character in the string C<s>
which is assumed to be in UTF8 encoding and no longer than C<curlen>;
C<retlen> will be set to the length, in bytes, of that character.
The C<flags> can also contain various flags to allow deviations from
the strict UTF-8 encoding (see F<utf8.h>).
+Most code should use utf8_to_uvchr() rather than call this directly.
+
=cut */
UV
-Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
+Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
{
UV uv = *s, ouv;
STRLEN len = 1;
if (UTF8_IS_ASCII(uv)) {
if (retlen)
*retlen = 1;
- return ASCII_TO_NATIVE(*s);
+ return (UV) (*s);
}
if (UTF8_IS_CONTINUATION(uv) &&
warning = UTF8_WARN_NON_CONTINUATION;
goto malformed;
}
-
+
if ((uv == 0xfe || uv == 0xff) &&
!(flags & UTF8_ALLOW_FE_FF)) {
warning = UTF8_WARN_FE_FF;
if (retlen)
*retlen = len;
-
+
expectlen = len;
if ((curlen < expectlen) &&
}
/*
-=for apidoc A|U8* s|utf8_to_uv_simple|STRLEN *retlen
+=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
-Returns the character value of the first character in the string C<s>
+Returns the native character value of the first character in the string C<s>
which is assumed to be in UTF8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
+Allows length and flags to be passed to low level routine.
+
+=cut
+*/
+
+UV
+Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
+{
+ UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
+ if (uv < 0x100)
+ return (UV) ASCII_TO_NATIVE(uv);
+ return uv;
+}
+
+/*
+=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
+
+Returns the native character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+If C<s> does not point to a well-formed UTF8 character, zero is
+returned and retlen is set, if possible, to -1.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
+{
+ return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+}
+
+/*
+=for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
+
+Returns the Unicode code point of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+This function should only be used when returned UV is considered
+an index into the Unicode semantic tables (e.g. swashes).
+
If C<s> does not point to a well-formed UTF8 character, zero is
returned and retlen is set, if possible, to -1.
*/
UV
-Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen)
+Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
{
- return Perl_utf8_to_uv(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+ /* Call the low level routine asking for checks */
+ return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
}
/*
d = s = save;
while (s < send) {
STRLEN ulen;
- *d++ = (U8)utf8_to_uv_simple(s, &ulen);
+ *d++ = (U8)utf8_to_uvchr(s, &ulen);
s += ulen;
}
*d = '\0';
Perl_is_uni_alnum(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_alnum(tmpbuf);
}
Perl_is_uni_alnumc(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_alnumc(tmpbuf);
}
Perl_is_uni_idfirst(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_idfirst(tmpbuf);
}
Perl_is_uni_alpha(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_alpha(tmpbuf);
}
Perl_is_uni_ascii(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_ascii(tmpbuf);
}
Perl_is_uni_space(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_space(tmpbuf);
}
Perl_is_uni_digit(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_digit(tmpbuf);
}
Perl_is_uni_upper(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_upper(tmpbuf);
}
Perl_is_uni_lower(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_lower(tmpbuf);
}
Perl_is_uni_cntrl(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_cntrl(tmpbuf);
}
Perl_is_uni_graph(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_graph(tmpbuf);
}
Perl_is_uni_print(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_print(tmpbuf);
}
Perl_is_uni_punct(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_punct(tmpbuf);
}
Perl_is_uni_xdigit(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return is_utf8_xdigit(tmpbuf);
}
Perl_to_uni_upper(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return to_utf8_upper(tmpbuf);
}
Perl_to_uni_title(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return to_utf8_title(tmpbuf);
}
Perl_to_uni_lower(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uv_to_utf8(tmpbuf, (UV)c);
+ uvuni_to_utf8(tmpbuf, (UV)c);
return to_utf8_lower(tmpbuf);
}
if (!PL_utf8_toupper)
PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_toupper, p);
- return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
+ return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
}
UV
if (!PL_utf8_totitle)
PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_totitle, p);
- return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
+ return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
}
UV
if (!PL_utf8_tolower)
PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_tolower, p);
- return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
+ return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
}
/* a "swash" is a swatch hash */
PUSHMARK(SP);
EXTEND(SP,3);
PUSHs((SV*)sv);
- PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, UTF8_MAXLEN, 0, 0) & ~(needents - 1))));
+ /* We call utf8_to_uni as we want and index into Unicode tables,
+ not a native character number.
+ */
+ PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
PUSHs(sv_2mortal(newSViv(needents)));
PUTBACK;
if (call_method("SWASHGET", G_SCALAR))
#define UTF8_QUAD_MAX UINT64_C(0x1000000000)
/*
-
+
The following table is from Unicode 3.1.
Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
(uv) < 0x200000 ? 4 : \
(uv) < 0x4000000 ? 5 : \
(uv) < 0x80000000 ? 6 : \
- (uv) < UTF8_QUAD_MAX ? 7 : 13 )
+ (uv) < UTF8_QUAD_MAX ? 7 : 13 )
#else
/* No, I'm not even going to *TRY* putting #ifdef inside a #define */
#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \
#define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1)
#define isALNUM_lazy(p) isALNUM_lazy_if(p,1)
-/* EBCDIC-happy ways of converting native code to UTF8; the reverse
- process is taken care of in utf8_to_uv */
+/* EBCDIC-happy ways of converting native code to UTF8 */
#ifdef EBCDIC
#define NATIVE_TO_ASCII(ch) PL_e2a[(ch)]
#define ASCII_TO_NATIVE(ch) PL_a2e[(ch)]
+#define UNI_TO_NATIVE(ch) (((ch) > 0x100) ? (ch) : (UV) PL_a2e[(ch)])
+#define NATIVE_TO_UNI(ch) (((ch) > 0x100) ? (ch) : (UV) PL_e2a[(ch)])
#else
#define NATIVE_TO_ASCII(ch) (ch)
#define ASCII_TO_NATIVE(ch) (ch)
+#define UNI_TO_NATIVE(ch) (ch)
+#define NATIVE_TO_UNI(ch) (ch)
#endif
-#define UTF8_NEEDS_UPGRADE(ch) (NATIVE_TO_ASCII(ch) & 0x80)
-#define NATIVE_TO_UTF8(ch, string) STMT_START { \
- if (!UTF8_NEEDS_UPGRADE(ch)) \
- *(string)++ = NATIVE_TO_ASCII(ch); \
- else /* uv_to_utf8 is EBCDIC-aware */ \
- string = uv_to_utf8(string, ch); \
- } STMT_END