- Loose the extra level of function on ASCII.
- spotted a chr(0) issue in sv.c
- re-work of UTF-X tr/// ranges to work in Unicode
space. Still issues with the "0xff is illegal UTF-8" hack.
- Yet another ad. hoc. utf8 'upgrade' in op.c recoded
(why do it once when you can do it all over the place :-(
- Enable HINTS_UTF8 on EBCDIC - then ignore it in toke.c,
need utf8.pm for swashes.
- Simplified and commented scan_const() in toke.c
Still something wrong regexp and tr (swashes?).
p4raw-id: //depot/perlio@9267
close(DOC);
+unlink "pod/perlintern.pod";
+
open(GUTS, ">pod/perlintern.pod") or
die "Unable to create pod/perlintern.pod: $!\n";
print GUTS <<'END';
package utf8;
-if (ord('A') != 193) { # make things more pragmatic for EBCDIC folk
$utf8::hint_bits = 0x00800000;
Carp::croak("Undefined subroutine $AUTOLOAD called");
}
-}
-
1;
__END__
*sp = d;
while (s < e) {
- if (*s < 0x80 || *s == 0xff)
+ if (NATIVE_IS_INVARIANT(*s) || NATIVE_TO_UTF(*s) == 0xff)
*d++ = *s++;
else {
- U8 c = *s++;
- *d++ = ((c >> 6) | 0xc0);
- *d++ = ((c & 0x3f) | 0x80);
+ U8 c = NATIVE_TO_ASCII(*s++);
+ *d++ = UTF8_EIGHT_BIT_HI(c);
+ *d++ = UTF8_EIGHT_BIT_LO(c);
}
}
*ep = d;
}
static int
-utf8compare(const void *a, const void *b)
-{
- int i;
- for (i = 0; i < 10; i++) {
- if ((*(U8**)a)[i] < (*(U8**)b)[i])
- return -1;
- if ((*(U8**)a)[i] > (*(U8**)b)[i])
- return 1;
- }
+uvcompare(const void *a, const void *b)
+{
+ if (*((UV *)a) < (*(UV *)b))
+ return -1;
+ if (*((UV *)a) > (*(UV *)b))
+ return 1;
+ if (*((UV *)a+1) < (*(UV *)b+1))
+ return -1;
+ if (*((UV *)a+1) > (*(UV *)b+1))
+ return 1;
return 0;
}
U8* tsave = from_utf ? NULL : trlist_upgrade(&t, &tend);
U8* rsave = (to_utf || !rlen) ? NULL : trlist_upgrade(&r, &rend);
+/* There are several snags with this code on EBCDIC:
+ 1. 0xFF is a legal UTF-EBCDIC byte (there are no illegal bytes).
+ 2. scan_const() in toke.c has encoded chars in native encoding which makes
+ ranges at least in EBCDIC 0..255 range the bottom odd.
+*/
+
if (complement) {
U8 tmpbuf[UTF8_MAXLEN+1];
- U8** cp;
+ UV *cp;
UV nextmin = 0;
- New(1109, cp, tlen, U8*);
+ New(1109, cp, 2*tlen, UV);
i = 0;
transv = newSVpvn("",0);
while (t < tend) {
- cp[i++] = t;
- t += UTF8SKIP(t);
- if (t < tend && *t == 0xff) {
+ cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, 0);
+ t += ulen;
+ if (t < tend && NATIVE_TO_UTF(*t) == 0xff) {
t++;
- t += UTF8SKIP(t);
+ cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, 0);
+ t += ulen;
}
+ else {
+ cp[2*i+1] = cp[2*i];
+ }
+ i++;
}
- qsort(cp, i, sizeof(U8*), utf8compare);
+ qsort(cp, i, 2*sizeof(UV), uvcompare);
for (j = 0; j < i; j++) {
- U8 *s = cp[j];
- I32 cur = j < i - 1 ? cp[j+1] - s : tend - s;
- /* CHECKME: Use unicode code points for ranges - needs more thought ... NI-S */
- UV val = utf8n_to_uvuni(s, cur, &ulen, 0);
- s += ulen;
+ UV val = cp[2*j];
diff = val - nextmin;
if (diff > 0) {
t = uvuni_to_utf8(tmpbuf,nextmin);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
if (diff > 1) {
+ U8 range_mark = UTF_TO_NATIVE(0xff);
t = uvuni_to_utf8(tmpbuf, val - 1);
- sv_catpvn(transv, "\377", 1);
+ sv_catpvn(transv, (char *)&range_mark, 1);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
}
}
- if (s < tend && *s == 0xff)
- val = utf8n_to_uvuni(s+1, cur - 1, &ulen, 0);
+ val = cp[2*j+1];
if (val >= nextmin)
nextmin = val + 1;
}
t = uvuni_to_utf8(tmpbuf,nextmin);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
+ {
+ U8 range_mark = UTF_TO_NATIVE(0xff);
+ sv_catpvn(transv, (char *)&range_mark, 1);
+ }
t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
- sv_catpvn(transv, "\377", 1);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
t = (U8*)SvPVX(transv);
tlen = SvCUR(transv);
if (tfirst > tlast) {
tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0);
t += ulen;
- if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */
+ if (t < tend && NATIVE_TO_UTF(*t) == 0xff) { /* illegal utf8 val indicates range */
t++;
tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0);
t += ulen;
if (r < rend) {
rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0);
r += ulen;
- if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */
+ if (r < rend && NATIVE_TO_UTF(*r) == 0xff) { /* illegal utf8 val indicates range */
r++;
rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0);
r += ulen;
ST(0) = (SV*)XSANY.any_ptr;
XSRETURN(1);
}
+
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
- /* ender is a Unicode value so it can be > 0xff --
- * in other words, do not use UTF8_IS_CONTINUED(). */
- else if (NATIVE_TO_ASCII(ender) >= 0x80 && UTF) {
+ else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
reguni(pRExC_state, ender, s, &numlen);
s += numlen;
len += numlen;
}
break;
}
- /* ender is a Unicode value so it can be > 0xff --
- * in other words, do not use UTF8_IS_CONTINUED(). */
- if (NATIVE_TO_ASCII(ender) >= 0x80 && UTF) {
+ if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
reguni(pRExC_state, ender, s, &numlen);
s += numlen;
len += numlen - 1;
U8 s[UTF8_MAXLEN+1];
for (i = 0; i <= 256; i++) { /* just the first 256 */
- U8 *e = uvuni_to_utf8(s, i);
+ U8 *e = uvchr_to_utf8(s, i);
if (i < 256 && swash_fetch(sw, s)) {
if (rangestart == -1)
if (i <= rangestart + 3)
for (; rangestart < i; rangestart++) {
- for(e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+ for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
put_byte(sv, *p);
}
else {
- for (e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+ for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
put_byte(sv, *p);
sv_catpv(sv, "-");
- for (e = uvuni_to_utf8(s, i - 1), p = s; p < e; p++)
+ for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
put_byte(sv, *p);
}
rangestart = -1;
else {
U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
- tmp = (I32)utf8n_to_uvuni(r, s - (char*)r, 0, 0);
+ tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
}
tmp = ((OP(c) == BOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
else {
U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
- tmp = (I32)utf8n_to_uvuni(r, s - (char*)r, 0, 0);
+ tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
}
tmp = ((OP(c) == NBOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
else {
if (prog->reganch & ROPT_UTF8 && do_utf8) {
U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
- PL_regprev = utf8n_to_uvuni(s, (U8*)stringarg - s, NULL, 0);
+ PL_regprev = utf8n_to_uvchr(s, (U8*)stringarg - s, NULL, 0);
}
else
PL_regprev = (U32)stringarg[-1];
if (l >= PL_regeol) {
sayNO;
}
- if ((UTF ? utf8n_to_uvuni((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
+ if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
(c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
sayNO;
s += UTF ? UTF8SKIP(s) : 1;
else {
U8 *r = reghop((U8*)locinput, -1);
- ln = utf8n_to_uvuni(r, s - (char*)r, 0, 0);
+ ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
}
if (OP(scan) == BOUND || OP(scan) == NBOUND) {
ln = isALNUM_uni(ln);
len = 0;
while (s < send) {
STRLEN n;
- /* We can use low level directly here as we are not looking at the values */
- if (utf8n_to_uvuni(s, UTF8SKIP(s), &n, 0)) {
+ /* Call utf8n_to_uvchr() to validate the sequence */
+ utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0);
+ if (n > 0) {
s += n;
len++;
}
#define XFAKEBRACK 128
#define XENUMMASK 127
-/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/
-#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || PL_hints & HINT_UTF8)
+#ifdef EBCDIC
+/* For now 'use utf8' does not affect tokenizer on EBCDIC */
+#define UTF (PL_linestr && DO_UTF8(PL_linestr))
+#else
+#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
+#endif
/* In variables name $^X, these are the legal values for X.
* 1999-02-27 mjd-perl-patch@plover.com */
register char *d = SvPVX(sv); /* destination for copies */
bool dorange = FALSE; /* are we in a translit range? */
bool didrange = FALSE; /* did we just finish a range? */
- bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr));
- /* the constant is UTF8 */
+ I32 has_utf8 = FALSE; /* Output constant is UTF8 */
+ I32 this_utf8 = UTF; /* The source string is assumed to be UTF8 */
UV uv;
- I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
- ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
- : UTF;
- I32 this_utf8 = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
- ? (PL_sublex_info.sub_op->op_private & (PL_lex_repl ?
- OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF))
- : UTF;
const char *leaveit = /* set of acceptably-backslashed characters */
PL_lex_inpat
? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
: "";
+ if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
+ /* If we are doing a trans and we know we want UTF8 set expectation */
+ has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
+ this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
+ }
+
+
while (s < send || dorange) {
/* get transliterations out of the way (they're most literal) */
if (PL_lex_inwhat == OP_TRANS) {
I32 min; /* first character in range */
I32 max; /* last character in range */
- if (utf) {
+ if (has_utf8) {
char *c = (char*)utf8_hop((U8*)d, -1);
char *e = d++;
while (e-- > c)
*(e + 1) = *e;
- *c = (char)0xff;
+ *c = UTF_TO_NATIVE(0xff);
/* mark the range as done, and continue */
dorange = FALSE;
didrange = TRUE;
continue;
}
+
i = d - SvPVX(sv); /* remember current offset */
SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
d = SvPVX(sv) + i; /* refresh d after realloc */
if (didrange) {
Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
}
- if (utf) {
- *d++ = (char)0xff; /* use illegal utf8 byte--see pmtrans */
+ if (has_utf8) {
+ *d++ = UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
s++;
continue;
}
break; /* in regexp, $ might be tail anchor */
}
+ /* End of else if chain - OP_TRANS rejoin rest */
+
/* backslashes */
if (*s == '\\' && s+1 < send) {
s++;
PL_sublex_info.sub_op->op_private |=
(PL_lex_repl ? OPpTRANS_FROM_UTF
: OPpTRANS_TO_UTF);
- utf = TRUE;
}
}
else {
} /* end if (backslash) */
default_action:
- if (!UTF8_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
- STRLEN len = (STRLEN) -1;
- UV uv;
- if (this_utf8) {
- uv = utf8n_to_uvchr((U8*)s, send - s, &len, 0);
- }
- if (len == (STRLEN)-1) {
- /* Illegal UTF8 (a high-bit byte), make it valid. */
- char *old_pvx = SvPVX(sv);
- /* need space for one extra char (NOTE: SvCUR() not set here) */
- d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
- d = (char*)uvchr_to_utf8((U8*)d, (U8)*s++);
- }
- else {
- while (len--)
- *d++ = *s++;
- }
- has_utf8 = TRUE;
- if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
- PL_sublex_info.sub_op->op_private |=
- (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
- utf = TRUE;
- }
- continue;
- }
- *d++ = NATIVE_TO_NEED(has_utf8,*s++);
+ /* If we started with encoded form, or already know we want it
+ and then encode the next character */
+ if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) {
+ STRLEN len = 1;
+ UV uv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
+ STRLEN need = UNISKIP(NATIVE_TO_UNI(uv));
+ s += len;
+ if (need > len) {
+ /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */
+ STRLEN off = d - SvPVX(sv);
+ d = SvGROW(sv, SvLEN(sv) + (need-len)) + off;
+ }
+ d = (char*)uvchr_to_utf8((U8*)d, uv);
+ has_utf8 = TRUE;
+ }
+ else {
+ *d++ = NATIVE_TO_NEED(has_utf8,*s++);
+ }
} /* while loop to process each character */
/* terminate the string and set up the sv */
*d = '\0';
SvCUR_set(sv, d - SvPVX(sv));
+ if (SvCUR(sv) >= SvLEN(sv))
+ Perl_croak(aTHX_ "panic:constant overflowed allocated space");
+
SvPOK_on(sv);
- if (has_utf8)
+ if (has_utf8) {
SvUTF8_on(sv);
+ if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
+ PL_sublex_info.sub_op->op_private |=
+ (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
+ }
+ }
/* shrink the sv if we allocated more than we used */
if (SvCUR(sv) + 5 < SvLEN(sv)) {
#endif /* Loop style */
}
-/*
-=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
-
-Adds the UTF8 representation of the Native codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
-
- d = uvchr_to_utf8(d, uv);
-
-is the recommended wide native character-aware way of saying
-
- *(d++) = uv;
-
-=cut
-*/
-
-U8 *
-Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
-{
- return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
-}
/*
}
/*
-=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
-
-Returns the native character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Allows length and flags to be passed to low level routine.
-
-=cut
-*/
-
-UV
-Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
-{
- UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
- return UNI_TO_NATIVE(uv);
-}
-
-/*
=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
Returns the native character value of the first character in the string C<s>
Perl_is_uni_alnum(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_alnum(tmpbuf);
}
Perl_is_uni_alnumc(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_alnumc(tmpbuf);
}
Perl_is_uni_idfirst(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_idfirst(tmpbuf);
}
Perl_is_uni_alpha(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_alpha(tmpbuf);
}
Perl_is_uni_ascii(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_ascii(tmpbuf);
}
Perl_is_uni_space(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_space(tmpbuf);
}
Perl_is_uni_digit(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_digit(tmpbuf);
}
Perl_is_uni_upper(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_upper(tmpbuf);
}
Perl_is_uni_lower(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_lower(tmpbuf);
}
Perl_is_uni_cntrl(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_cntrl(tmpbuf);
}
Perl_is_uni_graph(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_graph(tmpbuf);
}
Perl_is_uni_print(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_print(tmpbuf);
}
Perl_is_uni_punct(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_punct(tmpbuf);
}
Perl_is_uni_xdigit(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_xdigit(tmpbuf);
}
Perl_to_uni_upper(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return to_utf8_upper(tmpbuf);
}
Perl_to_uni_title(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return to_utf8_title(tmpbuf);
}
Perl_to_uni_lower(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return to_utf8_lower(tmpbuf);
}
/* If not cached, generate it via utf8::SWASHGET */
if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
dSP;
+ /* We use utf8n_to_uvuni() as we want an index into
+ Unicode tables, not a native character number.
+ */
+ UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
ENTER;
SAVETMPS;
save_re_context();
PUSHMARK(SP);
EXTEND(SP,3);
PUSHs((SV*)sv);
- /* We call utf8_to_uni as we want and index into Unicode tables,
- not a native character number.
- */
- PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
+ PUSHs(sv_2mortal(newSViv(code_point & ~(needents - 1))));
PUSHs(sv_2mortal(newSViv(needents)));
PUTBACK;
if (call_method("SWASHGET", G_SCALAR))
Perl_croak(aTHX_ "panic: swash_fetch");
return 0;
}
+
+
+/*
+=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
+
+Adds the UTF8 representation of the Native codepoint C<uv> to the end
+of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
+
+ d = uvchr_to_utf8(d, uv);
+
+is the recommended wide native character-aware way of saying
+
+ *(d++) = uv;
+
+=cut
+*/
+
+/* On ASCII machines this is normally a macro but we want a
+ real function in case XS code wants it
+*/
+#undef Perl_uvchr_to_utf8
+U8 *
+Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
+{
+ return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
+}
+
+
+/*
+=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
+
+Returns the native character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Allows length and flags to be passed to low level routine.
+
+=cut
+*/
+/* On ASCII machines this is normally a macro but we want a
+ real function in case XS code wants it
+*/
+#undef Perl_utf8n_to_uvchr
+UV
+Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
+{
+ UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
+ return UNI_TO_NATIVE(uv);
+}
+
+
#define NATIVE_TO_NEED(enc,ch) (ch)
#define ASCII_TO_NEED(enc,ch) (ch)
+/* As there are no translations avoid the function wrapper */
+#define Perl_utf8n_to_uvchr Perl_utf8n_to_uvuni
+#define Perl_uvchr_to_utf8 Perl_uvuni_to_utf8
+
/*
The following table is from Unicode 3.1.
#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
#define UTF8_IS_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) != 0xA0)
-#define UTF8_IS_CONTINUATION(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) == 0xA0)
+#define UTF8_IS_CONTINUATION(c) ((NATIVE_TO_UTF(c) & 0xE0) == 0xA0)
#define UTF8_IS_CONTINUED(c) (NATIVE_TO_UTF(c) >= 0xA0)
#define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xF8) == 0xC0)