U8 *
Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
{
- if (UTF8_IS_INVARIANT(uv)) {
- *d++ = uv;
+ if (UNI_IS_INVARIANT(uv)) {
+ *d++ = UTF_TO_NATIVE(uv);
return d;
}
#if defined(EBCDIC) || 1 /* always for testing */
#endif /* Loop style */
}
-/*
-=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
-
-Adds the UTF8 representation of the Native codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
-
- d = uvchr_to_utf8(d, uv);
-
-is the recommended wide native character-aware way of saying
-
- *(d++) = uv;
-
-=cut
-*/
-
-U8 *
-Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
-{
- if (uv < 0x100)
- uv = NATIVE_TO_ASCII(uv);
- return Perl_uvuni_to_utf8(aTHX_ d, uv);
-}
/*
if (UTF8_IS_INVARIANT(uv)) {
if (retlen)
*retlen = 1;
- return (UV) (*s);
+ return (UV) (NATIVE_TO_UTF(*s));
}
if (UTF8_IS_CONTINUATION(uv) &&
}
#ifdef EBCDIC
- uv = NATIVE_TO_UVF(uv);
+ uv = NATIVE_TO_UTF(uv);
#else
if ((uv == 0xfe || uv == 0xff) &&
!(flags & UTF8_ALLOW_FE_FF)) {
}
/*
-=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
-
-Returns the native character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Allows length and flags to be passed to low level routine.
-
-=cut
-*/
-
-UV
-Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
-{
- UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
- if (uv < 0x100)
- return (UV) ASCII_TO_NATIVE(uv);
- return uv;
-}
-
-/*
=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
Returns the native character value of the first character in the string C<s>
U8 t = UTF8SKIP(s);
if (e - s < t)
- Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
+ Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
s += t;
len++;
}
*is_utf8 = 0;
-#ifndef EBCDIC
- /* Can use as-is if no high chars */
- if (!count)
- return start;
-#endif
-
Newz(801, d, (*len) - count + 1, U8);
s = start; start = d;
while (s < send) {
U8 c = *s++;
- if (!UTF8_IS_INVARIANT(c))
- c = UTF8_ACCUMULATE(c, *s++);
- *d++ = ASCII_TO_NATIVE(c);
+ if (!UTF8_IS_INVARIANT(c)) {
+ /* Then it is two-byte encoded */
+ c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
+ c = ASCII_TO_NATIVE(c);
+ }
+ *d++ = c;
}
*d = '\0';
*len = d - start;
while (s < send) {
UV uv = NATIVE_TO_ASCII(*s++);
- if (UTF8_IS_INVARIANT(uv))
- *d++ = uv;
+ if (UNI_IS_INVARIANT(uv))
+ *d++ = UTF_TO_NATIVE(uv);
else {
*d++ = UTF8_EIGHT_BIT_HI(uv);
*d++ = UTF8_EIGHT_BIT_LO(uv);
Perl_is_uni_alnum(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_alnum(tmpbuf);
}
Perl_is_uni_alnumc(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_alnumc(tmpbuf);
}
Perl_is_uni_idfirst(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_idfirst(tmpbuf);
}
Perl_is_uni_alpha(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_alpha(tmpbuf);
}
Perl_is_uni_ascii(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_ascii(tmpbuf);
}
Perl_is_uni_space(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_space(tmpbuf);
}
Perl_is_uni_digit(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_digit(tmpbuf);
}
Perl_is_uni_upper(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_upper(tmpbuf);
}
Perl_is_uni_lower(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_lower(tmpbuf);
}
Perl_is_uni_cntrl(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_cntrl(tmpbuf);
}
Perl_is_uni_graph(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_graph(tmpbuf);
}
Perl_is_uni_print(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_print(tmpbuf);
}
Perl_is_uni_punct(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_punct(tmpbuf);
}
Perl_is_uni_xdigit(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return is_utf8_xdigit(tmpbuf);
}
Perl_to_uni_upper(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return to_utf8_upper(tmpbuf);
}
Perl_to_uni_title(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return to_utf8_title(tmpbuf);
}
Perl_to_uni_lower(pTHX_ U32 c)
{
U8 tmpbuf[UTF8_MAXLEN+1];
- uvuni_to_utf8(tmpbuf, (UV)c);
+ uvchr_to_utf8(tmpbuf, (UV)c);
return to_utf8_lower(tmpbuf);
}
SV* retval;
SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
dSP;
+ HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
- if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */
+ if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ENTER;
Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
LEAVE;
Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
{
HV* hv = (HV*)SvRV(sv);
+ /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
+ then the "swatch" is a vec() for al the chars which start
+ with 0xAA..0xYY
+ So the key in the hash is length of encoded char -1
+ */
U32 klen = UTF8SKIP(ptr) - 1;
- U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
+ U32 off = ptr[klen];
STRLEN slen;
- STRLEN needents = (klen ? 64 : 128);
+ STRLEN needents;
U8 *tmps;
U32 bit;
SV *retval;
+ if (klen == 0)
+ {
+ /* If char in invariant then swatch is for all the invariant chars
+ * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
+ */
+ needents = UTF_CONTINUATION_MARK;
+ off = NATIVE_TO_UTF(ptr[klen]);
+ }
+ else
+ {
+ /* If char is encoded then swatch is for the prefix */
+ needents = (1 << UTF_ACCUMULATION_SHIFT);
+ off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
+ }
+
/*
* This single-entry cache saves about 1/3 of the utf8 overhead in test
* suite. (That is, only 7-8% overall over just a hash cache. Still,
/* If not cached, generate it via utf8::SWASHGET */
if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
dSP;
+ /* We use utf8n_to_uvuni() as we want an index into
+ Unicode tables, not a native character number.
+ */
+ UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
ENTER;
SAVETMPS;
save_re_context();
PUSHMARK(SP);
EXTEND(SP,3);
PUSHs((SV*)sv);
- /* We call utf8_to_uni as we want and index into Unicode tables,
- not a native character number.
- */
- PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
+ /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
+ PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
PUSHs(sv_2mortal(newSViv(needents)));
PUTBACK;
if (call_method("SWASHGET", G_SCALAR))
svp = hv_store(hv, (char*)ptr, klen, retval, 0);
- if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
+ if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
}
Perl_croak(aTHX_ "panic: swash_fetch");
return 0;
}
+
+
+/*
+=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
+
+Adds the UTF8 representation of the Native codepoint C<uv> to the end
+of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
+
+ d = uvchr_to_utf8(d, uv);
+
+is the recommended wide native character-aware way of saying
+
+ *(d++) = uv;
+
+=cut
+*/
+
+/* On ASCII machines this is normally a macro but we want a
+ real function in case XS code wants it
+*/
+#undef Perl_uvchr_to_utf8
+U8 *
+Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
+{
+ return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
+}
+
+
+/*
+=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
+
+Returns the native character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Allows length and flags to be passed to low level routine.
+
+=cut
+*/
+/* On ASCII machines this is normally a macro but we want a
+ real function in case XS code wants it
+*/
+#undef Perl_utf8n_to_uvchr
+UV
+Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
+{
+ UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
+ return UNI_TO_NATIVE(uv);
+}
+
+