Unicode support
finish byte <-> utf8 and localencoding <-> utf8 conversions
- make "$bytestr$charstr" do the right conversion
+ make substr($bytestr,0,0, $charstr) do the right conversion
add Unicode::Map equivivalent to core
add support for I/O disciplines
- open(F, "<!crlf!utf16", $file)
support C<print v1.2.3>
make C<v123> mean C<chr(123)> (if !exists(&v123))
autoload utf8_heavy.pl's swash routines in swash_init()
+ check uv_to_utf8() calls for buffer overflow
Multi-threading
support "use Thread;" under useithreads
#define sv_pv Perl_sv_pv
#define sv_pvutf8 Perl_sv_pvutf8
#define sv_pvbyte Perl_sv_pvbyte
+#define sv_utf8_upgrade Perl_sv_utf8_upgrade
+#define sv_utf8_downgrade Perl_sv_utf8_downgrade
+#define sv_utf8_encode Perl_sv_utf8_encode
+#define sv_utf8_decode Perl_sv_utf8_decode
#define sv_force_normal Perl_sv_force_normal
#define tmps_grow Perl_tmps_grow
#define sv_rvweaken Perl_sv_rvweaken
#define sv_pv(a) Perl_sv_pv(aTHX_ a)
#define sv_pvutf8(a) Perl_sv_pvutf8(aTHX_ a)
#define sv_pvbyte(a) Perl_sv_pvbyte(aTHX_ a)
+#define sv_utf8_upgrade(a) Perl_sv_utf8_upgrade(aTHX_ a)
+#define sv_utf8_downgrade(a,b) Perl_sv_utf8_downgrade(aTHX_ a,b)
+#define sv_utf8_encode(a) Perl_sv_utf8_encode(aTHX_ a)
+#define sv_utf8_decode(a) Perl_sv_utf8_decode(aTHX_ a)
#define sv_force_normal(a) Perl_sv_force_normal(aTHX_ a)
#define tmps_grow(a) Perl_tmps_grow(aTHX_ a)
#define sv_rvweaken(a) Perl_sv_rvweaken(aTHX_ a)
#define sv_pvutf8 Perl_sv_pvutf8
#define Perl_sv_pvbyte CPerlObj::Perl_sv_pvbyte
#define sv_pvbyte Perl_sv_pvbyte
+#define Perl_sv_utf8_upgrade CPerlObj::Perl_sv_utf8_upgrade
+#define sv_utf8_upgrade Perl_sv_utf8_upgrade
+#define Perl_sv_utf8_downgrade CPerlObj::Perl_sv_utf8_downgrade
+#define sv_utf8_downgrade Perl_sv_utf8_downgrade
+#define Perl_sv_utf8_encode CPerlObj::Perl_sv_utf8_encode
+#define sv_utf8_encode Perl_sv_utf8_encode
+#define Perl_sv_utf8_decode CPerlObj::Perl_sv_utf8_decode
+#define sv_utf8_decode Perl_sv_utf8_decode
#define Perl_sv_force_normal CPerlObj::Perl_sv_force_normal
#define sv_force_normal Perl_sv_force_normal
#define Perl_tmps_grow CPerlObj::Perl_tmps_grow
Ap |char* |sv_pv |SV *sv
Ap |char* |sv_pvutf8 |SV *sv
Ap |char* |sv_pvbyte |SV *sv
+Ap |void |sv_utf8_upgrade|SV *sv
+Ap |bool |sv_utf8_downgrade|SV *sv|bool fail_ok
+Ap |void |sv_utf8_encode |SV *sv
+Ap |bool |sv_utf8_decode |SV *sv
Ap |void |sv_force_normal|SV *sv
Ap |void |tmps_grow |I32 n
Ap |SV* |sv_rvweaken |SV *sv
Perl_sv_pv
Perl_sv_pvutf8
Perl_sv_pvbyte
+Perl_sv_utf8_upgrade
+Perl_sv_utf8_downgrade
+Perl_sv_utf8_encode
+Perl_sv_utf8_decode
Perl_sv_force_normal
Perl_tmps_grow
Perl_sv_rvweaken
#define Perl_sv_pvbyte pPerl->Perl_sv_pvbyte
#undef sv_pvbyte
#define sv_pvbyte Perl_sv_pvbyte
+#undef Perl_sv_utf8_upgrade
+#define Perl_sv_utf8_upgrade pPerl->Perl_sv_utf8_upgrade
+#undef sv_utf8_upgrade
+#define sv_utf8_upgrade Perl_sv_utf8_upgrade
+#undef Perl_sv_utf8_downgrade
+#define Perl_sv_utf8_downgrade pPerl->Perl_sv_utf8_downgrade
+#undef sv_utf8_downgrade
+#define sv_utf8_downgrade Perl_sv_utf8_downgrade
+#undef Perl_sv_utf8_encode
+#define Perl_sv_utf8_encode pPerl->Perl_sv_utf8_encode
+#undef sv_utf8_encode
+#define sv_utf8_encode Perl_sv_utf8_encode
+#undef Perl_sv_utf8_decode
+#define Perl_sv_utf8_decode pPerl->Perl_sv_utf8_decode
+#undef sv_utf8_decode
+#define sv_utf8_decode Perl_sv_utf8_decode
#undef Perl_sv_force_normal
#define Perl_sv_force_normal pPerl->Perl_sv_force_normal
#undef sv_force_normal
return ((CPerlObj*)pPerl)->Perl_sv_pvbyte(sv);
}
+#undef Perl_sv_utf8_upgrade
+void
+Perl_sv_utf8_upgrade(pTHXo_ SV *sv)
+{
+ ((CPerlObj*)pPerl)->Perl_sv_utf8_upgrade(sv);
+}
+
+#undef Perl_sv_utf8_downgrade
+bool
+Perl_sv_utf8_downgrade(pTHXo_ SV *sv, bool fail_ok)
+{
+ return ((CPerlObj*)pPerl)->Perl_sv_utf8_downgrade(sv, fail_ok);
+}
+
+#undef Perl_sv_utf8_encode
+void
+Perl_sv_utf8_encode(pTHXo_ SV *sv)
+{
+ ((CPerlObj*)pPerl)->Perl_sv_utf8_encode(sv);
+}
+
+#undef Perl_sv_utf8_decode
+bool
+Perl_sv_utf8_decode(pTHXo_ SV *sv)
+{
+ return ((CPerlObj*)pPerl)->Perl_sv_utf8_decode(sv);
+}
+
#undef Perl_sv_force_normal
void
Perl_sv_force_normal(pTHXo_ SV *sv)
}
}
#endif
+ if (DO_UTF8(right))
+ sv_utf8_upgrade(TARG);
sv_catpvn(TARG,s,len);
+ if (!IN_BYTE) {
+ if (SvUTF8(right))
+ SvUTF8_on(TARG);
+ }
+ else if (!SvUTF8(right)) {
+ SvUTF8_off(TARG);
+ }
}
else
sv_setpvn(TARG,s,len); /* suppress warning */
PERL_CALLCONV char* Perl_sv_pv(pTHX_ SV *sv);
PERL_CALLCONV char* Perl_sv_pvutf8(pTHX_ SV *sv);
PERL_CALLCONV char* Perl_sv_pvbyte(pTHX_ SV *sv);
+PERL_CALLCONV void Perl_sv_utf8_upgrade(pTHX_ SV *sv);
+PERL_CALLCONV bool Perl_sv_utf8_downgrade(pTHX_ SV *sv, bool fail_ok);
+PERL_CALLCONV void Perl_sv_utf8_encode(pTHX_ SV *sv);
+PERL_CALLCONV bool Perl_sv_utf8_decode(pTHX_ SV *sv);
PERL_CALLCONV void Perl_sv_force_normal(pTHX_ SV *sv);
PERL_CALLCONV void Perl_tmps_grow(pTHX_ I32 n);
PERL_CALLCONV SV* Perl_sv_rvweaken(pTHX_ SV *sv);
char *
Perl_sv_2pvbyte_nolen(pTHX_ register SV *sv)
{
- return sv_2pv_nolen(sv);
+ STRLEN n_a;
+ return sv_2pvbyte(sv, &n_a);
}
char *
char *
Perl_sv_2pvutf8_nolen(pTHX_ register SV *sv)
{
- return sv_2pv_nolen(sv);
+ STRLEN n_a;
+ return sv_2pvutf8(sv, &n_a);
}
char *
Perl_sv_2pvutf8(pTHX_ register SV *sv, STRLEN *lp)
{
+ sv_utf8_upgrade(sv);
return sv_2pv(sv,lp);
}
}
}
+void
+Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
+{
+ int hicount;
+ char *c;
+
+ if (!sv || !SvPOK(sv) || SvUTF8(sv))
+ return;
+
+ /* This function could be much more efficient if we had a FLAG
+ * to signal if there are any hibit chars in the string
+ */
+ hicount = 0;
+ for (c = SvPVX(sv); c < SvEND(sv); c++) {
+ if (*c & 0x80)
+ hicount++;
+ }
+
+ if (hicount) {
+ char *src, *dst;
+ SvGROW(sv, SvCUR(sv) + hicount + 1);
+
+ src = SvEND(sv) - 1;
+ SvCUR_set(sv, SvCUR(sv) + hicount);
+ dst = SvEND(sv) - 1;
+
+ while (src < dst) {
+ if (*src & 0x80) {
+ dst--;
+ uv_to_utf8((U8*)dst, (U8)*src--);
+ dst--;
+ }
+ else {
+ *dst-- = *src--;
+ }
+ }
+
+ SvUTF8_on(sv);
+ }
+}
+
+bool
+Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok)
+{
+ if (SvPOK(sv) && SvUTF8(sv)) {
+ char *c = SvPVX(sv);
+ char *first_hi = 0;
+ /* need to figure out if this is possible at all first */
+ while (c < SvEND(sv)) {
+ if (*c & 0x80) {
+ I32 len;
+ UV uv = utf8_to_uv(c, &len);
+ if (uv >= 256) {
+ if (fail_ok)
+ return FALSE;
+ else {
+ /* XXX might want to make a callback here instead */
+ croak("Big byte");
+ }
+ }
+ if (!first_hi)
+ first_hi = c;
+ c += len;
+ }
+ else {
+ c++;
+ }
+ }
+
+ if (first_hi) {
+ char *src = first_hi;
+ char *dst = first_hi;
+ while (src < SvEND(sv)) {
+ if (*src & 0x80) {
+ I32 len;
+ U8 u = (U8)utf8_to_uv(src, &len);
+ *dst++ = u;
+ src += len;
+ }
+ else {
+ *dst++ = *src++;
+ }
+ }
+ SvCUR_set(sv, dst - SvPVX(sv));
+ }
+ SvUTF8_off(sv);
+ }
+ return TRUE;
+}
+
+void
+Perl_sv_utf8_encode(pTHX_ register SV *sv)
+{
+ sv_utf8_upgrade(sv);
+ SvUTF8_off(sv);
+}
+
+bool
+Perl_sv_utf8_decode(pTHX_ register SV *sv)
+{
+ if (SvPOK(sv)) {
+ char *c;
+ bool has_utf = FALSE;
+ if (!sv_utf8_downgrade(sv, TRUE))
+ return FALSE;
+
+ /* it is actually just a matter of turning the utf8 flag on, but
+ * we want to make sure everything inside is valid utf8 first.
+ */
+ c = SvPVX(sv);
+ while (c < SvEND(sv)) {
+ if (*c & 0x80) {
+ I32 len;
+ (void)utf8_to_uv((U8*)c, &len);
+ if (len == 1) {
+ /* bad utf8 */
+ return FALSE;
+ }
+ c += len;
+ has_utf = TRUE;
+ }
+ else {
+ c++;
+ }
+ }
+
+ if (has_utf)
+ SvUTF8_on(sv);
+ }
+ return TRUE;
+}
+
+
/* Note: sv_setsv() should not be called with a source string that needs
* to be reused, since it may destroy the source string if it is marked
* as temporary.
STRLEN len;
if (!sstr)
return;
- if (s = SvPV(sstr, len))
+ if (s = SvPV(sstr, len)) {
+ if (SvUTF8(sstr))
+ sv_utf8_upgrade(dstr);
sv_catpvn(dstr,s,len);
- if (SvUTF8(sstr))
- SvUTF8_on(dstr);
+ if (SvUTF8(sstr))
+ SvUTF8_on(dstr);
+ }
}
/*
I32
Perl_sv_cmp(pTHX_ register SV *str1, register SV *str2)
{
- STRLEN cur1 = 0;
- char *pv1 = str1 ? SvPV(str1, cur1) : (char *) NULL;
- STRLEN cur2 = 0;
- char *pv2 = str2 ? SvPV(str2, cur2) : (char *) NULL;
+ STRLEN cur1, cur2;
+ char *pv1, *pv2;
I32 retval;
+ bool utf1;
+
+ if (str1) {
+ pv1 = SvPV(str1, cur1);
+ }
+ else {
+ cur1 = 0;
+ }
+
+ if (str2) {
+ if (SvPOK(str2)) {
+ if (SvPOK(str1) && SvUTF8(str1) != SvUTF8(str2) && !IN_BYTE) {
+ /* must upgrade other to UTF8 first */
+ if (SvUTF8(str1)) {
+ sv_utf8_upgrade(str2);
+ }
+ else {
+ sv_utf8_upgrade(str1);
+ /* refresh pointer and length */
+ pv1 = SvPVX(str1);
+ cur1 = SvCUR(str1);
+ }
+ }
+ pv2 = SvPVX(str2);
+ cur2 = SvCUR(str2);
+ }
+ else {
+ pv2 = sv_2pv(str2, &cur2);
+ }
+ }
+ else {
+ cur2 = 0;
+ }
if (!cur1)
return cur2 ? -1 : 0;
char *
Perl_sv_pvutf8(pTHX_ SV *sv)
{
+ sv_utf8_upgrade(sv);
return sv_pv(sv);
}
char *
Perl_sv_pvutf8n(pTHX_ SV *sv, STRLEN *lp)
{
+ sv_utf8_upgrade(sv);
return sv_pvn(sv,lp);
}
char *
Perl_sv_pvutf8n_force(pTHX_ SV *sv, STRLEN *lp)
{
+ sv_utf8_upgrade(sv);
return sv_pvn_force(sv,lp);
}
pos++;
if (*pos == '.' && isDIGIT(pos[1])) {
UV rev;
- U8 tmpbuf[10];
+ U8 tmpbuf[UTF8_MAXLEN];
U8 *tmpend;
NV nshift = 1.0;
bool utf8 = FALSE;
tmpbuf[0] = (U8)rev;
tmpend = &tmpbuf[1];
}
- *tmpend = '\0';
sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
if (rev > 0)
SvNVX(sv) += (NV)rev/nshift;
s = pos;
tmpend = uv_to_utf8(tmpbuf, rev);
utf8 = utf8 || rev > 127;
- *tmpend = '\0';
sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
if (rev > 0)
SvNVX(sv) += (NV)rev/nshift;
SvPOK_on(sv);
SvNOK_on(sv);
SvREADONLY_on(sv);
- if (utf8)
+ if (utf8) {
SvUTF8_on(sv);
+ sv_utf8_downgrade(sv, TRUE);
+ }
}
}
break;