{
dVAR;
dSP;
- SV *sv = TOPs;
+ SV *source = TOPs;
STRLEN len;
+ STRLEN min;
+ SV *dest;
+ const U8 *s;
+ U8 *d;
- SvGETMAGIC(sv);
- if (DO_UTF8(sv)) {
+ SvGETMAGIC(source);
+
+ if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source)
+ && !DO_UTF8(source)) {
+ /* We can convert in place. */
+
+ dest = source;
+ s = d = (U8*)SvPV_force_nomg(source, len);
+ min = len + 1;
+ } else {
dTARGET;
- const U8 *s;
- STRLEN ulen;
- register U8 *d;
- const U8 *send;
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
- s = (const U8*)SvPV_nomg_const(sv,len);
- if (!len) {
- SvUTF8_off(TARG); /* decontaminate */
- sv_setpvn(TARG, "", 0);
- sv = TARG;
- SETs(sv);
+ dest = TARG;
+
+ /* The old implementation would copy source into TARG at this point.
+ This had the side effect that if source was undef, TARG was now
+ an undefined SV with PADTMP set, and they don't warn inside
+ sv_2pv_flags(). However, we're now getting the PV direct from
+ source, which doesn't have PADTMP set, so it would warn. Hence the
+ little games. */
+
+ if (SvOK(source)) {
+ s = (const U8*)SvPV_nomg_const(source, len);
+ } else {
+ s = "";
+ len = 0;
}
- else {
- STRLEN min = len + 1;
+ min = len + 1;
- SvUPGRADE(TARG, SVt_PV);
- SvGROW(TARG, min);
- (void)SvPOK_only(TARG);
- d = (U8*)SvPVX(TARG);
- send = s + len;
- while (s < send) {
- const STRLEN u = UTF8SKIP(s);
- const UV uv = toLOWER_utf8(s, tmpbuf, &ulen);
+ SvUPGRADE(dest, SVt_PV);
+ d = SvGROW(dest, min);
+ (void)SvPOK_only(dest);
-#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */
- if (uv == GREEK_CAPITAL_LETTER_SIGMA) {
- NOOP;
- /*
- * Now if the sigma is NOT followed by
- * /$ignorable_sequence$cased_letter/;
- * and it IS preceded by
- * /$cased_letter$ignorable_sequence/;
- * where $ignorable_sequence is
- * [\x{2010}\x{AD}\p{Mn}]*
- * and $cased_letter is
- * [\p{Ll}\p{Lo}\p{Lt}]
- * then it should be mapped to 0x03C2,
- * (GREEK SMALL LETTER FINAL SIGMA),
- * instead of staying 0x03A3.
- * "should be": in other words,
- * this is not implemented yet.
- * See lib/unicore/SpecialCasing.txt.
- */
- }
- if (ulen > u && (SvLEN(TARG) < (min += ulen - u))) {
- /* If the eventually required minimum size outgrows
- * the available space, we need to grow. */
- const UV o = d - (U8*)SvPVX_const(TARG);
+ SETs(dest);
+ }
- /* If someone lowercases one million U+0130s we
- * SvGROW() one million times. Or we could try
- * guessing how much to allocate without allocating.
- * too much. Such is life. */
- SvGROW(TARG, min);
- d = (U8*)SvPVX(TARG) + o;
- }
- Copy(tmpbuf, d, ulen, U8);
- d += ulen;
- s += u;
+ /* Overloaded values may have toggled the UTF-8 flag on source, so we need
+ to check DO_UTF8 again here. */
+
+ if (DO_UTF8(source)) {
+ const U8 *const send = s + len;
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+
+ while (s < send) {
+ const STRLEN u = UTF8SKIP(s);
+ STRLEN ulen;
+ const UV uv = toLOWER_utf8(s, tmpbuf, &ulen);
+
+#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */
+ if (uv == GREEK_CAPITAL_LETTER_SIGMA) {
+ NOOP;
+ /*
+ * Now if the sigma is NOT followed by
+ * /$ignorable_sequence$cased_letter/;
+ * and it IS preceded by /$cased_letter$ignorable_sequence/;
+ * where $ignorable_sequence is [\x{2010}\x{AD}\p{Mn}]*
+ * and $cased_letter is [\p{Ll}\p{Lo}\p{Lt}]
+ * then it should be mapped to 0x03C2,
+ * (GREEK SMALL LETTER FINAL SIGMA),
+ * instead of staying 0x03A3.
+ * "should be": in other words, this is not implemented yet.
+ * See lib/unicore/SpecialCasing.txt.
+ */
}
- *d = '\0';
- SvUTF8_on(TARG);
- SvCUR_set(TARG, d - (U8*)SvPVX_const(TARG));
- sv = TARG;
- SETs(sv);
- }
- }
- else {
- U8 *s;
- if (!SvPADTMP(sv) || SvREADONLY(sv)) {
- dTARGET;
- SvUTF8_off(TARG); /* decontaminate */
- sv_setsv_nomg(TARG, sv);
- sv = TARG;
- SETs(sv);
+ if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
+ /* If the eventually required minimum size outgrows
+ * the available space, we need to grow. */
+ const UV o = d - (U8*)SvPVX_const(dest);
+
+ /* If someone lowercases one million U+0130s we SvGROW() one
+ * million times. Or we could try guessing how much to
+ allocate without allocating too much. Such is life. */
+ SvGROW(dest, min);
+ d = (U8*)SvPVX(dest) + o;
+ }
+ Copy(tmpbuf, d, ulen, U8);
+ d += ulen;
+ s += u;
}
-
- s = (U8*)SvPV_force_nomg(sv, len);
+ SvUTF8_on(dest);
+ *d = '\0';
+ SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+ } else {
if (len) {
- register const U8 * const send = s + len;
-
+ const U8 *const send = s + len;
if (IN_LOCALE_RUNTIME) {
TAINT;
- SvTAINTED_on(sv);
- for (; s < send; s++)
- *s = toLOWER_LC(*s);
+ SvTAINTED_on(dest);
+ for (; s < send; d++, s++)
+ *d = toLOWER_LC(*s);
}
else {
- for (; s < send; s++)
- *s = toLOWER(*s);
+ for (; s < send; d++, s++)
+ *d = toLOWER(*s);
}
}
+ if (source != dest) {
+ *d = '\0';
+ SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+ }
}
- SvSETMAGIC(sv);
+ SvSETMAGIC(dest);
RETURN;
}
}
}
-use Test::More tests => 8;
+use Test::More tests => 12;
-package UTF8Field;
+package UTF8Toggle;
use strict;
use overload '""' => 'stringify';
foreach my $t ("ASCII", "B\366se") {
my $length = length $t;
- my $u = UTF8Field->new($t);
+ my $u = UTF8Toggle->new($t);
is (length $u, $length, "length of '$t'");
is (length $u, $length, "length of '$t'");
is (length $u, $length, "length of '$t'");
is (length $u, $length, "length of '$t'");
}
+
+my $have_setlocale = 0;
+eval {
+ require POSIX;
+ import POSIX ':locale_h';
+ $have_setlocale++;
+};
+
+SKIP: {
+ if (!$have_setlocale) {
+ skip "No setlocale", 4;
+ } elsif (!setlocale(&POSIX::LC_ALL, "en_GB.ISO8859-1")) {
+ skip "Could not setlocale to en_GB.ISO8859-1", 4;
+ } else {
+ use locale;
+ my $u = UTF8Toggle->new("\311");
+ my $lc = lc $u;
+ is (length $lc, 1);
+ is ($lc, "\351", "E accute -> e accute");
+ $lc = lc $u;
+ is (length $lc, 1);
+ is ($lc, "\351", "E accute -> e accute");
+ }
+}