/* FALL THROUGH */
default:
if (PerlIO_isutf8(fp)) {
- tmps = SvPVutf8(sv, len);
- }
- else {
- if (DO_UTF8(sv))
- sv_utf8_downgrade(sv, FALSE);
- tmps = SvPV(sv, len);
+ if (!SvUTF8(sv))
+ sv_utf8_upgrade(sv = sv_mortalcopy(sv));
}
+ else if (DO_UTF8(sv))
+ sv_utf8_downgrade((sv = sv_mortalcopy(sv)), FALSE);
+ tmps = SvPV(sv, len);
break;
}
/* To detect whether the process is about to overstep its
#define utf8_distance Perl_utf8_distance
#define utf8_hop Perl_utf8_hop
#define utf8_to_bytes Perl_utf8_to_bytes
+#define bytes_from_utf8 Perl_bytes_from_utf8
#define bytes_to_utf8 Perl_bytes_to_utf8
#define utf8_to_uv_simple Perl_utf8_to_uv_simple
#define utf8_to_uv Perl_utf8_to_uv
#define utf8_distance(a,b) Perl_utf8_distance(aTHX_ a,b)
#define utf8_hop(a,b) Perl_utf8_hop(aTHX_ a,b)
#define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
+#define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c)
#define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
#define utf8_to_uv_simple(a,b) Perl_utf8_to_uv_simple(aTHX_ a,b)
#define utf8_to_uv(a,b,c,d) Perl_utf8_to_uv(aTHX_ a,b,c,d)
#define utf8_hop Perl_utf8_hop
#define Perl_utf8_to_bytes CPerlObj::Perl_utf8_to_bytes
#define utf8_to_bytes Perl_utf8_to_bytes
+#define Perl_bytes_from_utf8 CPerlObj::Perl_bytes_from_utf8
+#define bytes_from_utf8 Perl_bytes_from_utf8
#define Perl_bytes_to_utf8 CPerlObj::Perl_bytes_to_utf8
#define bytes_to_utf8 Perl_bytes_to_utf8
#define Perl_utf8_to_uv_simple CPerlObj::Perl_utf8_to_uv_simple
Apd |IV |utf8_distance |U8 *a|U8 *b
Apd |U8* |utf8_hop |U8 *s|I32 off
ApMd |U8* |utf8_to_bytes |U8 *s|STRLEN *len
+ApMd |U8* |bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
ApMd |U8* |bytes_to_utf8 |U8 *s|STRLEN *len
Apd |UV |utf8_to_uv_simple|U8 *s|STRLEN* retlen
Adp |UV |utf8_to_uv |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
Perl_utf8_distance
Perl_utf8_hop
Perl_utf8_to_bytes
+Perl_bytes_from_utf8
Perl_bytes_to_utf8
Perl_utf8_to_uv_simple
Perl_utf8_to_uv
register HE *entry;
SV *sv;
bool is_utf8 = FALSE;
+ const char *keysave = key;
if (!hv)
return 0;
return 0;
}
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8);
+
PERL_HASH(hash, key, klen);
entry = ((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max];
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
return &HeVAL(entry);
}
#ifdef DYNAMIC_ENV_FETCH /* %ENV lookup? If so, try to fetch the value now */
if (env) {
sv = newSVpvn(env,len);
SvTAINTED_on(sv);
+ if (key != keysave)
+ Safefree(key);
return hv_store(hv,key,klen,sv,hash);
}
}
#endif
if (lval) { /* gonna assign to this, so it better be there */
sv = NEWSV(61,0);
- return hv_store(hv,key,is_utf8?-klen:klen,sv,hash);
+ if (key != keysave) { /* must be is_utf8 == 0 */
+ SV **ret = hv_store(hv,key,klen,sv,hash);
+ Safefree(key);
+ return ret;
+ }
+ else
+ return hv_store(hv,key,is_utf8?-klen:klen,sv,hash);
}
+ if (key != keysave)
+ Safefree(key);
return 0;
}
register HE *entry;
SV *sv;
bool is_utf8;
+ char *keysave;
if (!hv)
return 0;
return 0;
}
- key = SvPV(keysv, klen);
+ keysave = key = SvPV(keysv, klen);
is_utf8 = (SvUTF8(keysv)!=0);
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
+
if (!hash)
PERL_HASH(hash, key, klen);
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
return entry;
}
#ifdef DYNAMIC_ENV_FETCH /* %ENV lookup? If so, try to fetch the value now */
}
}
#endif
+ if (key != keysave)
+ Safefree(key);
if (lval) { /* gonna assign to this, so it better be there */
sv = NEWSV(61,0);
return hv_store_ent(hv,keysv,sv,hash);
register HE *entry;
register HE **oentry;
bool is_utf8 = FALSE;
+ const char *keysave = key;
if (!hv)
return 0;
#endif
}
}
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8);
+
if (!hash)
PERL_HASH(hash, key, klen);
continue;
SvREFCNT_dec(HeVAL(entry));
HeVAL(entry) = val;
+ if (key != keysave)
+ Safefree(key);
return &HeVAL(entry);
}
HeKEY_hek(entry) = share_hek(key, is_utf8?-klen:klen, hash);
else /* gotta do the real thing */
HeKEY_hek(entry) = save_hek(key, is_utf8?-klen:klen, hash);
+ if (key != keysave)
+ Safefree(key);
HeVAL(entry) = val;
HeNEXT(entry) = *oentry;
*oentry = entry;
register HE *entry;
register HE **oentry;
bool is_utf8;
+ char *keysave;
if (!hv)
return 0;
}
}
- key = SvPV(keysv, klen);
+ keysave = key = SvPV(keysv, klen);
is_utf8 = (SvUTF8(keysv) != 0);
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
+
if (!hash)
PERL_HASH(hash, key, klen);
continue;
SvREFCNT_dec(HeVAL(entry));
HeVAL(entry) = val;
+ if (key != keysave)
+ Safefree(key);
return entry;
}
HeKEY_hek(entry) = share_hek(key, is_utf8?-klen:klen, hash);
else /* gotta do the real thing */
HeKEY_hek(entry) = save_hek(key, is_utf8?-klen:klen, hash);
+ if (key != keysave)
+ Safefree(key);
HeVAL(entry) = val;
HeNEXT(entry) = *oentry;
*oentry = entry;
SV **svp;
SV *sv;
bool is_utf8 = FALSE;
+ const char *keysave = key;
if (!hv)
return Nullsv;
if (!xhv->xhv_array)
return Nullsv;
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8);
+
PERL_HASH(hash, key, klen);
oentry = &((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max];
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
*oentry = HeNEXT(entry);
if (i && !*oentry)
xhv->xhv_fill--;
--xhv->xhv_keys;
return sv;
}
+ if (key != keysave)
+ Safefree(key);
return Nullsv;
}
register HE **oentry;
SV *sv;
bool is_utf8;
+ char *keysave;
if (!hv)
return Nullsv;
if (!xhv->xhv_array)
return Nullsv;
- key = SvPV(keysv, klen);
+ keysave = key = SvPV(keysv, klen);
is_utf8 = (SvUTF8(keysv) != 0);
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
+
if (!hash)
PERL_HASH(hash, key, klen);
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
*oentry = HeNEXT(entry);
if (i && !*oentry)
xhv->xhv_fill--;
--xhv->xhv_keys;
return sv;
}
+ if (key != keysave)
+ Safefree(key);
return Nullsv;
}
register HE *entry;
SV *sv;
bool is_utf8 = FALSE;
+ const char *keysave = key;
if (!hv)
return 0;
return 0;
#endif
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8);
+
PERL_HASH(hash, key, klen);
#ifdef DYNAMIC_ENV_FETCH
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
return TRUE;
}
#ifdef DYNAMIC_ENV_FETCH /* is it out there? */
}
}
#endif
+ if (key != keysave)
+ Safefree(key);
return FALSE;
}
register HE *entry;
SV *sv;
bool is_utf8;
+ char *keysave;
if (!hv)
return 0;
return 0;
#endif
- key = SvPV(keysv, klen);
+ keysave = key = SvPV(keysv, klen);
is_utf8 = (SvUTF8(keysv) != 0);
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
if (!hash)
PERL_HASH(hash, key, klen);
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
return TRUE;
}
#ifdef DYNAMIC_ENV_FETCH /* is it out there? */
}
}
#endif
+ if (key != keysave)
+ Safefree(key);
return FALSE;
}
register I32 i = 1;
I32 found = 0;
bool is_utf8 = FALSE;
+ const char *save = str;
if (len < 0) {
len = -len;
is_utf8 = TRUE;
+ if (!(PL_hints & HINT_UTF8_DISTINCT))
+ str = (char*)bytes_from_utf8((U8*)str, (STRLEN*)&len, &is_utf8);
}
/* what follows is the moral equivalent of:
break;
}
UNLOCK_STRTAB_MUTEX;
-
+ if (str != save)
+ Safefree(str);
if (!found && ckWARN_d(WARN_INTERNAL))
Perl_warner(aTHX_ WARN_INTERNAL, "Attempt to free non-existent shared string '%s'",str);
}
register I32 i = 1;
I32 found = 0;
bool is_utf8 = FALSE;
+ const char *save = str;
if (len < 0) {
len = -len;
is_utf8 = TRUE;
+ if (!(PL_hints & HINT_UTF8_DISTINCT))
+ str = (char*)bytes_from_utf8((U8*)str, (STRLEN*)&len, &is_utf8);
}
/* what follows is the moral equivalent of:
++HeVAL(entry); /* use value slot as REFCNT */
UNLOCK_STRTAB_MUTEX;
+ if (str != save)
+ Safefree(str);
return HeKEY_hek(entry);
}
-
-
-
#define Perl_utf8_to_bytes pPerl->Perl_utf8_to_bytes
#undef utf8_to_bytes
#define utf8_to_bytes Perl_utf8_to_bytes
+#undef Perl_bytes_from_utf8
+#define Perl_bytes_from_utf8 pPerl->Perl_bytes_from_utf8
+#undef bytes_from_utf8
+#define bytes_from_utf8 Perl_bytes_from_utf8
#undef Perl_bytes_to_utf8
#define Perl_bytes_to_utf8 pPerl->Perl_bytes_to_utf8
#undef bytes_to_utf8
return ((CPerlObj*)pPerl)->Perl_utf8_to_bytes(s, len);
}
+#undef Perl_bytes_from_utf8
+U8*
+Perl_bytes_from_utf8(pTHXo_ U8 *s, STRLEN *len, bool *is_utf8)
+{
+ return ((CPerlObj*)pPerl)->Perl_bytes_from_utf8(s, len, is_utf8);
+}
+
#undef Perl_bytes_to_utf8
U8*
Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len)
=for hackers
Found in file av.c
+=item bytes_from_utf8
+
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
+the newly-created string, and updates C<len> to contain the new length.
+Returns the original string if no conversion occurs, C<len> and
+C<is_utf8> are unchanged. Do nothing if C<is_utf8> points to 0. Sets
+C<is_utf8> to 0 if C<s> is converted or malformed .
+
+NOTE: this function is experimental and may change or be
+removed without notice.
+
+ U8* bytes_from_utf8(U8 *s, STRLEN *len, bool *is_utf8)
+
+=for hackers
+Found in file utf8.c
+
=item bytes_to_utf8
Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
=for hackers
Found in file sv.h
-=item svtype
+=item SvTYPE
-An enum of flags for Perl types. These are found in the file B<sv.h>
-in the C<svtype> enum. Test these flags with the C<SvTYPE> macro.
+Returns the type of the SV. See C<svtype>.
+
+ svtype SvTYPE(SV* sv)
=for hackers
Found in file sv.h
-=item SvTYPE
-
-Returns the type of the SV. See C<svtype>.
+=item svtype
- svtype SvTYPE(SV* sv)
+An enum of flags for Perl types. These are found in the file B<sv.h>
+in the C<svtype> enum. Test these flags with the C<SvTYPE> macro.
=for hackers
Found in file sv.h
PERL_CALLCONV IV Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off);
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
+PERL_CALLCONV U8* Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8);
PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV UV Perl_utf8_to_uv_simple(pTHX_ U8 *s, STRLEN* retlen);
PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
/* do not utf8ize the comparands as a side-effect */
if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) {
+ bool is_utf8 = TRUE;
+
if (PL_hints & HINT_UTF8_DISTINCT)
return FALSE;
if (SvUTF8(sv1)) {
- (void)utf8_to_bytes((U8*)(pv1 = savepvn(pv1, cur1)), &cur1);
- {
- IV scur1 = cur1;
- if (scur1 < 0) {
- Safefree(pv1);
- return 0;
- }
- }
- pv1tmp = TRUE;
+ char *pv = bytes_from_utf8((U8*)pv1, &cur1, &is_utf8);
+ if (is_utf8)
+ return 0;
+ pv1tmp = (pv != pv1);
+ pv1 = pv;
}
else {
- (void)utf8_to_bytes((U8*)(pv2 = savepvn(pv2, cur2)), &cur2);
- {
- IV scur2 = cur2;
- if (scur2 < 0) {
- Safefree(pv2);
- return 0;
- }
- }
- pv2tmp = TRUE;
+ char *pv = bytes_from_utf8((U8*)pv2, &cur2, &is_utf8);
+ if (is_utf8)
+ return 0;
+ pv2tmp = (pv != pv2);
+ pv2 = pv;
}
}
len = -len;
is_utf8 = TRUE;
}
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ src = (char*)bytes_from_utf8((U8*)src, (STRLEN*)&len, &is_utf8);
if (!hash)
PERL_HASH(hash, src, len);
new_SV(sv);
print "ok 17\n";
}
-print F $b,"\n"; # This upgrades $b!
+print F $b,"\n"; # Don't upgrades $b
{ # Check byte length of $b
use bytes; my $y = length($b);
-print "not " unless $y == 2;
+print "not ($y) " unless $y == 1;
print "ok 18\n";
}
{ my $x = tell(F);
{ use bytes; $y += 3;}
- print "not " unless $x == $y;
+ print "not ($x,$y) " unless $x == $y;
print "ok 19\n";
}
open F, "a" or die $!; # Not UTF
$x = <F>;
chomp($x);
-print "not " unless $x eq v196.172.194.130;
+printf "not (%vd) ", $x unless $x eq v196.172.194.130;
print "ok 20\n";
open F, "<:utf8", "a" or die $!;
$x = <F>;
chomp($x);
close F;
-print "not " unless $x eq chr(300).chr(130);
+printf "not (%vd) ", $x unless $x eq chr(300).chr(130);
print "ok 21\n";
# Now let's make it suffer.
}
$| = 1;
-print "1..13\n";
+print "1..14\n";
use charnames ':full';
$encoded_bet = "\327\221";
sub to_bytes {
- use bytes;
- "".shift;
+ pack"a*", shift;
}
{
print "ok 13\n";
}
+{
+ use charnames qw(:full);
+ use utf8;
+ print "not " unless "\x{100}\N{CENT SIGN}" eq "\x{100}"."\N{CENT SIGN}";
+ print "ok 14\n";
+}
+
#!./perl
-print "1..24\n";
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = '.';
+ push @INC, '../lib';
+}
+
+print "1..25\n";
$h{'abc'} = 'ABC';
$h{'def'} = 'DEF';
print "ok 23\n";
print "#$u{$_}\n" for keys %u; # Used to core dump before change #8056.
print "ok 24\n";
+
+%u = (qu"\xe3\x81\x82" => "downglade");
+for (keys %u) {
+ use bytes;
+ print "not " if length ne 3 or $_ ne "\xe3\x81\x82";
+ print "ok 25\n";
+}
@INC = '../lib';
}
-print "1..49\n";
+print "1..51\n";
$_ = "abcdefghijklmnopqrstuvwxyz";
print "not " unless sprintf("%vd", $a) eq '196.172.200';
print "ok 49\n";
+# UTF8 range
+
+($a = v300.196.172.302.197.172) =~ tr/\x{12c}-\x{130}/\xc0-\xc4/;
+print "not " unless $a eq v192.196.172.194.197.172;
+print "ok 50\n";
+
+($a = v300.196.172.302.197.172) =~ tr/\xc4-\xc8/\x{12c}-\x{130}/;
+print "not " unless $a eq v300.300.172.302.301.172;
+print "ok 51\n";
}
}
-print "1..106\n";
+print "1..107\n";
my $test = 1;
print "ok $test\n";
$test++; # 106
}
+
+{
+ use utf8;
+
+ my $w = 0;
+ local $SIG{__WARN__} = sub { print "#($_[0])\n"; $w++ };
+ my $x = eval q/"\\/ . "\x{100}" . q/"/;;
+
+ print "not " unless $w == 0 && $x eq "\x{100}";
+ print "ok $test\n";
+ $test++; # 107
+}
+
"Unrecognized escape \\%c passed through",
*s);
/* default action is to copy the quoted character */
- *d++ = *s++;
- continue;
+ goto default_action;
}
/* \132 indicates an octal constant */
if (has_utf8 || uv > 255) {
d = (char*)uv_to_utf8((U8*)d, uv);
has_utf8 = TRUE;
+ if (PL_lex_inwhat == OP_TRANS &&
+ PL_sublex_info.sub_op) {
+ PL_sublex_info.sub_op->op_private |=
+ (PL_lex_repl ? OPpTRANS_FROM_UTF
+ : OPpTRANS_TO_UTF);
+ utf = TRUE;
+ }
}
else {
*d++ = (char)uv;
res = newSVpvn(s + 1, e - s - 1);
res = new_constant( Nullch, 0, "charnames",
res, Nullsv, "\\N{...}" );
+ if (has_utf8)
+ sv_utf8_upgrade(res);
str = SvPV(res,len);
if (!has_utf8 && SvUTF8(res)) {
char *ostart = SvPVX(sv);
continue;
} /* end if (backslash) */
- /* (now in tr/// code again) */
-
+ default_action:
if (UTF8_IS_CONTINUED(*s) && (this_utf8 || has_utf8)) {
STRLEN len = (STRLEN) -1;
UV uv;
*d++ = *s++;
}
has_utf8 = TRUE;
+ if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
+ PL_sublex_info.sub_op->op_private |=
+ (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
+ utf = TRUE;
+ }
continue;
}
- *d++ = *s++;
+ *d++ = *s++;
} /* while loop to process each character */
/* terminate the string and set up the sv */
case KEY_qq:
case KEY_qu:
s = scan_str(s,FALSE,FALSE);
- if (tmp == KEY_qu && is_utf8_string((U8*)s, SvCUR(PL_lex_stuff)))
+ if (tmp == KEY_qu &&
+ is_utf8_string((U8*)SvPVX(PL_lex_stuff), SvCUR(PL_lex_stuff)))
SvUTF8_on(PL_lex_stuff);
if (!s)
missingterm((char*)0);
}
/*
+=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
+
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
+the newly-created string, and updates C<len> to contain the new length.
+Returns the original string if no conversion occurs, C<len> and
+C<is_utf8> are unchanged. Do nothing if C<is_utf8> points to 0. Sets
+C<is_utf8> to 0 if C<s> is converted or malformed .
+
+=cut */
+
+U8 *
+Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
+{
+ U8 *send;
+ U8 *d;
+ U8 *start = s;
+ I32 count = 0;
+
+ if (!*is_utf8)
+ return start;
+
+ /* ensure valid UTF8 and chars < 256 before updating string */
+ for (send = s + *len; s < send;) {
+ U8 c = *s++;
+ if (!UTF8_IS_ASCII(c)) {
+ if (UTF8_IS_CONTINUATION(c) || s >= send ||
+ !UTF8_IS_CONTINUATION(*s)) {
+ *is_utf8 = 0;
+ return start;
+ }
+ if ((c & 0xfc) != 0xc0)
+ return start;
+ s++, count++;
+ }
+ }
+
+ *is_utf8 = 0;
+
+ if (!count)
+ return start;
+
+ Newz(801, d, (*len) - count + 1, U8);
+ d = s = start;
+ while (s < send) {
+ U8 c = *s++;
+ if (UTF8_IS_ASCII(c))
+ *d++ = c;
+ else
+ *d++ = UTF8_ACCUMULATE(c&3, *s++);
+ }
+ *d = '\0';
+ *len = d - start;
+ return start;
+}
+
+/*
=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.