#define utf8n_to_uvuni Perl_utf8n_to_uvuni
#define uvchr_to_utf8 Perl_uvchr_to_utf8
#define uvuni_to_utf8 Perl_uvuni_to_utf8
+#define uvchr_to_utf8_flags Perl_uvchr_to_utf8_flags
+#define uvuni_to_utf8_flags Perl_uvuni_to_utf8_flags
#define pv_uni_display Perl_pv_uni_display
#define sv_uni_display Perl_sv_uni_display
#define vivify_defelem Perl_vivify_defelem
#define utf8n_to_uvuni(a,b,c,d) Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
#define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX_ a,b)
#define uvuni_to_utf8(a,b) Perl_uvuni_to_utf8(aTHX_ a,b)
+#define uvchr_to_utf8_flags(a,b,c) Perl_uvchr_to_utf8_flags(aTHX_ a,b,c)
+#define uvuni_to_utf8_flags(a,b,c) Perl_uvuni_to_utf8_flags(aTHX_ a,b,c)
#define pv_uni_display(a,b,c,d,e) Perl_pv_uni_display(aTHX_ a,b,c,d,e)
#define sv_uni_display(a,b,c,d) Perl_sv_uni_display(aTHX_ a,b,c,d)
#define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a)
Adp |UV |utf8n_to_uvchr |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
Adp |UV |utf8n_to_uvuni |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
Apd |U8* |uvchr_to_utf8 |U8 *d|UV uv
-Apd |U8* |uvuni_to_utf8 |U8 *d|UV uv
+Ap |U8* |uvuni_to_utf8 |U8 *d|UV uv
+Ap |U8* |uvchr_to_utf8_flags |U8 *d|UV uv|UV flags
+Apd |U8* |uvuni_to_utf8_flags |U8 *d|UV uv|UV flags
Apd |char* |pv_uni_display |SV *dsv|U8 *spv|STRLEN len \
|STRLEN pvlim|UV flags
Apd |char* |sv_uni_display |SV *dsv|SV *ssv|STRLEN pvlim|UV flags
Perl_init_stacks
Perl_init_tm
Perl_instr
+Perl_is_lvalue_sub
+Perl_to_uni_upper_lc
+Perl_to_uni_title_lc
+Perl_to_uni_lower_lc
Perl_is_uni_alnum
Perl_is_uni_alnumc
Perl_is_uni_idfirst
Perl_utf8n_to_uvuni
Perl_uvchr_to_utf8
Perl_uvuni_to_utf8
+Perl_uvchr_to_utf8_flags
+Perl_uvuni_to_utf8_flags
Perl_pv_uni_display
Perl_sv_uni_display
Perl_warn
U8 range_mark = UTF_TO_NATIVE(0xff);
sv_catpvn(transv, (char *)&range_mark, 1);
}
- t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
+ t = uvuni_to_utf8_flags(tmpbuf, 0x7fffffff,
+ UNICODE_ALLOW_SUPER);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
t = (U8*)SvPVX(transv);
tlen = SvCUR(transv);
Returns a pointer to the next character after the parsed
vstring, as well as updating the passed in sv.
- *
-Function must be called like
+ *
+Function must be called like
sv = NEWSV(92,5);
s = new_vstring(s,sv);
=for hackers
Found in file utf8.c
-=item uvuni_to_utf8
+=item uvuni_to_utf8_flags
Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
+ d = uvuni_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
d = uvuni_to_utf8(d, uv);
+(which is equivalent to)
+
+ d = uvuni_to_utf8_flags(d, uv, 0);
+
is the recommended Unicode-aware way of saying
*(d++) = uv;
- U8* uvuni_to_utf8(U8 *d, UV uv)
+ U8* uvuni_to_utf8_flags(U8 *d, UV uv, UV flags)
=for hackers
Found in file utf8.c
while (tmps < send) {
UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
tmps += UTF8SKIP(tmps);
- result = uvchr_to_utf8(result, ~c);
+ result = uvchr_to_utf8_flags(result, ~c, UNICODE_ALLOW_ANY);
}
*result = '\0';
result -= targlen;
if (value > 255 && !IN_BYTES) {
SvGROW(TARG, UNISKIP(value)+1);
- tmps = (char*)uvchr_to_utf8((U8*)SvPVX(TARG), value);
+ tmps = (char*)uvchr_to_utf8_flags((U8*)SvPVX(TARG), value,
+ UNICODE_ALLOW_SUPER);
SvCUR_set(TARG, tmps - SvPVX(TARG));
*tmps = '\0';
(void)SvPOK_only(TARG);
PERL_CALLCONV UV Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
PERL_CALLCONV U8* Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv);
+PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
+PERL_CALLCONV U8* Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
PERL_CALLCONV char* Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags);
PERL_CALLCONV char* Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags);
PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv);
# Check for Unicode hash keys.
%u = ("\x{12}", "f", "\x{123}", "fo", "\x{1234}", "foo");
$u{"\x{12345}"} = "bar";
-@u{"\x{123456}"} = "zap";
+@u{"\x{10FFFD}"} = "zap";
my %u2;
foreach (keys %u) {
{
# from Robin Houston
- my $x = "\x{12345678}";
+ my $x = "\x{10FFFD}";
$x =~ s/(.)/$1/g;
- print "not " unless ord($x) == 0x12345678 && length($x) == 1;
+ print "not " unless ord($x) == 0x10FFFD && length($x) == 1;
print "ok 587\n";
}
chr 114);
is ("\x{0_06_5}", chr 101);
is ("\x{1234}", chr 4660);
-is ("\x{98765432}", chr 2557891634);
+is ("\x{10FFFD}", chr 1114109);
/* Unicode support */
/*
-=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
+=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
+ d = uvuni_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
d = uvuni_to_utf8(d, uv);
+(which is equivalent to)
+
+ d = uvuni_to_utf8_flags(d, uv, 0);
+
is the recommended Unicode-aware way of saying
*(d++) = uv;
*/
U8 *
-Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
if (ckWARN_d(WARN_UTF8)) {
- if (UNICODE_IS_SURROGATE(uv))
+ if (UNICODE_IS_SURROGATE(uv) &&
+ !(flags & UNICODE_ALLOW_SURROGATE))
Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
- else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
- (uv == 0xFFFE || uv == 0xFFFF))
+ else if (
+ ((uv >= 0xFDD0 && uv <= 0xFDEF &&
+ !(flags & UNICODE_ALLOW_FDD0))
+ ||
+ ((uv & 0xFFFF) == 0xFFFE &&
+ !(flags & UNICODE_ALLOW_FFFE))
+ ||
+ ((uv & 0xFFFF) == 0xFFFF &&
+ !(flags & UNICODE_ALLOW_FFFF))) &&
+ /* UNICODE_ALLOW_SUPER includes
+ * FFFEs and FFFFs beyond 0x10FFFF. */
+ ((uv <= PERL_UNICODE_MAX) ||
+ !(flags & UNICODE_ALLOW_SUPER))
+ )
Perl_warner(aTHX_ WARN_UTF8,
"Unicode character 0x%04"UVxf" is illegal", uv);
}
#endif
#endif /* Loop style */
}
-
+
+U8 *
+Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+{
+ return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
+}
/*
U8 *
Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
{
- return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
+ return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
}
+U8 *
+Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+{
+ return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
+}
/*
=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
#define UNICODE_BYTER_ORDER_MARK 0xfffe
#define UNICODE_ILLEGAL 0xffff
+/* Though our UTF-8 encoding can go beyond this,
+ * let's be conservative. */
+#define PERL_UNICODE_MAX 0x10FFFF
+
+#define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */
+#define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */
+#define UNICODE_ALLOW_FFFE 0x0004 /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_SUPER 0x0010 /* Allow past 10xFFFF */
+#define UNICODE_ALLOW_ANY 0xFFFF
+
#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \
(c) <= UNICODE_SURROGATE_LAST)
#define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACEMENT)