Make using U+FDD0..U+FDEF (noncharacters since Unicode 3.1),

diff --git a/embed.h b/embed.h

index a748737..fd65d07 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -755,6 +755,8 @@
 #define utf8n_to_uvuni         Perl_utf8n_to_uvuni
 #define uvchr_to_utf8          Perl_uvchr_to_utf8
 #define uvuni_to_utf8          Perl_uvuni_to_utf8
+#define uvchr_to_utf8_flags    Perl_uvchr_to_utf8_flags
+#define uvuni_to_utf8_flags    Perl_uvuni_to_utf8_flags
 #define pv_uni_display         Perl_pv_uni_display
 #define sv_uni_display         Perl_sv_uni_display
 #define vivify_defelem         Perl_vivify_defelem
@@ -2274,6 +2276,8 @@
 #define utf8n_to_uvuni(a,b,c,d)        Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
 #define uvchr_to_utf8(a,b)     Perl_uvchr_to_utf8(aTHX_ a,b)
 #define uvuni_to_utf8(a,b)     Perl_uvuni_to_utf8(aTHX_ a,b)
+#define uvchr_to_utf8_flags(a,b,c)     Perl_uvchr_to_utf8_flags(aTHX_ a,b,c)
+#define uvuni_to_utf8_flags(a,b,c)     Perl_uvuni_to_utf8_flags(aTHX_ a,b,c)
 #define pv_uni_display(a,b,c,d,e)      Perl_pv_uni_display(aTHX_ a,b,c,d,e)
 #define sv_uni_display(a,b,c,d)        Perl_sv_uni_display(aTHX_ a,b,c,d)
 #define vivify_defelem(a)      Perl_vivify_defelem(aTHX_ a)
diff --git a/embed.pl b/embed.pl

index 74fd9a5..adbfcc3 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -1853,7 +1853,9 @@ Apd       |UV     |utf8_to_uvuni  |U8 *s|STRLEN* retlen
 Adp    |UV     |utf8n_to_uvchr |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
 Adp    |UV     |utf8n_to_uvuni |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
 Apd    |U8*    |uvchr_to_utf8  |U8 *d|UV uv
-Apd    |U8*    |uvuni_to_utf8  |U8 *d|UV uv
+Ap     |U8*    |uvuni_to_utf8  |U8 *d|UV uv
+Ap     |U8*    |uvchr_to_utf8_flags    |U8 *d|UV uv|UV flags
+Apd    |U8*    |uvuni_to_utf8_flags    |U8 *d|UV uv|UV flags
 Apd    |char*  |pv_uni_display |SV *dsv|U8 *spv|STRLEN len \
                                |STRLEN pvlim|UV flags
 Apd    |char*  |sv_uni_display |SV *dsv|SV *ssv|STRLEN pvlim|UV flags
diff --git a/global.sym b/global.sym

index b2a9225..c19e004 100644 (file)
--- a/global.sym
+++ b/global.sym
@@ -157,6 +157,10 @@ Perl_ibcmp_utf8
 Perl_init_stacks
 Perl_init_tm
 Perl_instr
+Perl_is_lvalue_sub
+Perl_to_uni_upper_lc
+Perl_to_uni_title_lc
+Perl_to_uni_lower_lc
 Perl_is_uni_alnum
 Perl_is_uni_alnumc
 Perl_is_uni_idfirst
@@ -496,6 +500,8 @@ Perl_utf8n_to_uvchr
 Perl_utf8n_to_uvuni
 Perl_uvchr_to_utf8
 Perl_uvuni_to_utf8
+Perl_uvchr_to_utf8_flags
+Perl_uvuni_to_utf8_flags
 Perl_pv_uni_display
 Perl_sv_uni_display
 Perl_warn
diff --git a/op.c b/op.c

index a35c919..9b1556e 100644 (file)
--- a/op.c
+++ b/op.c
@@ -2866,7 +2866,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                U8 range_mark = UTF_TO_NATIVE(0xff);
                sv_catpvn(transv, (char *)&range_mark, 1);
            }
-           t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
+           t = uvuni_to_utf8_flags(tmpbuf, 0x7fffffff,
+                                   UNICODE_ALLOW_SUPER);
            sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
            t = (U8*)SvPVX(transv);
            tlen = SvCUR(transv);
diff --git a/pod/perlapi.pod b/pod/perlapi.pod

index 7bdf75c..397f52b 100644 (file)
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -1573,8 +1573,8 @@ Found in file handy.h
 
 Returns a pointer to the next character after the parsed
 vstring, as well as updating the passed in sv.
- * 
-Function must be called like 
+ *
+Function must be called like
        
         sv = NEWSV(92,5);
        s = new_vstring(s,sv);
@@ -4453,20 +4453,28 @@ is the recommended wide native character-aware way of saying
 =for hackers
 Found in file utf8.c
 
-=item uvuni_to_utf8
+=item uvuni_to_utf8_flags
 
 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
 bytes available. The return value is the pointer to the byte after the
 end of the new character. In other words,
 
+    d = uvuni_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
     d = uvuni_to_utf8(d, uv);
 
+(which is equivalent to)
+
+    d = uvuni_to_utf8_flags(d, uv, 0);
+
 is the recommended Unicode-aware way of saying
 
     *(d++) = uv;
 
-       U8*     uvuni_to_utf8(U8 *d, UV uv)
+       U8*     uvuni_to_utf8_flags(U8 *d, UV uv, UV flags)
 
 =for hackers
 Found in file utf8.c
diff --git a/pp.c b/pp.c

index 0ddfefe..eb386ee 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -2258,7 +2258,7 @@ PP(pp_complement)
              while (tmps < send) {
                  UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
                  tmps += UTF8SKIP(tmps);
-                 result = uvchr_to_utf8(result, ~c);
+                 result = uvchr_to_utf8_flags(result, ~c, UNICODE_ALLOW_ANY);
              }
              *result = '\0';
              result -= targlen;
@@ -3148,7 +3148,8 @@ PP(pp_chr)
 
     if (value > 255 && !IN_BYTES) {
        SvGROW(TARG, UNISKIP(value)+1);
-       tmps = (char*)uvchr_to_utf8((U8*)SvPVX(TARG), value);
+       tmps = (char*)uvchr_to_utf8_flags((U8*)SvPVX(TARG), value,
+                                         UNICODE_ALLOW_SUPER);
        SvCUR_set(TARG, tmps - SvPVX(TARG));
        *tmps = '\0';
        (void)SvPOK_only(TARG);
diff --git a/proto.h b/proto.h

index 33e8b82..b6ed287 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -832,6 +832,8 @@ PERL_CALLCONV UV    Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen,
 PERL_CALLCONV UV       Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
 PERL_CALLCONV U8*      Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
 PERL_CALLCONV U8*      Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv);
+PERL_CALLCONV U8*      Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
+PERL_CALLCONV U8*      Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
 PERL_CALLCONV char*    Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags);
 PERL_CALLCONV char*    Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags);
 PERL_CALLCONV void     Perl_vivify_defelem(pTHX_ SV* sv);
diff --git a/t/op/each.t b/t/op/each.t

index 556479e..8212264 100755 (executable)
--- a/t/op/each.t
+++ b/t/op/each.t
@@ -135,7 +135,7 @@ ok ($i == 5);
 # Check for Unicode hash keys.
 %u = ("\x{12}", "f", "\x{123}", "fo", "\x{1234}",  "foo");
 $u{"\x{12345}"}  = "bar";
-@u{"\x{123456}"} = "zap";
+@u{"\x{10FFFD}"} = "zap";
 
 my %u2;
 foreach (keys %u) {
diff --git a/t/op/pat.t b/t/op/pat.t

index 6b4b061..077b957 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -1618,9 +1618,9 @@ EOT
 {
     # from Robin Houston
 
-    my $x = "\x{12345678}";
+    my $x = "\x{10FFFD}";
     $x =~ s/(.)/$1/g;
-    print "not " unless ord($x) == 0x12345678 && length($x) == 1;
+    print "not " unless ord($x) == 0x10FFFD && length($x) == 1;
     print "ok 587\n";
 }
 
diff --git a/t/op/qq.t b/t/op/qq.t

index 651cf18..d883169 100644 (file)
--- a/t/op/qq.t
+++ b/t/op/qq.t
@@ -60,4 +60,4 @@ is ("\x{000000000000000000000000000000000000000000000000000000000000000072}",
     chr 114);
 is ("\x{0_06_5}", chr 101);
 is ("\x{1234}", chr 4660);
-is ("\x{98765432}", chr 2557891634);
+is ("\x{10FFFD}", chr 1114109);
diff --git a/utf8.c b/utf8.c

index 81af397..debfb9c 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -27,15 +27,23 @@
 /* Unicode support */
 
 /*
-=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
+=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
 
 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
 bytes available. The return value is the pointer to the byte after the
 end of the new character. In other words,
 
+    d = uvuni_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
     d = uvuni_to_utf8(d, uv);
 
+(which is equivalent to)
+
+    d = uvuni_to_utf8_flags(d, uv, 0);
+
 is the recommended Unicode-aware way of saying
 
     *(d++) = uv;
@@ -44,13 +52,26 @@ is the recommended Unicode-aware way of saying
 */
 
 U8 *
-Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 {
     if (ckWARN_d(WARN_UTF8)) {
-        if (UNICODE_IS_SURROGATE(uv))
+        if (UNICODE_IS_SURROGATE(uv) &&
+            !(flags & UNICODE_ALLOW_SURROGATE))
              Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
-        else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
-                 (uv == 0xFFFE || uv == 0xFFFF))
+        else if (
+                 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
+                   !(flags & UNICODE_ALLOW_FDD0))
+                  ||
+                  ((uv & 0xFFFF) == 0xFFFE &&
+                   !(flags & UNICODE_ALLOW_FFFE))
+                  ||
+                  ((uv & 0xFFFF) == 0xFFFF &&
+                   !(flags & UNICODE_ALLOW_FFFF))) &&
+                 /* UNICODE_ALLOW_SUPER includes
+                  * FFFEs and FFFFs beyond 0x10FFFF. */
+                 ((uv <= PERL_UNICODE_MAX) ||
+                  !(flags & UNICODE_ALLOW_SUPER))
+                 )
              Perl_warner(aTHX_ WARN_UTF8,
                         "Unicode character 0x%04"UVxf" is illegal", uv);
     }
@@ -138,7 +159,12 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 #endif
 #endif /* Loop style */
 }
-
+ 
+U8 *
+Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
+}
 
 
 /*
@@ -1544,9 +1570,14 @@ is the recommended wide native character-aware way of saying
 U8 *
 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 {
-    return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
+    return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
 }
 
+U8 *
+Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+{
+    return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
+}
 
 /*
 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
diff --git a/utf8.h b/utf8.h

index 1c2243e..b35cfeb 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -166,6 +166,17 @@ END_EXTERN_C
 #define UNICODE_BYTER_ORDER_MARK       0xfffe
 #define UNICODE_ILLEGAL                        0xffff
 
+/* Though our UTF-8 encoding can go beyond this,
+ * let's be conservative. */
+#define PERL_UNICODE_MAX       0x10FFFF
+
+#define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */
+#define UNICODE_ALLOW_FDD0     0x0002  /* Allow the U+FDD0...U+FDEF */
+#define UNICODE_ALLOW_FFFE     0x0004  /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_FFFF     0x0008  /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_SUPER    0x0010  /* Allow past 10xFFFF */
+#define UNICODE_ALLOW_ANY      0xFFFF
+
 #define UNICODE_IS_SURROGATE(c)                ((c) >= UNICODE_SURROGATE_FIRST && \
                                         (c) <= UNICODE_SURROGATE_LAST)
 #define UNICODE_IS_REPLACEMENT(c)      ((c) == UNICODE_REPLACEMENT)
embed.h		patch \| blob \| blame \| history
embed.pl		patch \| blob \| blame \| history
global.sym		patch \| blob \| blame \| history
op.c		patch \| blob \| blame \| history
pod/perlapi.pod		patch \| blob \| blame \| history
pp.c		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
t/op/each.t		patch \| blob \| blame \| history
t/op/pat.t		patch \| blob \| blame \| history
t/op/qq.t		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history
utf8.h		patch \| blob \| blame \| history