What started as a small nit (the charnames test, nit found

diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm

index 8522a79..4344616 100644 (file)
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -137,24 +137,26 @@ sub parseEntry
   # get element
   my($e, $k) = split /;/, $line;
   my @e = _getHexArray($e);
-  $ele = pack('U*', @e);
+  { no warnings 'utf8'; $ele = pack('U*', @e); }
   return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 
   # get sort key
-  if(
+  { no warnings 'utf8';
+    if(
      defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
-  )
-  {
-     $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
-  }
-  else
-  {
-    foreach my $arr ($k =~ /\[(\S+)\]/g) {
-      my $var = $arr =~ /\*/;
-      push @key, $self->altCE( $var, _getHexArray($arr) );
+       )
+    {
+       $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
+    }
+    else
+    {
+       foreach my $arr ($k =~ /\[(\S+)\]/g) {
+           my $var = $arr =~ /\*/;
+           push @key, $self->altCE( $var, _getHexArray($arr) );
+       }
+       $self->{entries}{$ele} = \@key;
     }
-    $self->{entries}{$ele} = \@key;
   }
   $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 }
diff --git a/lib/charnames.t b/lib/charnames.t

index 1beecf3..3123127 100644 (file)
--- a/lib/charnames.t
+++ b/lib/charnames.t
@@ -12,7 +12,7 @@ BEGIN {
 
 $| = 1;
 
-print "1..38\n";
+print "1..39\n";
 
 use charnames ':full';
 
@@ -220,3 +220,9 @@ print "ok 33\n";
 print "not " unless charnames::viacode(0xFEFF) eq "ZERO WIDTH NO-BREAK SPACE";
 print "ok 38\n";
 
+{
+    use warnings;
+    print "not " unless ord("\N{BOM}") == 0xFEFF;
+    print "ok 39\n";
+}
+
diff --git a/pp.c b/pp.c

index 757b4f0..fbe4737 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -3241,7 +3241,9 @@ PP(pp_ord)
         argsv = tmpsv;
     }
 
-    XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff));
+    XPUSHu(DO_UTF8(argsv) ?
+          utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) :
+          (*s & 0xff));
 
     RETURN;
 }
diff --git a/pp_pack.c b/pp_pack.c

index 1c5ee31..452a2b0 100644 (file)
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -770,7 +770,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char *
            if (checksum) {
                while (len-- > 0 && s < strend) {
                    STRLEN alen;
-                   auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0));
+                   auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV));
                    along = alen;
                    s += along;
                    if (checksum > bits_in_uv)
@@ -784,7 +784,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char *
                EXTEND_MORTAL(len);
                while (len-- > 0 && s < strend) {
                    STRLEN alen;
-                   auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0));
+                   auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV));
                    along = alen;
                    s += along;
                    sv = NEWSV(37, 0);
diff --git a/regexec.c b/regexec.c

index 8db2dc2..29b8704 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -999,8 +999,10 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
                to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
 
-               c1 = utf8_to_uvchr(tmpbuf1, 0);
-               c2 = utf8_to_uvchr(tmpbuf2, 0);
+               c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, 
+                                   0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+               c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+                                   0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
            }
            else {
                c1 = *(U8*)m;
@@ -1037,7 +1039,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                
                if (c1 == c2) {
                    while (s <= e) {
-                       c = utf8_to_uvchr((U8*)s, &len);
+                       c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+                                          ckWARN(WARN_UTF8) ?
+                                          0 : UTF8_ALLOW_ANY);
                        if ( c == c1
                             && (ln == len ||
                                 ibcmp_utf8(s, (char **)0, 0,  do_utf8,
@@ -1062,7 +1066,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                }
                else {
                    while (s <= e) {
-                       c = utf8_to_uvchr((U8*)s, &len);
+                     c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+                                          ckWARN(WARN_UTF8) ?
+                                          0 : UTF8_ALLOW_ANY);
 
                        /* Handle some of the three Greek sigmas cases.
                         * Note that not all the possible combinations
@@ -2390,7 +2396,9 @@ S_regmatch(pTHX_ regnode *prog)
                        if (l >= PL_regeol)
                             sayNO;
                        if (NATIVE_TO_UNI(*(U8*)s) !=
-                           utf8_to_uvuni((U8*)l, &ulen))
+                           utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+                                          ckWARN(WARN_UTF8) ?
+                                          0 : UTF8_ALLOW_ANY))
                             sayNO;
                        l += ulen;
                        s ++;
@@ -2402,7 +2410,9 @@ S_regmatch(pTHX_ regnode *prog)
                        if (l >= PL_regeol)
                            sayNO;
                        if (NATIVE_TO_UNI(*((U8*)l)) !=
-                           utf8_to_uvuni((U8*)s, &ulen))
+                           utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+                                          ckWARN(WARN_UTF8) ?
+                                          0 : UTF8_ALLOW_ANY))
                            sayNO;
                        s += ulen;
                        l ++;
@@ -3545,11 +3555,17 @@ S_regmatch(pTHX_ regnode *prog)
                             to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
                             to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
 
-                            c1 = utf8_to_uvuni(tmpbuf1, 0);
-                            c2 = utf8_to_uvuni(tmpbuf2, 0);
+                            c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+                                                ckWARN(WARN_UTF8) ?
+                                                0 : UTF8_ALLOW_ANY);
+                            c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+                                                ckWARN(WARN_UTF8) ?
+                                                0 : UTF8_ALLOW_ANY);
                        }
                        else {
-                           c2 = c1 = utf8_to_uvchr(s, NULL);
+                           c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+                                                    ckWARN(WARN_UTF8) ?
+                                                    0 : UTF8_ALLOW_ANY);
                        }
                    }
                }
@@ -3605,16 +3621,24 @@ S_regmatch(pTHX_ regnode *prog)
                        else {
                            STRLEN len;
                            if (c1 == c2) {
-                               /* count initialised to utf8_distance(old, locinput) */
+                               /* count initialised to
+                                * utf8_distance(old, locinput) */
                                while (locinput <= e &&
-                                      utf8_to_uvchr((U8*)locinput, &len) != c1) {
+                                      utf8n_to_uvchr((U8*)locinput,
+                                                     UTF8_MAXLEN, &len,
+                                                     ckWARN(WARN_UTF8) ?
+                                                     0 : UTF8_ALLOW_ANY) != c1) {
                                    locinput += len;
                                    count++;
                                }
                            } else {
-                               /* count initialised to utf8_distance(old, locinput) */
+                               /* count initialised to
+                                * utf8_distance(old, locinput) */
                                while (locinput <= e) {
-                                   UV c = utf8_to_uvchr((U8*)locinput, &len);
+                                   UV c = utf8n_to_uvchr((U8*)locinput,
+                                                         UTF8_MAXLEN, &len,
+                                                         ckWARN(WARN_UTF8) ?
+                                                         0 : UTF8_ALLOW_ANY);
                                    if (c == c1 || c == c2)
                                        break;
                                    locinput += len;
@@ -3648,7 +3672,10 @@ S_regmatch(pTHX_ regnode *prog)
                    UV c;
                    if (c1 != -1000) {
                        if (do_utf8)
-                           c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+                           c = utf8n_to_uvchr((U8*)PL_reginput,
+                                              UTF8_MAXLEN, 0,
+                                              ckWARN(WARN_UTF8) ?
+                                              0 : UTF8_ALLOW_ANY);
                        else
                            c = UCHARAT(PL_reginput);
                        /* If it could work, try it. */
@@ -3695,7 +3722,10 @@ S_regmatch(pTHX_ regnode *prog)
                    while (n >= ln) {
                        if (c1 != -1000) {
                            if (do_utf8)
-                               c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+                               c = utf8n_to_uvchr((U8*)PL_reginput,
+                                                  UTF8_MAXLEN, 0,
+                                                  ckWARN(WARN_UTF8) ?
+                                                  0 : UTF8_ALLOW_ANY);
                            else
                                c = UCHARAT(PL_reginput);
                        }
@@ -3715,7 +3745,10 @@ S_regmatch(pTHX_ regnode *prog)
                    while (n >= ln) {
                        if (c1 != -1000) {
                            if (do_utf8)
-                               c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+                               c = utf8n_to_uvchr((U8*)PL_reginput,
+                                                  UTF8_MAXLEN, 0,
+                                                  ckWARN(WARN_UTF8) ?
+                                                  0 : UTF8_ALLOW_ANY);
                            else
                                c = UCHARAT(PL_reginput);
                        }
@@ -4297,7 +4330,8 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
     STRLEN len = 0;
     STRLEN plen;
 
-    c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
+    c = do_utf8 ? utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) : *p;
 
     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
     if (do_utf8 || (flags & ANYOF_UNICODE)) {
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8

index 747436a..5cd0e05 100644 (file)
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -39,7 +39,6 @@ my $d800  = chr(0xD800);
 my $dfff  = chr(0xDFFF);
 my $e000  = chr(0xE000);
 my $fffd  = chr(0xFFFD);
-my $fffe  = chr(0xFFFE);
 my $ffff  = chr(0xFFFF);
 my $hex4  = chr(0x10000);
 my $hex5  = chr(0x100000);
@@ -50,7 +49,6 @@ my $d800  = chr(0xD800);
 my $dfff  = chr(0xDFFF);
 my $e000  = chr(0xE000);
 my $fffd  = chr(0xFFFD);
-my $fffe  = chr(0xFFFE);
 my $ffff  = chr(0xFFFF);
 my $hex4  = chr(0x10000);
 my $hex5  = chr(0x100000);
@@ -58,9 +56,8 @@ my $max   = chr(0x10FFFF);
 EXPECT
 UTF-16 surrogate 0xd800 at - line 3.
 UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
 ########
 use warnings 'utf8';
 my $d7ff  = pack("U", 0xD7FF);
@@ -68,7 +65,6 @@ my $d800  = pack("U", 0xD800);
 my $dfff  = pack("U", 0xDFFF);
 my $e000  = pack("U", 0xE000);
 my $fffd  = pack("U", 0xFFFD);
-my $fffe  = pack("U", 0xFFFE);
 my $ffff  = pack("U", 0xFFFF);
 my $hex4  = pack("U", 0x10000);
 my $hex5  = pack("U", 0x100000);
@@ -79,7 +75,6 @@ my $d800  = pack("U", 0xD800);
 my $dfff  = pack("U", 0xDFFF);
 my $e000  = pack("U", 0xE000);
 my $fffd  = pack("U", 0xFFFD);
-my $fffe  = pack("U", 0xFFFE);
 my $ffff  = pack("U", 0xFFFF);
 my $hex4  = pack("U", 0x10000);
 my $hex5  = pack("U", 0x100000);
@@ -87,9 +82,8 @@ my $max   = pack("U", 0x10FFFF);
 EXPECT
 UTF-16 surrogate 0xd800 at - line 3.
 UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
 ########
 use warnings 'utf8';
 my $d7ff  = "\x{D7FF}";
@@ -97,7 +91,6 @@ my $d800  = "\x{D800}";
 my $dfff  = "\x{DFFF}";
 my $e000  = "\x{E000}";
 my $fffd  = "\x{FFFD}";
-my $fffe  = "\x{FFFE}";
 my $ffff  = "\x{FFFF}";
 my $hex4  = "\x{10000}";
 my $hex5  = "\x{100000}";
@@ -108,7 +101,6 @@ my $d800  = "\x{D800}";
 my $dfff  = "\x{DFFF}";
 my $e000  = "\x{E000}";
 my $fffd  = "\x{FFFD}";
-my $fffe  = "\x{FFFE}";
 my $ffff  = "\x{FFFF}";
 my $hex4  = "\x{10000}";
 my $hex5  = "\x{100000}";
@@ -116,6 +108,5 @@ my $max   = "\x{10FFFF}";
 EXPECT
 UTF-16 surrogate 0xd800 at - line 3.
 UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
diff --git a/utf8.c b/utf8.c

index 85a22a1..0100eb1 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -64,13 +64,13 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
                  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
                    !(flags & UNICODE_ALLOW_FDD0))
                   ||
-                  ((uv & 0xFFFF) == 0xFFFE &&
-                   !(flags & UNICODE_ALLOW_FFFE))
+                  (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
+                   !(flags & UNICODE_ALLOW_BOM))
                   ||
                   ((uv & 0xFFFF) == 0xFFFF &&
                    !(flags & UNICODE_ALLOW_FFFF))) &&
                  /* UNICODE_ALLOW_SUPER includes
-                  * FFFEs and FFFFs beyond 0x10FFFF. */
+                  * FFFFs beyond 0x10FFFF. */
                  ((uv <= PERL_UNICODE_MAX) ||
                   !(flags & UNICODE_ALLOW_SUPER))
                  )
@@ -500,7 +500,8 @@ returned and retlen is set, if possible, to -1.
 UV
 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 {
-    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
+                              ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 }
 
 /*
@@ -523,7 +524,8 @@ UV
 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 {
     /* Call the low level routine asking for checks */
-    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
+                              ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 }
 
 /*
@@ -1626,7 +1628,9 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
            /* We use utf8n_to_uvuni() as we want an index into
               Unicode tables, not a native character number.
             */
-           UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
+           UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
+                                          ckWARN(WARN_UTF8) ?
+                                          0 : UTF8_ALLOW_ANY);
            SV *errsv_save;
            ENTER;
            SAVETMPS;
diff --git a/utf8.h b/utf8.h

index a5312ca..3787832 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -188,24 +188,24 @@ encoded character.
 #define UNICODE_SURROGATE_FIRST                0xd800
 #define UNICODE_SURROGATE_LAST         0xdfff
 #define UNICODE_REPLACEMENT            0xfffd
-#define UNICODE_BYTER_ORDER_MARK       0xfffe
+#define UNICODE_BYTE_ORDER_MARK                0xfeff
 #define UNICODE_ILLEGAL                        0xffff
 
 /* Though our UTF-8 encoding can go beyond this,
- * let's be conservative. */
+ * let's be conservative and do as Unicode 3.2 says. */
 #define PERL_UNICODE_MAX       0x10FFFF
 
 #define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */
 #define UNICODE_ALLOW_FDD0     0x0002  /* Allow the U+FDD0...U+FDEF */
-#define UNICODE_ALLOW_FFFE     0x0004  /* Allow 0xFFFE, 0x1FFFE, ... */
-#define UNICODE_ALLOW_FFFF     0x0008  /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_BOM      0x0004  /* Allow 0xFEFF */
+#define UNICODE_ALLOW_FFFF     0x0008  /* Allow 0xFFFF, 0x1FFFF, ... */
 #define UNICODE_ALLOW_SUPER    0x0010  /* Allow past 10xFFFF */
 #define UNICODE_ALLOW_ANY      0xFFFF
 
 #define UNICODE_IS_SURROGATE(c)                ((c) >= UNICODE_SURROGATE_FIRST && \
                                         (c) <= UNICODE_SURROGATE_LAST)
 #define UNICODE_IS_REPLACEMENT(c)      ((c) == UNICODE_REPLACEMENT)
-#define UNICODE_IS_BYTE_ORDER_MARK(c)  ((c) == UNICODE_BYTER_ORDER_MARK)
+#define UNICODE_IS_BYTE_ORDER_MARK(c)  ((c) == UNICODE_BYTE_ORDER_MARK)
 #define UNICODE_IS_ILLEGAL(c)          ((c) == UNICODE_ILLEGAL)
 
 #ifdef HAS_QUAD
lib/Unicode/Collate.pm		patch \| blob \| blame \| history
lib/charnames.t		patch \| blob \| blame \| history
pp.c		patch \| blob \| blame \| history
pp_pack.c		patch \| blob \| blame \| history
regexec.c		patch \| blob \| blame \| history
t/lib/warnings/utf8		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history
utf8.h		patch \| blob \| blame \| history