From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Tue, 2 Apr 2002 20:35:13 +0000 (+0000)
Subject: What started as a small nit (the charnames test, nit found
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=872c91ae155f6880f8bf2b15c143bda5279a5794;p=p5sagit%2Fp5-mst-13.2.git

What started as a small nit (the charnames test, nit found
be Hugo), ballooned a bit... the goal is Larry's wish that
illegal Unicode (such as U+FFFF) by default doesn't warn,
since what if somebody WANTS to create illegal Unicode?
Now getting close to this in the regex runtime.
(Also, fix more of my fixation that BOM would be U+FFFE.)

p4raw-id: //depot/perl@15689
---

diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm
index 8522a79..4344616 100644
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -137,24 +137,26 @@ sub parseEntry
   # get element
   my($e, $k) = split /;/, $line;
   my @e = _getHexArray($e);
-  $ele = pack('U*', @e);
+  { no warnings 'utf8'; $ele = pack('U*', @e); }
   return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 
   # get sort key
-  if(
+  { no warnings 'utf8';
+    if(
      defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
-  )
-  {
-     $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
-  }
-  else
-  {
-    foreach my $arr ($k =~ /\[(\S+)\]/g) {
-      my $var = $arr =~ /\*/;
-      push @key, $self->altCE( $var, _getHexArray($arr) );
+       )
+    {
+	$self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
+    }
+    else
+    {
+	foreach my $arr ($k =~ /\[(\S+)\]/g) {
+	    my $var = $arr =~ /\*/;
+	    push @key, $self->altCE( $var, _getHexArray($arr) );
+	}
+	$self->{entries}{$ele} = \@key;
     }
-    $self->{entries}{$ele} = \@key;
   }
   $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 }
diff --git a/lib/charnames.t b/lib/charnames.t
index 1beecf3..3123127 100644
--- a/lib/charnames.t
+++ b/lib/charnames.t
@@ -12,7 +12,7 @@ BEGIN {
 
 $| = 1;
 
-print "1..38\n";
+print "1..39\n";
 
 use charnames ':full';
 
@@ -220,3 +220,9 @@ print "ok 33\n";
 print "not " unless charnames::viacode(0xFEFF) eq "ZERO WIDTH NO-BREAK SPACE";
 print "ok 38\n";
 
+{
+    use warnings;
+    print "not " unless ord("\N{BOM}") == 0xFEFF;
+    print "ok 39\n";
+}
+
diff --git a/pp.c b/pp.c
index 757b4f0..fbe4737 100644
--- a/pp.c
+++ b/pp.c
@@ -3241,7 +3241,9 @@ PP(pp_ord)
         argsv = tmpsv;
     }
 
-    XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff));
+    XPUSHu(DO_UTF8(argsv) ?
+	   utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) :
+	   (*s & 0xff));
 
     RETURN;
 }
diff --git a/pp_pack.c b/pp_pack.c
index 1c5ee31..452a2b0 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -770,7 +770,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char *
 	    if (checksum) {
 		while (len-- > 0 && s < strend) {
 		    STRLEN alen;
-		    auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0));
+		    auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV));
 		    along = alen;
 		    s += along;
 		    if (checksum > bits_in_uv)
@@ -784,7 +784,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char *
 		EXTEND_MORTAL(len);
 		while (len-- > 0 && s < strend) {
 		    STRLEN alen;
-		    auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0));
+		    auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV));
 		    along = alen;
 		    s += along;
 		    sv = NEWSV(37, 0);
diff --git a/regexec.c b/regexec.c
index 8db2dc2..29b8704 100644
--- a/regexec.c
+++ b/regexec.c
@@ -999,8 +999,10 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
 		to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
 
-		c1 = utf8_to_uvchr(tmpbuf1, 0);
-		c2 = utf8_to_uvchr(tmpbuf2, 0);
+		c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, 
+				    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+		c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+				    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 	    }
 	    else {
 		c1 = *(U8*)m;
@@ -1037,7 +1039,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		
 		if (c1 == c2) {
 		    while (s <= e) {
-		        c = utf8_to_uvchr((U8*)s, &len);
+		        c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY);
 			if ( c == c1
 			     && (ln == len ||
 				 ibcmp_utf8(s, (char **)0, 0,  do_utf8,
@@ -1062,7 +1066,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		}
 		else {
 		    while (s <= e) {
-		        c = utf8_to_uvchr((U8*)s, &len);
+		      c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY);
 
 			/* Handle some of the three Greek sigmas cases.
 			 * Note that not all the possible combinations
@@ -2390,7 +2396,9 @@ S_regmatch(pTHX_ regnode *prog)
 			if (l >= PL_regeol)
 			     sayNO;
 			if (NATIVE_TO_UNI(*(U8*)s) !=
-			    utf8_to_uvuni((U8*)l, &ulen))
+			    utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY))
 			     sayNO;
 			l += ulen;
 			s ++;
@@ -2402,7 +2410,9 @@ S_regmatch(pTHX_ regnode *prog)
 			if (l >= PL_regeol)
 			    sayNO;
 			if (NATIVE_TO_UNI(*((U8*)l)) !=
-			    utf8_to_uvuni((U8*)s, &ulen))
+			    utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY))
 			    sayNO;
 			s += ulen;
 			l ++;
@@ -3545,11 +3555,17 @@ S_regmatch(pTHX_ regnode *prog)
 			     to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
 			     to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
 
-			     c1 = utf8_to_uvuni(tmpbuf1, 0);
-			     c2 = utf8_to_uvuni(tmpbuf2, 0);
+			     c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+						 ckWARN(WARN_UTF8) ?
+						 0 : UTF8_ALLOW_ANY);
+			     c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+						 ckWARN(WARN_UTF8) ?
+						 0 : UTF8_ALLOW_ANY);
 			}
 			else {
-			    c2 = c1 = utf8_to_uvchr(s, NULL);
+			    c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+						     ckWARN(WARN_UTF8) ?
+						     0 : UTF8_ALLOW_ANY);
 			}
 		    }
 		}
@@ -3605,16 +3621,24 @@ S_regmatch(pTHX_ regnode *prog)
 			else {
 			    STRLEN len;
 			    if (c1 == c2) {
-				/* count initialised to utf8_distance(old, locinput) */
+				/* count initialised to
+				 * utf8_distance(old, locinput) */
 				while (locinput <= e &&
-				       utf8_to_uvchr((U8*)locinput, &len) != c1) {
+				       utf8n_to_uvchr((U8*)locinput,
+						      UTF8_MAXLEN, &len,
+						      ckWARN(WARN_UTF8) ?
+						      0 : UTF8_ALLOW_ANY) != c1) {
 				    locinput += len;
 				    count++;
 				}
 			    } else {
-				/* count initialised to utf8_distance(old, locinput) */
+				/* count initialised to
+				 * utf8_distance(old, locinput) */
 				while (locinput <= e) {
-				    UV c = utf8_to_uvchr((U8*)locinput, &len);
+				    UV c = utf8n_to_uvchr((U8*)locinput,
+							  UTF8_MAXLEN, &len,
+							  ckWARN(WARN_UTF8) ?
+							  0 : UTF8_ALLOW_ANY);
 				    if (c == c1 || c == c2)
 					break;
 				    locinput += len;
@@ -3648,7 +3672,10 @@ S_regmatch(pTHX_ regnode *prog)
 		    UV c;
 		    if (c1 != -1000) {
 			if (do_utf8)
-			    c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+			    c = utf8n_to_uvchr((U8*)PL_reginput,
+					       UTF8_MAXLEN, 0,
+					       ckWARN(WARN_UTF8) ?
+					       0 : UTF8_ALLOW_ANY);
 			else
 			    c = UCHARAT(PL_reginput);
 			/* If it could work, try it. */
@@ -3695,7 +3722,10 @@ S_regmatch(pTHX_ regnode *prog)
 		    while (n >= ln) {
 			if (c1 != -1000) {
 			    if (do_utf8)
-				c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+				c = utf8n_to_uvchr((U8*)PL_reginput,
+						   UTF8_MAXLEN, 0,
+						   ckWARN(WARN_UTF8) ?
+						   0 : UTF8_ALLOW_ANY);
 			    else
 				c = UCHARAT(PL_reginput);
 			}
@@ -3715,7 +3745,10 @@ S_regmatch(pTHX_ regnode *prog)
 		    while (n >= ln) {
 			if (c1 != -1000) {
 			    if (do_utf8)
-				c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+				c = utf8n_to_uvchr((U8*)PL_reginput,
+						   UTF8_MAXLEN, 0,
+						   ckWARN(WARN_UTF8) ?
+						   0 : UTF8_ALLOW_ANY);
 			    else
 				c = UCHARAT(PL_reginput);
 			}
@@ -4297,7 +4330,8 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
     STRLEN len = 0;
     STRLEN plen;
 
-    c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
+    c = do_utf8 ? utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+				 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) : *p;
 
     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
     if (do_utf8 || (flags & ANYOF_UNICODE)) {
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8
index 747436a..5cd0e05 100644
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -39,7 +39,6 @@ my $d800  = chr(0xD800);
 my $dfff  = chr(0xDFFF);
 my $e000  = chr(0xE000);
 my $fffd  = chr(0xFFFD);
-my $fffe  = chr(0xFFFE);
 my $ffff  = chr(0xFFFF);
 my $hex4  = chr(0x10000);
 my $hex5  = chr(0x100000);
@@ -50,7 +49,6 @@ my $d800  = chr(0xD800);
 my $dfff  = chr(0xDFFF);
 my $e000  = chr(0xE000);
 my $fffd  = chr(0xFFFD);
-my $fffe  = chr(0xFFFE);
 my $ffff  = chr(0xFFFF);
 my $hex4  = chr(0x10000);
 my $hex5  = chr(0x100000);
@@ -58,9 +56,8 @@ my $max   = chr(0x10FFFF);
 EXPECT
 UTF-16 surrogate 0xd800 at - line 3.
 UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
 ########
 use warnings 'utf8';
 my $d7ff  = pack("U", 0xD7FF);
@@ -68,7 +65,6 @@ my $d800  = pack("U", 0xD800);
 my $dfff  = pack("U", 0xDFFF);
 my $e000  = pack("U", 0xE000);
 my $fffd  = pack("U", 0xFFFD);
-my $fffe  = pack("U", 0xFFFE);
 my $ffff  = pack("U", 0xFFFF);
 my $hex4  = pack("U", 0x10000);
 my $hex5  = pack("U", 0x100000);
@@ -79,7 +75,6 @@ my $d800  = pack("U", 0xD800);
 my $dfff  = pack("U", 0xDFFF);
 my $e000  = pack("U", 0xE000);
 my $fffd  = pack("U", 0xFFFD);
-my $fffe  = pack("U", 0xFFFE);
 my $ffff  = pack("U", 0xFFFF);
 my $hex4  = pack("U", 0x10000);
 my $hex5  = pack("U", 0x100000);
@@ -87,9 +82,8 @@ my $max   = pack("U", 0x10FFFF);
 EXPECT
 UTF-16 surrogate 0xd800 at - line 3.
 UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
 ########
 use warnings 'utf8';
 my $d7ff  = "\x{D7FF}";
@@ -97,7 +91,6 @@ my $d800  = "\x{D800}";
 my $dfff  = "\x{DFFF}";
 my $e000  = "\x{E000}";
 my $fffd  = "\x{FFFD}";
-my $fffe  = "\x{FFFE}";
 my $ffff  = "\x{FFFF}";
 my $hex4  = "\x{10000}";
 my $hex5  = "\x{100000}";
@@ -108,7 +101,6 @@ my $d800  = "\x{D800}";
 my $dfff  = "\x{DFFF}";
 my $e000  = "\x{E000}";
 my $fffd  = "\x{FFFD}";
-my $fffe  = "\x{FFFE}";
 my $ffff  = "\x{FFFF}";
 my $hex4  = "\x{10000}";
 my $hex5  = "\x{100000}";
@@ -116,6 +108,5 @@ my $max   = "\x{10FFFF}";
 EXPECT
 UTF-16 surrogate 0xd800 at - line 3.
 UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
diff --git a/utf8.c b/utf8.c
index 85a22a1..0100eb1 100644
--- a/utf8.c
+++ b/utf8.c
@@ -64,13 +64,13 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 		  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
 		    !(flags & UNICODE_ALLOW_FDD0))
 		   ||
-		   ((uv & 0xFFFF) == 0xFFFE &&
-		    !(flags & UNICODE_ALLOW_FFFE))
+		   (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
+		    !(flags & UNICODE_ALLOW_BOM))
 		   ||
 		   ((uv & 0xFFFF) == 0xFFFF &&
 		    !(flags & UNICODE_ALLOW_FFFF))) &&
 		  /* UNICODE_ALLOW_SUPER includes
-		   * FFFEs and FFFFs beyond 0x10FFFF. */
+		   * FFFFs beyond 0x10FFFF. */
 		  ((uv <= PERL_UNICODE_MAX) ||
 		   !(flags & UNICODE_ALLOW_SUPER))
 		  )
@@ -500,7 +500,8 @@ returned and retlen is set, if possible, to -1.
 UV
 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 {
-    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
+			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 }
 
 /*
@@ -523,7 +524,8 @@ UV
 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 {
     /* Call the low level routine asking for checks */
-    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
+			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 }
 
 /*
@@ -1626,7 +1628,9 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
 	    /* We use utf8n_to_uvuni() as we want an index into
 	       Unicode tables, not a native character number.
 	     */
-	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
+	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY);
 	    SV *errsv_save;
 	    ENTER;
 	    SAVETMPS;
diff --git a/utf8.h b/utf8.h
index a5312ca..3787832 100644
--- a/utf8.h
+++ b/utf8.h
@@ -188,24 +188,24 @@ encoded character.
 #define UNICODE_SURROGATE_FIRST		0xd800
 #define UNICODE_SURROGATE_LAST		0xdfff
 #define UNICODE_REPLACEMENT		0xfffd
-#define UNICODE_BYTER_ORDER_MARK	0xfffe
+#define UNICODE_BYTE_ORDER_MARK		0xfeff
 #define UNICODE_ILLEGAL			0xffff
 
 /* Though our UTF-8 encoding can go beyond this,
- * let's be conservative. */
+ * let's be conservative and do as Unicode 3.2 says. */
 #define PERL_UNICODE_MAX	0x10FFFF
 
 #define UNICODE_ALLOW_SURROGATE 0x0001	/* Allow UTF-16 surrogates (EVIL) */
 #define UNICODE_ALLOW_FDD0	0x0002	/* Allow the U+FDD0...U+FDEF */
-#define UNICODE_ALLOW_FFFE	0x0004	/* Allow 0xFFFE, 0x1FFFE, ... */
-#define UNICODE_ALLOW_FFFF	0x0008	/* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_BOM	0x0004	/* Allow 0xFEFF */
+#define UNICODE_ALLOW_FFFF	0x0008	/* Allow 0xFFFF, 0x1FFFF, ... */
 #define UNICODE_ALLOW_SUPER	0x0010	/* Allow past 10xFFFF */
 #define UNICODE_ALLOW_ANY	0xFFFF
 
 #define UNICODE_IS_SURROGATE(c)		((c) >= UNICODE_SURROGATE_FIRST && \
 					 (c) <= UNICODE_SURROGATE_LAST)
 #define UNICODE_IS_REPLACEMENT(c)	((c) == UNICODE_REPLACEMENT)
-#define UNICODE_IS_BYTE_ORDER_MARK(c)	((c) == UNICODE_BYTER_ORDER_MARK)
+#define UNICODE_IS_BYTE_ORDER_MARK(c)	((c) == UNICODE_BYTE_ORDER_MARK)
 #define UNICODE_IS_ILLEGAL(c)		((c) == UNICODE_ILLEGAL)
 
 #ifdef HAS_QUAD