From: Jarkko Hietaniemi Date: Tue, 2 Apr 2002 20:35:13 +0000 (+0000) Subject: What started as a small nit (the charnames test, nit found X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=872c91ae155f6880f8bf2b15c143bda5279a5794;p=p5sagit%2Fp5-mst-13.2.git What started as a small nit (the charnames test, nit found be Hugo), ballooned a bit... the goal is Larry's wish that illegal Unicode (such as U+FFFF) by default doesn't warn, since what if somebody WANTS to create illegal Unicode? Now getting close to this in the regex runtime. (Also, fix more of my fixation that BOM would be U+FFFE.) p4raw-id: //depot/perl@15689 --- diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index 8522a79..4344616 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -137,24 +137,26 @@ sub parseEntry # get element my($e, $k) = split /;/, $line; my @e = _getHexArray($e); - $ele = pack('U*', @e); + { no warnings 'utf8'; $ele = pack('U*', @e); } return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; # get sort key - if( + { no warnings 'utf8'; + if( defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ || defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/ - ) - { - $self->{entries}{$ele} = $self->{ignored}{$ele} = 1; - } - else - { - foreach my $arr ($k =~ /\[(\S+)\]/g) { - my $var = $arr =~ /\*/; - push @key, $self->altCE( $var, _getHexArray($arr) ); + ) + { + $self->{entries}{$ele} = $self->{ignored}{$ele} = 1; + } + else + { + foreach my $arr ($k =~ /\[(\S+)\]/g) { + my $var = $arr =~ /\*/; + push @key, $self->altCE( $var, _getHexArray($arr) ); + } + $self->{entries}{$ele} = \@key; } - $self->{entries}{$ele} = \@key; } $self->{maxlength}{ord $ele} = scalar @e if @e > 1; } diff --git a/lib/charnames.t b/lib/charnames.t index 1beecf3..3123127 100644 --- a/lib/charnames.t +++ b/lib/charnames.t @@ -12,7 +12,7 @@ BEGIN { $| = 1; -print "1..38\n"; +print "1..39\n"; use charnames ':full'; @@ -220,3 +220,9 @@ print "ok 33\n"; print "not " unless charnames::viacode(0xFEFF) eq "ZERO WIDTH NO-BREAK SPACE"; print "ok 38\n"; +{ + use warnings; + print "not " unless ord("\N{BOM}") == 0xFEFF; + print "ok 39\n"; +} + diff --git a/pp.c b/pp.c index 757b4f0..fbe4737 100644 --- a/pp.c +++ b/pp.c @@ -3241,7 +3241,9 @@ PP(pp_ord) argsv = tmpsv; } - XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff)); + XPUSHu(DO_UTF8(argsv) ? + utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) : + (*s & 0xff)); RETURN; } diff --git a/pp_pack.c b/pp_pack.c index 1c5ee31..452a2b0 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -770,7 +770,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char * if (checksum) { while (len-- > 0 && s < strend) { STRLEN alen; - auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0)); + auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV)); along = alen; s += along; if (checksum > bits_in_uv) @@ -784,7 +784,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char * EXTEND_MORTAL(len); while (len-- > 0 && s < strend) { STRLEN alen; - auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0)); + auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV)); along = alen; s += along; sv = NEWSV(37, 0); diff --git a/regexec.c b/regexec.c index 8db2dc2..29b8704 100644 --- a/regexec.c +++ b/regexec.c @@ -999,8 +999,10 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta to_utf8_lower((U8*)m, tmpbuf1, &ulen1); to_utf8_upper((U8*)m, tmpbuf2, &ulen2); - c1 = utf8_to_uvchr(tmpbuf1, 0); - c2 = utf8_to_uvchr(tmpbuf2, 0); + c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, + 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC, + 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } else { c1 = *(U8*)m; @@ -1037,7 +1039,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (c1 == c2) { while (s <= e) { - c = utf8_to_uvchr((U8*)s, &len); + c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); if ( c == c1 && (ln == len || ibcmp_utf8(s, (char **)0, 0, do_utf8, @@ -1062,7 +1066,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else { while (s <= e) { - c = utf8_to_uvchr((U8*)s, &len); + c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); /* Handle some of the three Greek sigmas cases. * Note that not all the possible combinations @@ -2390,7 +2396,9 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*(U8*)s) != - utf8_to_uvuni((U8*)l, &ulen)) + utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY)) sayNO; l += ulen; s ++; @@ -2402,7 +2410,9 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*((U8*)l)) != - utf8_to_uvuni((U8*)s, &ulen)) + utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY)) sayNO; s += ulen; l ++; @@ -3545,11 +3555,17 @@ S_regmatch(pTHX_ regnode *prog) to_utf8_lower((U8*)s, tmpbuf1, &ulen1); to_utf8_upper((U8*)s, tmpbuf2, &ulen2); - c1 = utf8_to_uvuni(tmpbuf1, 0); - c2 = utf8_to_uvuni(tmpbuf2, 0); + c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); + c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); } else { - c2 = c1 = utf8_to_uvchr(s, NULL); + c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); } } } @@ -3605,16 +3621,24 @@ S_regmatch(pTHX_ regnode *prog) else { STRLEN len; if (c1 == c2) { - /* count initialised to utf8_distance(old, locinput) */ + /* count initialised to + * utf8_distance(old, locinput) */ while (locinput <= e && - utf8_to_uvchr((U8*)locinput, &len) != c1) { + utf8n_to_uvchr((U8*)locinput, + UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY) != c1) { locinput += len; count++; } } else { - /* count initialised to utf8_distance(old, locinput) */ + /* count initialised to + * utf8_distance(old, locinput) */ while (locinput <= e) { - UV c = utf8_to_uvchr((U8*)locinput, &len); + UV c = utf8n_to_uvchr((U8*)locinput, + UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); if (c == c1 || c == c2) break; locinput += len; @@ -3648,7 +3672,10 @@ S_regmatch(pTHX_ regnode *prog) UV c; if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); /* If it could work, try it. */ @@ -3695,7 +3722,10 @@ S_regmatch(pTHX_ regnode *prog) while (n >= ln) { if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); } @@ -3715,7 +3745,10 @@ S_regmatch(pTHX_ regnode *prog) while (n >= ln) { if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); } @@ -4297,7 +4330,8 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b STRLEN len = 0; STRLEN plen; - c = do_utf8 ? utf8_to_uvchr(p, &len) : *p; + c = do_utf8 ? utf8n_to_uvchr(p, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) : *p; plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); if (do_utf8 || (flags & ANYOF_UNICODE)) { diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index 747436a..5cd0e05 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -39,7 +39,6 @@ my $d800 = chr(0xD800); my $dfff = chr(0xDFFF); my $e000 = chr(0xE000); my $fffd = chr(0xFFFD); -my $fffe = chr(0xFFFE); my $ffff = chr(0xFFFF); my $hex4 = chr(0x10000); my $hex5 = chr(0x100000); @@ -50,7 +49,6 @@ my $d800 = chr(0xD800); my $dfff = chr(0xDFFF); my $e000 = chr(0xE000); my $fffd = chr(0xFFFD); -my $fffe = chr(0xFFFE); my $ffff = chr(0xFFFF); my $hex4 = chr(0x10000); my $hex5 = chr(0x100000); @@ -58,9 +56,8 @@ my $max = chr(0x10FFFF); EXPECT UTF-16 surrogate 0xd800 at - line 3. UTF-16 surrogate 0xdfff at - line 4. -Unicode character 0xfffe is illegal at - line 7. -Unicode character 0xffff is illegal at - line 8. -Unicode character 0x10ffff is illegal at - line 11. +Unicode character 0xffff is illegal at - line 7. +Unicode character 0x10ffff is illegal at - line 10. ######## use warnings 'utf8'; my $d7ff = pack("U", 0xD7FF); @@ -68,7 +65,6 @@ my $d800 = pack("U", 0xD800); my $dfff = pack("U", 0xDFFF); my $e000 = pack("U", 0xE000); my $fffd = pack("U", 0xFFFD); -my $fffe = pack("U", 0xFFFE); my $ffff = pack("U", 0xFFFF); my $hex4 = pack("U", 0x10000); my $hex5 = pack("U", 0x100000); @@ -79,7 +75,6 @@ my $d800 = pack("U", 0xD800); my $dfff = pack("U", 0xDFFF); my $e000 = pack("U", 0xE000); my $fffd = pack("U", 0xFFFD); -my $fffe = pack("U", 0xFFFE); my $ffff = pack("U", 0xFFFF); my $hex4 = pack("U", 0x10000); my $hex5 = pack("U", 0x100000); @@ -87,9 +82,8 @@ my $max = pack("U", 0x10FFFF); EXPECT UTF-16 surrogate 0xd800 at - line 3. UTF-16 surrogate 0xdfff at - line 4. -Unicode character 0xfffe is illegal at - line 7. -Unicode character 0xffff is illegal at - line 8. -Unicode character 0x10ffff is illegal at - line 11. +Unicode character 0xffff is illegal at - line 7. +Unicode character 0x10ffff is illegal at - line 10. ######## use warnings 'utf8'; my $d7ff = "\x{D7FF}"; @@ -97,7 +91,6 @@ my $d800 = "\x{D800}"; my $dfff = "\x{DFFF}"; my $e000 = "\x{E000}"; my $fffd = "\x{FFFD}"; -my $fffe = "\x{FFFE}"; my $ffff = "\x{FFFF}"; my $hex4 = "\x{10000}"; my $hex5 = "\x{100000}"; @@ -108,7 +101,6 @@ my $d800 = "\x{D800}"; my $dfff = "\x{DFFF}"; my $e000 = "\x{E000}"; my $fffd = "\x{FFFD}"; -my $fffe = "\x{FFFE}"; my $ffff = "\x{FFFF}"; my $hex4 = "\x{10000}"; my $hex5 = "\x{100000}"; @@ -116,6 +108,5 @@ my $max = "\x{10FFFF}"; EXPECT UTF-16 surrogate 0xd800 at - line 3. UTF-16 surrogate 0xdfff at - line 4. -Unicode character 0xfffe is illegal at - line 7. -Unicode character 0xffff is illegal at - line 8. -Unicode character 0x10ffff is illegal at - line 11. +Unicode character 0xffff is illegal at - line 7. +Unicode character 0x10ffff is illegal at - line 10. diff --git a/utf8.c b/utf8.c index 85a22a1..0100eb1 100644 --- a/utf8.c +++ b/utf8.c @@ -64,13 +64,13 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) ((uv >= 0xFDD0 && uv <= 0xFDEF && !(flags & UNICODE_ALLOW_FDD0)) || - ((uv & 0xFFFF) == 0xFFFE && - !(flags & UNICODE_ALLOW_FFFE)) + (UNICODE_IS_BYTE_ORDER_MARK(uv) && + !(flags & UNICODE_ALLOW_BOM)) || ((uv & 0xFFFF) == 0xFFFF && !(flags & UNICODE_ALLOW_FFFF))) && /* UNICODE_ALLOW_SUPER includes - * FFFEs and FFFFs beyond 0x10FFFF. */ + * FFFFs beyond 0x10FFFF. */ ((uv <= PERL_UNICODE_MAX) || !(flags & UNICODE_ALLOW_SUPER)) ) @@ -500,7 +500,8 @@ returned and retlen is set, if possible, to -1. UV Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen) { - return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0); + return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } /* @@ -523,7 +524,8 @@ UV Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen) { /* Call the low level routine asking for checks */ - return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0); + return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } /* @@ -1626,7 +1628,9 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8) /* We use utf8n_to_uvuni() as we want an index into Unicode tables, not a native character number. */ - UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0); + UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); SV *errsv_save; ENTER; SAVETMPS; diff --git a/utf8.h b/utf8.h index a5312ca..3787832 100644 --- a/utf8.h +++ b/utf8.h @@ -188,24 +188,24 @@ encoded character. #define UNICODE_SURROGATE_FIRST 0xd800 #define UNICODE_SURROGATE_LAST 0xdfff #define UNICODE_REPLACEMENT 0xfffd -#define UNICODE_BYTER_ORDER_MARK 0xfffe +#define UNICODE_BYTE_ORDER_MARK 0xfeff #define UNICODE_ILLEGAL 0xffff /* Though our UTF-8 encoding can go beyond this, - * let's be conservative. */ + * let's be conservative and do as Unicode 3.2 says. */ #define PERL_UNICODE_MAX 0x10FFFF #define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */ #define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */ -#define UNICODE_ALLOW_FFFE 0x0004 /* Allow 0xFFFE, 0x1FFFE, ... */ -#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFE, 0x1FFFE, ... */ +#define UNICODE_ALLOW_BOM 0x0004 /* Allow 0xFEFF */ +#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFF, 0x1FFFF, ... */ #define UNICODE_ALLOW_SUPER 0x0010 /* Allow past 10xFFFF */ #define UNICODE_ALLOW_ANY 0xFFFF #define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \ (c) <= UNICODE_SURROGATE_LAST) #define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACEMENT) -#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTER_ORDER_MARK) +#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTE_ORDER_MARK) #define UNICODE_IS_ILLEGAL(c) ((c) == UNICODE_ILLEGAL) #ifdef HAS_QUAD