# get element
my($e, $k) = split /;/, $line;
my @e = _getHexArray($e);
- $ele = pack('U*', @e);
+ { no warnings 'utf8'; $ele = pack('U*', @e); }
return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
# get sort key
- if(
+ { no warnings 'utf8';
+ if(
defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
- )
- {
- $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
- }
- else
- {
- foreach my $arr ($k =~ /\[(\S+)\]/g) {
- my $var = $arr =~ /\*/;
- push @key, $self->altCE( $var, _getHexArray($arr) );
+ )
+ {
+ $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
+ }
+ else
+ {
+ foreach my $arr ($k =~ /\[(\S+)\]/g) {
+ my $var = $arr =~ /\*/;
+ push @key, $self->altCE( $var, _getHexArray($arr) );
+ }
+ $self->{entries}{$ele} = \@key;
}
- $self->{entries}{$ele} = \@key;
}
$self->{maxlength}{ord $ele} = scalar @e if @e > 1;
}
$| = 1;
-print "1..38\n";
+print "1..39\n";
use charnames ':full';
print "not " unless charnames::viacode(0xFEFF) eq "ZERO WIDTH NO-BREAK SPACE";
print "ok 38\n";
+{
+ use warnings;
+ print "not " unless ord("\N{BOM}") == 0xFEFF;
+ print "ok 39\n";
+}
+
argsv = tmpsv;
}
- XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff));
+ XPUSHu(DO_UTF8(argsv) ?
+ utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) :
+ (*s & 0xff));
RETURN;
}
if (checksum) {
while (len-- > 0 && s < strend) {
STRLEN alen;
- auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0));
+ auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV));
along = alen;
s += along;
if (checksum > bits_in_uv)
EXTEND_MORTAL(len);
while (len-- > 0 && s < strend) {
STRLEN alen;
- auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0));
+ auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV));
along = alen;
s += along;
sv = NEWSV(37, 0);
to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
- c1 = utf8_to_uvchr(tmpbuf1, 0);
- c2 = utf8_to_uvchr(tmpbuf2, 0);
+ c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
+ 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+ 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
else {
c1 = *(U8*)m;
if (c1 == c2) {
while (s <= e) {
- c = utf8_to_uvchr((U8*)s, &len);
+ c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
if ( c == c1
&& (ln == len ||
ibcmp_utf8(s, (char **)0, 0, do_utf8,
}
else {
while (s <= e) {
- c = utf8_to_uvchr((U8*)s, &len);
+ c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
/* Handle some of the three Greek sigmas cases.
* Note that not all the possible combinations
if (l >= PL_regeol)
sayNO;
if (NATIVE_TO_UNI(*(U8*)s) !=
- utf8_to_uvuni((U8*)l, &ulen))
+ utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY))
sayNO;
l += ulen;
s ++;
if (l >= PL_regeol)
sayNO;
if (NATIVE_TO_UNI(*((U8*)l)) !=
- utf8_to_uvuni((U8*)s, &ulen))
+ utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY))
sayNO;
s += ulen;
l ++;
to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
- c1 = utf8_to_uvuni(tmpbuf1, 0);
- c2 = utf8_to_uvuni(tmpbuf2, 0);
+ c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
+ c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
}
else {
- c2 = c1 = utf8_to_uvchr(s, NULL);
+ c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
}
}
}
else {
STRLEN len;
if (c1 == c2) {
- /* count initialised to utf8_distance(old, locinput) */
+ /* count initialised to
+ * utf8_distance(old, locinput) */
while (locinput <= e &&
- utf8_to_uvchr((U8*)locinput, &len) != c1) {
+ utf8n_to_uvchr((U8*)locinput,
+ UTF8_MAXLEN, &len,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY) != c1) {
locinput += len;
count++;
}
} else {
- /* count initialised to utf8_distance(old, locinput) */
+ /* count initialised to
+ * utf8_distance(old, locinput) */
while (locinput <= e) {
- UV c = utf8_to_uvchr((U8*)locinput, &len);
+ UV c = utf8n_to_uvchr((U8*)locinput,
+ UTF8_MAXLEN, &len,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
if (c == c1 || c == c2)
break;
locinput += len;
UV c;
if (c1 != -1000) {
if (do_utf8)
- c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+ c = utf8n_to_uvchr((U8*)PL_reginput,
+ UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
else
c = UCHARAT(PL_reginput);
/* If it could work, try it. */
while (n >= ln) {
if (c1 != -1000) {
if (do_utf8)
- c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+ c = utf8n_to_uvchr((U8*)PL_reginput,
+ UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
else
c = UCHARAT(PL_reginput);
}
while (n >= ln) {
if (c1 != -1000) {
if (do_utf8)
- c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+ c = utf8n_to_uvchr((U8*)PL_reginput,
+ UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
else
c = UCHARAT(PL_reginput);
}
STRLEN len = 0;
STRLEN plen;
- c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
+ c = do_utf8 ? utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+ ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) : *p;
plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
if (do_utf8 || (flags & ANYOF_UNICODE)) {
my $dfff = chr(0xDFFF);
my $e000 = chr(0xE000);
my $fffd = chr(0xFFFD);
-my $fffe = chr(0xFFFE);
my $ffff = chr(0xFFFF);
my $hex4 = chr(0x10000);
my $hex5 = chr(0x100000);
my $dfff = chr(0xDFFF);
my $e000 = chr(0xE000);
my $fffd = chr(0xFFFD);
-my $fffe = chr(0xFFFE);
my $ffff = chr(0xFFFF);
my $hex4 = chr(0x10000);
my $hex5 = chr(0x100000);
EXPECT
UTF-16 surrogate 0xd800 at - line 3.
UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
########
use warnings 'utf8';
my $d7ff = pack("U", 0xD7FF);
my $dfff = pack("U", 0xDFFF);
my $e000 = pack("U", 0xE000);
my $fffd = pack("U", 0xFFFD);
-my $fffe = pack("U", 0xFFFE);
my $ffff = pack("U", 0xFFFF);
my $hex4 = pack("U", 0x10000);
my $hex5 = pack("U", 0x100000);
my $dfff = pack("U", 0xDFFF);
my $e000 = pack("U", 0xE000);
my $fffd = pack("U", 0xFFFD);
-my $fffe = pack("U", 0xFFFE);
my $ffff = pack("U", 0xFFFF);
my $hex4 = pack("U", 0x10000);
my $hex5 = pack("U", 0x100000);
EXPECT
UTF-16 surrogate 0xd800 at - line 3.
UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
########
use warnings 'utf8';
my $d7ff = "\x{D7FF}";
my $dfff = "\x{DFFF}";
my $e000 = "\x{E000}";
my $fffd = "\x{FFFD}";
-my $fffe = "\x{FFFE}";
my $ffff = "\x{FFFF}";
my $hex4 = "\x{10000}";
my $hex5 = "\x{100000}";
my $dfff = "\x{DFFF}";
my $e000 = "\x{E000}";
my $fffd = "\x{FFFD}";
-my $fffe = "\x{FFFE}";
my $ffff = "\x{FFFF}";
my $hex4 = "\x{10000}";
my $hex5 = "\x{100000}";
EXPECT
UTF-16 surrogate 0xd800 at - line 3.
UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
((uv >= 0xFDD0 && uv <= 0xFDEF &&
!(flags & UNICODE_ALLOW_FDD0))
||
- ((uv & 0xFFFF) == 0xFFFE &&
- !(flags & UNICODE_ALLOW_FFFE))
+ (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
+ !(flags & UNICODE_ALLOW_BOM))
||
((uv & 0xFFFF) == 0xFFFF &&
!(flags & UNICODE_ALLOW_FFFF))) &&
/* UNICODE_ALLOW_SUPER includes
- * FFFEs and FFFFs beyond 0x10FFFF. */
+ * FFFFs beyond 0x10FFFF. */
((uv <= PERL_UNICODE_MAX) ||
!(flags & UNICODE_ALLOW_SUPER))
)
UV
Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
{
- return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+ return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
+ ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
{
/* Call the low level routine asking for checks */
- return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+ return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
+ ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
/* We use utf8n_to_uvuni() as we want an index into
Unicode tables, not a native character number.
*/
- UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
+ UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
SV *errsv_save;
ENTER;
SAVETMPS;
#define UNICODE_SURROGATE_FIRST 0xd800
#define UNICODE_SURROGATE_LAST 0xdfff
#define UNICODE_REPLACEMENT 0xfffd
-#define UNICODE_BYTER_ORDER_MARK 0xfffe
+#define UNICODE_BYTE_ORDER_MARK 0xfeff
#define UNICODE_ILLEGAL 0xffff
/* Though our UTF-8 encoding can go beyond this,
- * let's be conservative. */
+ * let's be conservative and do as Unicode 3.2 says. */
#define PERL_UNICODE_MAX 0x10FFFF
#define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */
#define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */
-#define UNICODE_ALLOW_FFFE 0x0004 /* Allow 0xFFFE, 0x1FFFE, ... */
-#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_BOM 0x0004 /* Allow 0xFEFF */
+#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFF, 0x1FFFF, ... */
#define UNICODE_ALLOW_SUPER 0x0010 /* Allow past 10xFFFF */
#define UNICODE_ALLOW_ANY 0xFFFF
#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \
(c) <= UNICODE_SURROGATE_LAST)
#define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACEMENT)
-#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTER_ORDER_MARK)
+#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTE_ORDER_MARK)
#define UNICODE_IS_ILLEGAL(c) ((c) == UNICODE_ILLEGAL)
#ifdef HAS_QUAD