[p5sagit/p5-mst-13.2.git] / lib / utf8.t

#!./perl 

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
    require './test.pl';
}

# NOTE!
#
# Think carefully before adding tests here.  In general this should be
# used only for about three categories of tests:
#
# (1) tests that absolutely require 'use utf8', and since that in general
#     shouldn't be needed as the utf8 is being obsoleted, this should
#     have rather few tests.  If you want to test Unicode and regexes,
#     you probably want to go to op/regexp or op/pat; if you want to test
#     split, go to op/split; pack, op/pack; appending or joining,
#     op/append or op/join, and so forth
#
# (2) tests that have to do with Unicode tokenizing (though it's likely
#     that all the other Unicode tests sprinkled around the t/**/*.t are
#     going to catch that)
#
# (3) complicated tests that simultaneously stress so many Unicode features
#     that deciding into which other test script the tests should go to
#     is hard -- maybe consider breaking up the complicated test
#
#

plan tests => 31;

{
    # bug id 20001009.001

    my ($a, $b);

    { use bytes; $a = "\xc3\xa4" }
    { use utf8;  $b = "\xe4"     }

    my $test = 68;

    ok($a ne $b);

    { use utf8; ok($a ne $b) }
}


{
    # bug id 20000730.004

    my $smiley = "\x{263a}";

    for my $s ("\x{263a}",
	       $smiley,
		
	       "" . $smiley,
	       "" . "\x{263a}",

	       $smiley    . "",
	       "\x{263a}" . "",
	       ) {
	my $length_chars = length($s);
	my $length_bytes;
	{ use bytes; $length_bytes = length($s) }
	my @regex_chars = $s =~ m/(.)/g;
	my $regex_chars = @regex_chars;
	my @split_chars = split //, $s;
	my $split_chars = @split_chars;
	ok("$length_chars/$regex_chars/$split_chars/$length_bytes" eq
	   "1/1/1/3");
    }

    for my $s ("\x{263a}" . "\x{263a}",
	       $smiley    . $smiley,

	       "\x{263a}\x{263a}",
	       "$smiley$smiley",
	       
	       "\x{263a}" x 2,
	       $smiley    x 2,
	       ) {
	my $length_chars = length($s);
	my $length_bytes;
	{ use bytes; $length_bytes = length($s) }
	my @regex_chars = $s =~ m/(.)/g;
	my $regex_chars = @regex_chars;
	my @split_chars = split //, $s;
	my $split_chars = @split_chars;
	ok("$length_chars/$regex_chars/$split_chars/$length_bytes" eq
	   "2/2/2/6");
    }
}


{
    my $w = 0;
    local $SIG{__WARN__} = sub { print "#($_[0])\n"; $w++ };
    my $x = eval q/"\\/ . "\x{100}" . q/"/;;
   
    ok($w == 0 && $x eq "\x{100}");
}

{
    use warnings;
    my $progfile = 'utf' . $$;
    END {unlink_all $progfile}

    # If I'm right 60 is '>' in ASCII, ' ' in EBCDIC
    # 173 is not punctuation in either ASCII or EBCDIC
    my (@char);
    foreach (60, 173, 257, 65532) {
      my $char = chr $_;
      utf8::encode($char);
      # I don't want to use map {ord} and I've no need to hardcode the UTF
      # version
      my $charsubst = $char;
      $charsubst =~ s/(.)/ord ($1) . ','/ge;
      chop $charsubst;
      push @char, [$_, $char, $charsubst];
    }
    foreach (
             ['check our detection program works',
              '@a = ("'.chr(60).'\x2A", ""); display @a', qr/^>60,42<><$/],
             ['check literal 8 bit input',
              '$a = "' . chr (173) . '"; display $a', qr/^>173<$/],
             ['check no utf8; makes no change',
              'no utf8; $a = "' . chr (173) . '"; display $a', qr/^>173<$/],
             # Now we do the real byte sequences that are valid UTF8
             (map {
               ["the utf8 sequence for chr $_->[0]",
                qq(\$a = "$_->[1]"; display \$a), qr/^>$_->[2]<$/],
               ["no utf8; for the utf8 sequence for chr $_->[0]",
                qq(no utf8; \$a = "$_->[1]"; display \$a), qr/^>$_->[2]<$/],
               ["use utf8; for the utf8 sequence for chr $_->[0]",
                qq(use utf8; \$a = "$_->[1]"; display \$a), qr/^>$_->[0]<$/],
              } @char),
             # Interpolation of hex characters needs to take place now, as we're
             # testing feeding malformed utf8 into perl. Bug now fixed was an
             # "out of memory" error. We really need the "" [rather than qq()
             # or q()] to get the best explosion.
             ["!Feed malformed utf8 into perl.", <<"BANG",
    use utf8; %a = ("\xE1\xA0"=>"sterling");
    print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
BANG
	      qr/^Malformed UTF-8 character \(2 bytes, need 3\).*start\d+,end$/s
	     ],
            ) {
        my ($why, $prog, $expect) = @$_;
        open P, ">$progfile" or die "Can't open '$progfile': $!";
        print P q(
                  sub display {
                    print '>' . join (',', map {ord} split //, $_) . '<'
                    foreach @_;
                  }
                 );
	print P $prog;
        close P or die "Can't close '$progfile': $!";
        if ($why =~ s/^!//) {
            print "# Possible delay...\n";
        } else {
            print "# $prog\n";
        }
        my $result = runperl ( stderr => 1, progfile => $progfile );
        like ($result, $expect, $why);
    }
}
Commit	Line	Data
f96ec2a2	1	#!./perl
	2
	3	BEGIN {
	4	chdir 't' if -d 't';
20822f61	5	@INC = '../lib';
8ae6c9f9	6	require './test.pl';
f96ec2a2	7	}
f96ec2a2	8
4765795a	9	# NOTE!
	10	#
	11	# Think carefully before adding tests here. In general this should be
	12	# used only for about three categories of tests:
	13	#
	14	# (1) tests that absolutely require 'use utf8', and since that in general
	15	# shouldn't be needed as the utf8 is being obsoleted, this should
	16	# have rather few tests. If you want to test Unicode and regexes,
	17	# you probably want to go to op/regexp or op/pat; if you want to test
	18	# split, go to op/split; pack, op/pack; appending or joining,
	19	# op/append or op/join, and so forth
	20	#
	21	# (2) tests that have to do with Unicode tokenizing (though it's likely
	22	# that all the other Unicode tests sprinkled around the t/*/.t are
	23	# going to catch that)
	24	#
	25	# (3) complicated tests that simultaneously stress so many Unicode features
	26	# that deciding into which other test script the tests should go to
	27	# is hard -- maybe consider breaking up the complicated test
	28	#
	29	#
	30
435e7af6	31	plan tests => 31;
31067593	32
7bbb0251	33	{
da450f52	34	# bug id 20001009.001
da450f52	35
89491803	36	my ($a, $b);
	37
	38	{ use bytes; $a = "\xc3\xa4" }
4765795a	39	{ use utf8; $b = "\xe4" }
89491803	40
4765795a	41	my $test = 68;
31067593	42
4765795a	43	ok($a ne $b);
da450f52	44
4765795a	45	{ use utf8; ok($a ne $b) }
da450f52	46	}
da450f52	47
60ff4832	48
	49	{
	50	# bug id 20000730.004
	51
60ff4832	52	my $smiley = "\x{263a}";
60ff4832	53
4765795a	54	for my $s ("\x{263a}",
4765795a	55	$smiley,
60ff4832	56
4765795a	57	"" . $smiley,
4765795a	58	"" . "\x{263a}",
60ff4832	59
4765795a	60	$smiley . "",
4765795a	61	"\x{263a}" . "",
60ff4832	62	) {
	63	my $length_chars = length($s);
	64	my $length_bytes;
	65	{ use bytes; $length_bytes = length($s) }
	66	my @regex_chars = $s =~ m/(.)/g;
	67	my $regex_chars = @regex_chars;
	68	my @split_chars = split //, $s;
	69	my $split_chars = @split_chars;
4765795a	70	ok("$length_chars/$regex_chars/$split_chars/$length_bytes" eq
4765795a	71	"1/1/1/3");
60ff4832	72	}
60ff4832	73
4765795a	74	for my $s ("\x{263a}" . "\x{263a}",
4765795a	75	$smiley . $smiley,
60ff4832	76
4765795a	77	"\x{263a}\x{263a}",
4765795a	78	"$smiley$smiley",
60ff4832	79
4765795a	80	"\x{263a}" x 2,
4765795a	81	$smiley x 2,
60ff4832	82	) {
	83	my $length_chars = length($s);
	84	my $length_bytes;
	85	{ use bytes; $length_bytes = length($s) }
	86	my @regex_chars = $s =~ m/(.)/g;
	87	my $regex_chars = @regex_chars;
	88	my @split_chars = split //, $s;
	89	my $split_chars = @split_chars;
4765795a	90	ok("$length_chars/$regex_chars/$split_chars/$length_bytes" eq
4765795a	91	"2/2/2/6");
60ff4832	92	}
60ff4832	93	}
ffc61ed2	94
ffc61ed2	95
ffc61ed2	96	{
f9a63242	97	my $w = 0;
	98	local $SIG{__WARN__} = sub { print "#($_[0])\n"; $w++ };
	99	my $x = eval q/"\\/ . "\x{100}" . q/"/;;
	100
4765795a	101	ok($w == 0 && $x eq "\x{100}");
f9a63242	102	}
f9a63242	103
8ae6c9f9	104	{
435e7af6	105	use warnings;
8ae6c9f9	106	my $progfile = 'utf' . $$;
435e7af6	107	END {unlink_all $progfile}
	108
	109	# If I'm right 60 is '>' in ASCII, ' ' in EBCDIC
	110	# 173 is not punctuation in either ASCII or EBCDIC
	111	my (@char);
	112	foreach (60, 173, 257, 65532) {
	113	my $char = chr $_;
	114	utf8::encode($char);
	115	# I don't want to use map {ord} and I've no need to hardcode the UTF
	116	# version
	117	my $charsubst = $char;
	118	$charsubst =~ s/(.)/ord ($1) . ','/ge;
	119	chop $charsubst;
	120	push @char, [$_, $char, $charsubst];
	121	}
	122	foreach (
	123	['check our detection program works',
	124	'@a = ("'.chr(60).'\x2A", ""); display @a', qr/^>60,42<><$/],
	125	['check literal 8 bit input',
	126	'$a = "' . chr (173) . '"; display $a', qr/^>173<$/],
	127	['check no utf8; makes no change',
	128	'no utf8; $a = "' . chr (173) . '"; display $a', qr/^>173<$/],
	129	# Now we do the real byte sequences that are valid UTF8
	130	(map {
	131	["the utf8 sequence for chr $_->[0]",
	132	qq(\$a = "$_->[1]"; display \$a), qr/^>$_->[2]<$/],
	133	["no utf8; for the utf8 sequence for chr $_->[0]",
	134	qq(no utf8; \$a = "$_->[1]"; display \$a), qr/^>$_->[2]<$/],
	135	["use utf8; for the utf8 sequence for chr $_->[0]",
	136	qq(use utf8; \$a = "$_->[1]"; display \$a), qr/^>$_->[0]<$/],
	137	} @char),
	138	# Interpolation of hex characters needs to take place now, as we're
	139	# testing feeding malformed utf8 into perl. Bug now fixed was an
	140	# "out of memory" error. We really need the "" [rather than qq()
	141	# or q()] to get the best explosion.
	142	["!Feed malformed utf8 into perl.", <<"BANG",
8ae6c9f9	143	use utf8; %a = ("\xE1\xA0"=>"sterling");
435e7af6	144	print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
8ae6c9f9	145	BANG
435e7af6	146	qr/^Malformed UTF-8 character \(2 bytes, need 3\).*start\d+,end$/s
	147	],
	148	) {
	149	my ($why, $prog, $expect) = @$_;
	150	open P, ">$progfile" or die "Can't open '$progfile': $!";
	151	print P q(
	152	sub display {
	153	print '>' . join (',', map {ord} split //, $_) . '<'
	154	foreach @_;
	155	}
	156	);
	157	print P $prog;
	158	close P or die "Can't close '$progfile': $!";
	159	if ($why =~ s/^!//) {
	160	print "# Possible delay...\n";
	161	} else {
	162	print "# $prog\n";
	163	}
	164	my $result = runperl ( stderr => 1, progfile => $progfile );
	165	like ($result, $expect, $why);
	166	}
8ae6c9f9	167	}