[catagits/Gitalist.git] / local-lib5 / lib / perl5 / Test / utf8.pm

package Test::utf8;

use 5.007003;

use strict;
use warnings;

use Encode;
use charnames ':full';

use vars qw(@ISA @EXPORT $VERSION %allowed $valid_utf8_regexp);
$VERSION = "0.02";

require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(is_valid_string is_dodgy_utf8 is_sane_utf8
	      is_within_ascii is_within_latin1 is_within_latin_1
              is_flagged_utf8 isnt_flagged_utf8);

# A Regexp string to match valid UTF8 bytes
# this info comes from page 78 of "The Unicode Standard 4.0"
# published by the Unicode Consortium
$valid_utf8_regexp = <<'.' ;
        [\x{00}-\x{7f}]
      | [\x{c2}-\x{df}][\x{80}-\x{bf}]
      |         \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
      | [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
      |         \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
      | [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
      |         \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
      | [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
      |         \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
.

=head1 NAME

Test::utf8 - handy utf8 tests

=head1 SYNOPSIS

  is_valid_string($string);   # check the string is valid
  is_sane_utf8($string);      # check not double encoded
  is_flagged_utf8($string);   # has utf8 flag set
  is_within_latin_1($string); # but only has latin_1 chars in it

=head1 DESCRIPTION

This module is a collection of tests that's useful when dealing
with utf8 strings in Perl.

=head2 Validity

These two tests check if a string is valid, and if you've probably
made a mistake with your string

=over

=item is_valid_string($string, $testname)

This passes and returns true true if and only if the scalar isn't a
invalid string; In short, it checks that the utf8 flag hasn't been set
for a string that isn't a valid utf8 encoding.

=cut

sub is_valid_string($;$)
{
  my $string = shift;
  my $name = shift || "valid string test";

  # check we're a utf8 string - if not, we pass.
  unless (Encode::is_utf8($string))
    { return pass($name) }

  # work out at what byte (if any) we have an invalid byte sequence
  # and return the correct test result
  my $pos = _invalid_sequence_at_byte($string);
  ok(!defined($pos), $name)
    or diag("malformed byte sequence starting at byte $pos");
}

sub _invalid_sequence_at_byte($)
{
  my $string = shift;

  # examine the bytes that make up the string (not the chars)
  # by turning off the utf8 flag (no, use bytes doens't
  # work, we're dealing with a regexp)
  Encode::_utf8_off($string);

  # work out the index of the first non matching byte
  my $result = $string =~ m/^($valid_utf8_regexp)*/ogx;

  # if we matched all the string return the empty list
  my $pos = pos $string || 0;
  return if $pos == length($string);

  # otherwise return the position we found
  return $pos
}

=item is_sane_utf8($string, $name)

This test fails if the string contains something that looks like it
might be dodgy utf8, i.e. containing something that looks like the
multi-byte sequence for a latin-1 character but perl hasn't been
instructed to treat as such.  Strings that are not utf8 always
automatically pass.

Some examples may help:

  # This will pass as it's a normal latin-1 string
  is_sane_utf8("Hello L\x{e9}eon");

  # this will fail because the \x{c3}\x{a9} looks like the
  # utf8 byte sequence for e-acute
  my $string = "Hello L\x{c3}\x{a9}on";
  is_sane_utf8($string);

  # this will pass because the utf8 is correctly interpreted as utf8
  Encode::_utf8_on($string)
  is_sane_utf8($string);

Obviously this isn't a hundred percent reliable.  The edge case where
this will fail is where you have C<\x{c2}> (which is "LATIN CAPITAL
LETTER WITH CIRCUMFLEX") or C<\x{c3}> (which is "LATIN CAPITAL LETTER
WITH TILDE") followed by one of the latin-1 punctuation symbols.

  # a capital letter A with tilde surrounded by smart quotes
  # this will fail because it'll see the "\x{c2}\x{94}" and think
  # it's actually the utf8 sequence for the end smart quote
  is_sane_utf8("\x{93}\x{c2}\x{94}");

However, since this hardly comes up this test is reasonably reliable
in most cases.  Still, care should be applied in cases where dynamic
data is placed next to latin-1 punctuation to avoid false negatives.

There exists two situations to cause this test to fail; The string
contains utf8 byte sequences and the string hasn't been flagged as
utf8 (this normally means that you got it from an external source like
a C library; When Perl needs to store a string internally as utf8 it
does it's own encoding and flagging transparently) or a utf8 flagged
string contains byte sequences that when translated to characters
themselves look like a utf8 byte sequence.  The test diagnostics tells
you which is the case.

=cut

# build my regular expression out of the latin-1 bytes
# NOTE: This won't work if our locale is nonstandard will it?
my $re_bit = join "|", map { Encode::encode("utf8",chr($_)) } (127..255);

#binmode STDERR, ":utf8";
#print STDERR $re_bit;

sub is_sane_utf8($;$)
{
  my $string = shift;
  my $name   = shift || "sane utf8";

  # regexp in scalar context with 'g', meaning this loop will run for
  # each match.  Should only have to run it once, but will redo if
  # the failing case turns out to be allowed in %allowed.
  while ($string =~ /($re_bit)/o)
  {
    # work out what the double encoded string was
    my $bytes = $1;

    my $index = $+[0] - length($bytes);
    my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, $bytes;

    # what charecter does that represent?
    my $char = Encode::decode("utf8",$bytes);
    my $ord  = ord($char);
    my $hex  = sprintf '%00x', $ord;
    $char = charnames::viacode($ord);

    # print out diagnostic messages
    fail($name);
    diag(qq{Found dodgy chars "$codes" at char $index\n});
    if (Encode::is_utf8($string))
      { diag("Chars in utf8 string look like utf8 byte sequence.") }
    else
      { diag("String not flagged as utf8...was it meant to be?\n") }
    diag("Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n");

    return 0;
  }

  # got this far, must have passed.
  ok(1,$name);
  return 1;
}

# historic name of method; deprecated
sub is_dodgy_utf8
{
  # report errors not here but further up the calling stack
  local $Test::Builder::Level = $Test::Builder::Level + 1;

  # call without prototype with all args
  &is_sane_utf8(@_);
}

=back

=head2 Checking the Range of Characters in a String

These routines allow you to check the range of characters in a string.
Note that these routines are blind to the actual encoding perl
internally uses to store the characters, they just check if the
string contains only characters that can be represented in the named
encoding.

=over

=item is_within_ascii

Tests that a string only contains characters that are in the ASCII
charecter set.

=cut

sub is_within_ascii($;$)
{
  my $string = shift;
  my $name   = shift || "within ascii";

  # look for anything that isn't ascii or pass
  $string =~ /([^\x{00}-\x{7f}])/ or return pass($name);

  # explain why we failed
  my $dec = ord($1);
  my $hex = sprintf '%02x', $dec;

  fail($name);
  diag("Char $+[0] not ASCII (it's $dec dec / $hex hex)");

  return 0;
}

=item is_within_latin_1

Tests that a string only contains characters that are in latin-1.

=cut

sub is_within_latin_1($;$)
{
  my $string = shift;
  my $name   = shift || "within latin-1";

  # look for anything that isn't ascii or pass
  $string =~ /([^\x{00}-\x{ff}])/ or return pass($name);

  # explain why we failed
  my $dec = ord($1);
  my $hex = sprintf '%x', $dec;

  fail($name);
  diag("Char $+[0] not Latin-1 (it's $dec dec / $hex hex)");

  return 0;
}

sub is_within_latin1
{
  # report errors not here but further up the calling stack
  local $Test::Builder::Level = $Test::Builder::Level + 1;

  # call without prototype with all args
  &is_within_latin_1(@_);
}

=back

=head2 Simple utf8 Flag Tests

Simply check if a scalar is or isn't flagged as utf8 by perl's
internals.

=over

=item is_flagged_utf8($string, $name)

Passes if the string is flagged by perl's internals as utf8, fails if
it's not.

=cut

sub is_flagged_utf8
{
  my $string = shift;
  my $name = shift || "flagged as utf8";
  return ok(Encode::is_utf8($string),$name);
}

=item isnt_flagged_utf8($string,$name)

The opposite of C<is_flagged_utf8>, passes if and only if the string
isn't flagged as utf8 by perl's internals.

Note: you can refer to this function as C<isn't_flagged_utf8> if you
really want to.

=cut

sub isnt_flagged_utf8($;$)
{
  my $string = shift;
  my $name   = shift || "not flagged as utf8";
  return ok(!Encode::is_utf8($string), $name);
}

sub isn::t_flagged_utf8($;$)
{
  my $string = shift;
  my $name   = shift || "not flagged as utf8";
  return ok(!Encode::is_utf8($string), $name);
}

=back

=head1 AUTHOR

  Copyright Mark Fowler 2004.  All rights reserved.

  This program is free software; you can redistribute it
  and/or modify it under the same terms as Perl itself.

=head1 BUGS

None known.  Please report any to me via the CPAN RT system.  See
http://rt.cpan.org/ for more details.

=head1 SEE ALSO

L<Test::DoubleEncodedEntities> for testing for double encoded HTML
entities.

=cut

##########

# shortcuts for Test::Builder.

use Test::Builder;
my $Tester = Test::Builder->new();

sub ok
{
  local $Test::Builder::Level = $Test::Builder::Level + 1;
  $Tester->ok(@_)
}
sub diag
{
  local $Test::Builder::Level = $Test::Builder::Level + 1;
   $Tester->diag(@_)
}

sub fail
{
  local $Test::Builder::Level = $Test::Builder::Level + 1;
  ok(0,@_)
}

sub pass
{
  local $Test::Builder::Level = $Test::Builder::Level + 1;
  ok(1,@_)
}


1;
Commit	Line	Data
3fea05b9	1	package Test::utf8;
	2
	3	use 5.007003;
	4
	5	use strict;
	6	use warnings;
	7
	8	use Encode;
	9	use charnames ':full';
	10
	11	use vars qw(@ISA @EXPORT $VERSION %allowed $valid_utf8_regexp);
	12	$VERSION = "0.02";
	13
	14	require Exporter;
	15	@ISA = qw(Exporter);
	16	@EXPORT = qw(is_valid_string is_dodgy_utf8 is_sane_utf8
	17	is_within_ascii is_within_latin1 is_within_latin_1
	18	is_flagged_utf8 isnt_flagged_utf8);
	19
	20	# A Regexp string to match valid UTF8 bytes
	21	# this info comes from page 78 of "The Unicode Standard 4.0"
	22	# published by the Unicode Consortium
	23	$valid_utf8_regexp = <<'.' ;
	24	[\x{00}-\x{7f}]
	25	\| [\x{c2}-\x{df}][\x{80}-\x{bf}]
	26	\| \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
	27	\| [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
	28	\| \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
	29	\| [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
	30	\| \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
	31	\| [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
	32	\| \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
	33	.
	34
	35	=head1 NAME
	36
	37	Test::utf8 - handy utf8 tests
	38
	39	=head1 SYNOPSIS
	40
	41	is_valid_string($string); # check the string is valid
	42	is_sane_utf8($string); # check not double encoded
	43	is_flagged_utf8($string); # has utf8 flag set
	44	is_within_latin_1($string); # but only has latin_1 chars in it
	45
	46	=head1 DESCRIPTION
	47
	48	This module is a collection of tests that's useful when dealing
	49	with utf8 strings in Perl.
	50
	51	=head2 Validity
	52
	53	These two tests check if a string is valid, and if you've probably
	54	made a mistake with your string
	55
	56	=over
	57
	58	=item is_valid_string($string, $testname)
	59
	60	This passes and returns true true if and only if the scalar isn't a
	61	invalid string; In short, it checks that the utf8 flag hasn't been set
	62	for a string that isn't a valid utf8 encoding.
	63
	64	=cut
65
66	sub is_valid_string($;$)
67	{
68	my $string = shift;
69	my $name = shift \|\| "valid string test";
70
71	# check we're a utf8 string - if not, we pass.
72	unless (Encode::is_utf8($string))
73	{ return pass($name) }
74
75	# work out at what byte (if any) we have an invalid byte sequence
76	# and return the correct test result
77	my $pos = _invalid_sequence_at_byte($string);
78	ok(!defined($pos), $name)
79	or diag("malformed byte sequence starting at byte $pos");
80	}
81
82	sub _invalid_sequence_at_byte($)
83	{
84	my $string = shift;
85
86	# examine the bytes that make up the string (not the chars)
87	# by turning off the utf8 flag (no, use bytes doens't
88	# work, we're dealing with a regexp)
89	Encode::_utf8_off($string);
90
91	# work out the index of the first non matching byte
92	my $result = $string =~ m/^($valid_utf8_regexp)*/ogx;
93
94	# if we matched all the string return the empty list
95	my $pos = pos $string \|\| 0;
96	return if $pos == length($string);
97
98	# otherwise return the position we found
99	return $pos
100	}
101
102	=item is_sane_utf8($string, $name)
103
104	This test fails if the string contains something that looks like it
105	might be dodgy utf8, i.e. containing something that looks like the
106	multi-byte sequence for a latin-1 character but perl hasn't been
107	instructed to treat as such. Strings that are not utf8 always
108	automatically pass.
109
110	Some examples may help:
111
112	# This will pass as it's a normal latin-1 string
113	is_sane_utf8("Hello L\x{e9}eon");
114
115	# this will fail because the \x{c3}\x{a9} looks like the
116	# utf8 byte sequence for e-acute
117	my $string = "Hello L\x{c3}\x{a9}on";
118	is_sane_utf8($string);
119
120	# this will pass because the utf8 is correctly interpreted as utf8
121	Encode::_utf8_on($string)
122	is_sane_utf8($string);
123
124	Obviously this isn't a hundred percent reliable. The edge case where
125	this will fail is where you have C<\x{c2}> (which is "LATIN CAPITAL
126	LETTER WITH CIRCUMFLEX") or C<\x{c3}> (which is "LATIN CAPITAL LETTER
127	WITH TILDE") followed by one of the latin-1 punctuation symbols.
128
129	# a capital letter A with tilde surrounded by smart quotes
130	# this will fail because it'll see the "\x{c2}\x{94}" and think
131	# it's actually the utf8 sequence for the end smart quote
132	is_sane_utf8("\x{93}\x{c2}\x{94}");
133
134	However, since this hardly comes up this test is reasonably reliable
135	in most cases. Still, care should be applied in cases where dynamic
136	data is placed next to latin-1 punctuation to avoid false negatives.
137
138	There exists two situations to cause this test to fail; The string
139	contains utf8 byte sequences and the string hasn't been flagged as
140	utf8 (this normally means that you got it from an external source like
141	a C library; When Perl needs to store a string internally as utf8 it
142	does it's own encoding and flagging transparently) or a utf8 flagged
143	string contains byte sequences that when translated to characters
144	themselves look like a utf8 byte sequence. The test diagnostics tells
145	you which is the case.
146
147	=cut
148
149	# build my regular expression out of the latin-1 bytes
150	# NOTE: This won't work if our locale is nonstandard will it?
151	my $re_bit = join "\|", map { Encode::encode("utf8",chr($_)) } (127..255);
152
153	#binmode STDERR, ":utf8";
154	#print STDERR $re_bit;
155
156	sub is_sane_utf8($;$)
157	{
158	my $string = shift;
159	my $name = shift \|\| "sane utf8";
160
161	# regexp in scalar context with 'g', meaning this loop will run for
162	# each match. Should only have to run it once, but will redo if
163	# the failing case turns out to be allowed in %allowed.
164	while ($string =~ /($re_bit)/o)
165	{
166	# work out what the double encoded string was
167	my $bytes = $1;
168
169	my $index = $+[0] - length($bytes);
170	my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, $bytes;
171
172	# what charecter does that represent?
173	my $char = Encode::decode("utf8",$bytes);
174	my $ord = ord($char);
175	my $hex = sprintf '%00x', $ord;
176	$char = charnames::viacode($ord);
177
178	# print out diagnostic messages
179	fail($name);
180	diag(qq{Found dodgy chars "$codes" at char $index\n});
181	if (Encode::is_utf8($string))
182	{ diag("Chars in utf8 string look like utf8 byte sequence.") }
183	else
184	{ diag("String not flagged as utf8...was it meant to be?\n") }
185	diag("Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n");
186
187	return 0;
188	}
189
190	# got this far, must have passed.
191	ok(1,$name);
192	return 1;
193	}
194
195	# historic name of method; deprecated
196	sub is_dodgy_utf8
197	{
198	# report errors not here but further up the calling stack
199	local $Test::Builder::Level = $Test::Builder::Level + 1;
200
201	# call without prototype with all args
202	&is_sane_utf8(@_);
203	}
204
205	=back
206
207	=head2 Checking the Range of Characters in a String
208
209	These routines allow you to check the range of characters in a string.
210	Note that these routines are blind to the actual encoding perl
211	internally uses to store the characters, they just check if the
212	string contains only characters that can be represented in the named
213	encoding.
214
215	=over
216
217	=item is_within_ascii
218
219	Tests that a string only contains characters that are in the ASCII
220	charecter set.
221
222	=cut
223
224	sub is_within_ascii($;$)
225	{
226	my $string = shift;
227	my $name = shift \|\| "within ascii";
228
229	# look for anything that isn't ascii or pass
230	$string =~ /([^\x{00}-\x{7f}])/ or return pass($name);
231
232	# explain why we failed
233	my $dec = ord($1);
234	my $hex = sprintf '%02x', $dec;
235
236	fail($name);
237	diag("Char $+[0] not ASCII (it's $dec dec / $hex hex)");
238
239	return 0;
240	}
241
242	=item is_within_latin_1
243
244	Tests that a string only contains characters that are in latin-1.
245
246	=cut
247
248	sub is_within_latin_1($;$)
249	{
250	my $string = shift;
251	my $name = shift \|\| "within latin-1";
252
253	# look for anything that isn't ascii or pass
254	$string =~ /([^\x{00}-\x{ff}])/ or return pass($name);
255
256	# explain why we failed
257	my $dec = ord($1);
258	my $hex = sprintf '%x', $dec;
259
260	fail($name);
261	diag("Char $+[0] not Latin-1 (it's $dec dec / $hex hex)");
262
263	return 0;
264	}
265
266	sub is_within_latin1
267	{
268	# report errors not here but further up the calling stack
269	local $Test::Builder::Level = $Test::Builder::Level + 1;
270
271	# call without prototype with all args
272	&is_within_latin_1(@_);
273	}
274
275	=back
276
277	=head2 Simple utf8 Flag Tests
278
279	Simply check if a scalar is or isn't flagged as utf8 by perl's
280	internals.
281
282	=over
283
284	=item is_flagged_utf8($string, $name)
285
286	Passes if the string is flagged by perl's internals as utf8, fails if
287	it's not.
288
289	=cut
290
291	sub is_flagged_utf8
292	{
293	my $string = shift;
294	my $name = shift \|\| "flagged as utf8";
295	return ok(Encode::is_utf8($string),$name);
296	}
297
298	=item isnt_flagged_utf8($string,$name)
299
300	The opposite of C<is_flagged_utf8>, passes if and only if the string
301	isn't flagged as utf8 by perl's internals.
302
303	Note: you can refer to this function as C<isn't_flagged_utf8> if you
304	really want to.
305
306	=cut
307
308	sub isnt_flagged_utf8($;$)
309	{
310	my $string = shift;
311	my $name = shift \|\| "not flagged as utf8";
312	return ok(!Encode::is_utf8($string), $name);
313	}
314
315	sub isn::t_flagged_utf8($;$)
316	{
317	my $string = shift;
318	my $name = shift \|\| "not flagged as utf8";
319	return ok(!Encode::is_utf8($string), $name);
320	}
321
322	=back
323
324	=head1 AUTHOR
325
326	Copyright Mark Fowler 2004. All rights reserved.
327
328	This program is free software; you can redistribute it
329	and/or modify it under the same terms as Perl itself.
330
331	=head1 BUGS
332
333	None known. Please report any to me via the CPAN RT system. See
334	http://rt.cpan.org/ for more details.
335
336	=head1 SEE ALSO
337
338	L<Test::DoubleEncodedEntities> for testing for double encoded HTML
339	entities.
340
341	=cut
342
343	##########
344
345	# shortcuts for Test::Builder.
346
347	use Test::Builder;
348	my $Tester = Test::Builder->new();
349
350	sub ok
351	{
352	local $Test::Builder::Level = $Test::Builder::Level + 1;
353	$Tester->ok(@_)
354	}
355	sub diag
356	{
357	local $Test::Builder::Level = $Test::Builder::Level + 1;
358	$Tester->diag(@_)
359	}
360
361	sub fail
362	{
363	local $Test::Builder::Level = $Test::Builder::Level + 1;
364	ok(0,@_)
365	}
366
367	sub pass
368	{
369	local $Test::Builder::Level = $Test::Builder::Level + 1;
370	ok(1,@_)
371	}
372
373
374	1;
375
376