11 use vars qw(@ISA @EXPORT $VERSION %allowed $valid_utf8_regexp);
16 @EXPORT = qw(is_valid_string is_dodgy_utf8 is_sane_utf8
17 is_within_ascii is_within_latin1 is_within_latin_1
18 is_flagged_utf8 isnt_flagged_utf8);
20 # A Regexp string to match valid UTF8 bytes
21 # this info comes from page 78 of "The Unicode Standard 4.0"
22 # published by the Unicode Consortium
23 $valid_utf8_regexp = <<'.' ;
25 | [\x{c2}-\x{df}][\x{80}-\x{bf}]
26 | \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
27 | [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
28 | \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
29 | [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
30 | \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
31 | [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
32 | \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
37 Test::utf8 - handy utf8 tests
41 is_valid_string($string); # check the string is valid
42 is_sane_utf8($string); # check not double encoded
43 is_flagged_utf8($string); # has utf8 flag set
44 is_within_latin_1($string); # but only has latin_1 chars in it
48 This module is a collection of tests that's useful when dealing
49 with utf8 strings in Perl.
53 These two tests check if a string is valid, and if you've probably
54 made a mistake with your string
58 =item is_valid_string($string, $testname)
60 This passes and returns true true if and only if the scalar isn't a
61 invalid string; In short, it checks that the utf8 flag hasn't been set
62 for a string that isn't a valid utf8 encoding.
66 sub is_valid_string($;$)
69 my $name = shift || "valid string test";
71 # check we're a utf8 string - if not, we pass.
72 unless (Encode::is_utf8($string))
73 { return pass($name) }
75 # work out at what byte (if any) we have an invalid byte sequence
76 # and return the correct test result
77 my $pos = _invalid_sequence_at_byte($string);
78 ok(!defined($pos), $name)
79 or diag("malformed byte sequence starting at byte $pos");
82 sub _invalid_sequence_at_byte($)
86 # examine the bytes that make up the string (not the chars)
87 # by turning off the utf8 flag (no, use bytes doens't
88 # work, we're dealing with a regexp)
89 Encode::_utf8_off($string);
91 # work out the index of the first non matching byte
92 my $result = $string =~ m/^($valid_utf8_regexp)*/ogx;
94 # if we matched all the string return the empty list
95 my $pos = pos $string || 0;
96 return if $pos == length($string);
98 # otherwise return the position we found
102 =item is_sane_utf8($string, $name)
104 This test fails if the string contains something that looks like it
105 might be dodgy utf8, i.e. containing something that looks like the
106 multi-byte sequence for a latin-1 character but perl hasn't been
107 instructed to treat as such. Strings that are not utf8 always
110 Some examples may help:
112 # This will pass as it's a normal latin-1 string
113 is_sane_utf8("Hello L\x{e9}eon");
115 # this will fail because the \x{c3}\x{a9} looks like the
116 # utf8 byte sequence for e-acute
117 my $string = "Hello L\x{c3}\x{a9}on";
118 is_sane_utf8($string);
120 # this will pass because the utf8 is correctly interpreted as utf8
121 Encode::_utf8_on($string)
122 is_sane_utf8($string);
124 Obviously this isn't a hundred percent reliable. The edge case where
125 this will fail is where you have C<\x{c2}> (which is "LATIN CAPITAL
126 LETTER WITH CIRCUMFLEX") or C<\x{c3}> (which is "LATIN CAPITAL LETTER
127 WITH TILDE") followed by one of the latin-1 punctuation symbols.
129 # a capital letter A with tilde surrounded by smart quotes
130 # this will fail because it'll see the "\x{c2}\x{94}" and think
131 # it's actually the utf8 sequence for the end smart quote
132 is_sane_utf8("\x{93}\x{c2}\x{94}");
134 However, since this hardly comes up this test is reasonably reliable
135 in most cases. Still, care should be applied in cases where dynamic
136 data is placed next to latin-1 punctuation to avoid false negatives.
138 There exists two situations to cause this test to fail; The string
139 contains utf8 byte sequences and the string hasn't been flagged as
140 utf8 (this normally means that you got it from an external source like
141 a C library; When Perl needs to store a string internally as utf8 it
142 does it's own encoding and flagging transparently) or a utf8 flagged
143 string contains byte sequences that when translated to characters
144 themselves look like a utf8 byte sequence. The test diagnostics tells
145 you which is the case.
149 # build my regular expression out of the latin-1 bytes
150 # NOTE: This won't work if our locale is nonstandard will it?
151 my $re_bit = join "|", map { Encode::encode("utf8",chr($_)) } (127..255);
153 #binmode STDERR, ":utf8";
154 #print STDERR $re_bit;
156 sub is_sane_utf8($;$)
159 my $name = shift || "sane utf8";
161 # regexp in scalar context with 'g', meaning this loop will run for
162 # each match. Should only have to run it once, but will redo if
163 # the failing case turns out to be allowed in %allowed.
164 while ($string =~ /($re_bit)/o)
166 # work out what the double encoded string was
169 my $index = $+[0] - length($bytes);
170 my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, $bytes;
172 # what charecter does that represent?
173 my $char = Encode::decode("utf8",$bytes);
174 my $ord = ord($char);
175 my $hex = sprintf '%00x', $ord;
176 $char = charnames::viacode($ord);
178 # print out diagnostic messages
180 diag(qq{Found dodgy chars "$codes" at char $index\n});
181 if (Encode::is_utf8($string))
182 { diag("Chars in utf8 string look like utf8 byte sequence.") }
184 { diag("String not flagged as utf8...was it meant to be?\n") }
185 diag("Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n");
190 # got this far, must have passed.
195 # historic name of method; deprecated
198 # report errors not here but further up the calling stack
199 local $Test::Builder::Level = $Test::Builder::Level + 1;
201 # call without prototype with all args
207 =head2 Checking the Range of Characters in a String
209 These routines allow you to check the range of characters in a string.
210 Note that these routines are blind to the actual encoding perl
211 internally uses to store the characters, they just check if the
212 string contains only characters that can be represented in the named
217 =item is_within_ascii
219 Tests that a string only contains characters that are in the ASCII
224 sub is_within_ascii($;$)
227 my $name = shift || "within ascii";
229 # look for anything that isn't ascii or pass
230 $string =~ /([^\x{00}-\x{7f}])/ or return pass($name);
232 # explain why we failed
234 my $hex = sprintf '%02x', $dec;
237 diag("Char $+[0] not ASCII (it's $dec dec / $hex hex)");
242 =item is_within_latin_1
244 Tests that a string only contains characters that are in latin-1.
248 sub is_within_latin_1($;$)
251 my $name = shift || "within latin-1";
253 # look for anything that isn't ascii or pass
254 $string =~ /([^\x{00}-\x{ff}])/ or return pass($name);
256 # explain why we failed
258 my $hex = sprintf '%x', $dec;
261 diag("Char $+[0] not Latin-1 (it's $dec dec / $hex hex)");
268 # report errors not here but further up the calling stack
269 local $Test::Builder::Level = $Test::Builder::Level + 1;
271 # call without prototype with all args
272 &is_within_latin_1(@_);
277 =head2 Simple utf8 Flag Tests
279 Simply check if a scalar is or isn't flagged as utf8 by perl's
284 =item is_flagged_utf8($string, $name)
286 Passes if the string is flagged by perl's internals as utf8, fails if
294 my $name = shift || "flagged as utf8";
295 return ok(Encode::is_utf8($string),$name);
298 =item isnt_flagged_utf8($string,$name)
300 The opposite of C<is_flagged_utf8>, passes if and only if the string
301 isn't flagged as utf8 by perl's internals.
303 Note: you can refer to this function as C<isn't_flagged_utf8> if you
308 sub isnt_flagged_utf8($;$)
311 my $name = shift || "not flagged as utf8";
312 return ok(!Encode::is_utf8($string), $name);
315 sub isn::t_flagged_utf8($;$)
318 my $name = shift || "not flagged as utf8";
319 return ok(!Encode::is_utf8($string), $name);
326 Copyright Mark Fowler 2004. All rights reserved.
328 This program is free software; you can redistribute it
329 and/or modify it under the same terms as Perl itself.
333 None known. Please report any to me via the CPAN RT system. See
334 http://rt.cpan.org/ for more details.
338 L<Test::DoubleEncodedEntities> for testing for double encoded HTML
345 # shortcuts for Test::Builder.
348 my $Tester = Test::Builder->new();
352 local $Test::Builder::Level = $Test::Builder::Level + 1;
357 local $Test::Builder::Level = $Test::Builder::Level + 1;
363 local $Test::Builder::Level = $Test::Builder::Level + 1;
369 local $Test::Builder::Level = $Test::Builder::Level + 1;