Add built local::lib
[catagits/Gitalist.git] / local-lib5 / lib / perl5 / Test / utf8.pm
CommitLineData
3fea05b9 1package Test::utf8;
2
3use 5.007003;
4
5use strict;
6use warnings;
7
8use Encode;
9use charnames ':full';
10
11use vars qw(@ISA @EXPORT $VERSION %allowed $valid_utf8_regexp);
12$VERSION = "0.02";
13
14require Exporter;
15@ISA = qw(Exporter);
16@EXPORT = qw(is_valid_string is_dodgy_utf8 is_sane_utf8
17 is_within_ascii is_within_latin1 is_within_latin_1
18 is_flagged_utf8 isnt_flagged_utf8);
19
20# A Regexp string to match valid UTF8 bytes
21# this info comes from page 78 of "The Unicode Standard 4.0"
22# published by the Unicode Consortium
23$valid_utf8_regexp = <<'.' ;
24 [\x{00}-\x{7f}]
25 | [\x{c2}-\x{df}][\x{80}-\x{bf}]
26 | \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
27 | [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
28 | \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
29 | [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
30 | \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
31 | [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
32 | \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
33.
34
35=head1 NAME
36
37Test::utf8 - handy utf8 tests
38
39=head1 SYNOPSIS
40
41 is_valid_string($string); # check the string is valid
42 is_sane_utf8($string); # check not double encoded
43 is_flagged_utf8($string); # has utf8 flag set
44 is_within_latin_1($string); # but only has latin_1 chars in it
45
46=head1 DESCRIPTION
47
48This module is a collection of tests that's useful when dealing
49with utf8 strings in Perl.
50
51=head2 Validity
52
53These two tests check if a string is valid, and if you've probably
54made a mistake with your string
55
56=over
57
58=item is_valid_string($string, $testname)
59
60This passes and returns true true if and only if the scalar isn't a
61invalid string; In short, it checks that the utf8 flag hasn't been set
62for a string that isn't a valid utf8 encoding.
63
64=cut
65
66sub is_valid_string($;$)
67{
68 my $string = shift;
69 my $name = shift || "valid string test";
70
71 # check we're a utf8 string - if not, we pass.
72 unless (Encode::is_utf8($string))
73 { return pass($name) }
74
75 # work out at what byte (if any) we have an invalid byte sequence
76 # and return the correct test result
77 my $pos = _invalid_sequence_at_byte($string);
78 ok(!defined($pos), $name)
79 or diag("malformed byte sequence starting at byte $pos");
80}
81
82sub _invalid_sequence_at_byte($)
83{
84 my $string = shift;
85
86 # examine the bytes that make up the string (not the chars)
87 # by turning off the utf8 flag (no, use bytes doens't
88 # work, we're dealing with a regexp)
89 Encode::_utf8_off($string);
90
91 # work out the index of the first non matching byte
92 my $result = $string =~ m/^($valid_utf8_regexp)*/ogx;
93
94 # if we matched all the string return the empty list
95 my $pos = pos $string || 0;
96 return if $pos == length($string);
97
98 # otherwise return the position we found
99 return $pos
100}
101
102=item is_sane_utf8($string, $name)
103
104This test fails if the string contains something that looks like it
105might be dodgy utf8, i.e. containing something that looks like the
106multi-byte sequence for a latin-1 character but perl hasn't been
107instructed to treat as such. Strings that are not utf8 always
108automatically pass.
109
110Some examples may help:
111
112 # This will pass as it's a normal latin-1 string
113 is_sane_utf8("Hello L\x{e9}eon");
114
115 # this will fail because the \x{c3}\x{a9} looks like the
116 # utf8 byte sequence for e-acute
117 my $string = "Hello L\x{c3}\x{a9}on";
118 is_sane_utf8($string);
119
120 # this will pass because the utf8 is correctly interpreted as utf8
121 Encode::_utf8_on($string)
122 is_sane_utf8($string);
123
124Obviously this isn't a hundred percent reliable. The edge case where
125this will fail is where you have C<\x{c2}> (which is "LATIN CAPITAL
126LETTER WITH CIRCUMFLEX") or C<\x{c3}> (which is "LATIN CAPITAL LETTER
127WITH TILDE") followed by one of the latin-1 punctuation symbols.
128
129 # a capital letter A with tilde surrounded by smart quotes
130 # this will fail because it'll see the "\x{c2}\x{94}" and think
131 # it's actually the utf8 sequence for the end smart quote
132 is_sane_utf8("\x{93}\x{c2}\x{94}");
133
134However, since this hardly comes up this test is reasonably reliable
135in most cases. Still, care should be applied in cases where dynamic
136data is placed next to latin-1 punctuation to avoid false negatives.
137
138There exists two situations to cause this test to fail; The string
139contains utf8 byte sequences and the string hasn't been flagged as
140utf8 (this normally means that you got it from an external source like
141a C library; When Perl needs to store a string internally as utf8 it
142does it's own encoding and flagging transparently) or a utf8 flagged
143string contains byte sequences that when translated to characters
144themselves look like a utf8 byte sequence. The test diagnostics tells
145you which is the case.
146
147=cut
148
149# build my regular expression out of the latin-1 bytes
150# NOTE: This won't work if our locale is nonstandard will it?
151my $re_bit = join "|", map { Encode::encode("utf8",chr($_)) } (127..255);
152
153#binmode STDERR, ":utf8";
154#print STDERR $re_bit;
155
156sub is_sane_utf8($;$)
157{
158 my $string = shift;
159 my $name = shift || "sane utf8";
160
161 # regexp in scalar context with 'g', meaning this loop will run for
162 # each match. Should only have to run it once, but will redo if
163 # the failing case turns out to be allowed in %allowed.
164 while ($string =~ /($re_bit)/o)
165 {
166 # work out what the double encoded string was
167 my $bytes = $1;
168
169 my $index = $+[0] - length($bytes);
170 my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, $bytes;
171
172 # what charecter does that represent?
173 my $char = Encode::decode("utf8",$bytes);
174 my $ord = ord($char);
175 my $hex = sprintf '%00x', $ord;
176 $char = charnames::viacode($ord);
177
178 # print out diagnostic messages
179 fail($name);
180 diag(qq{Found dodgy chars "$codes" at char $index\n});
181 if (Encode::is_utf8($string))
182 { diag("Chars in utf8 string look like utf8 byte sequence.") }
183 else
184 { diag("String not flagged as utf8...was it meant to be?\n") }
185 diag("Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n");
186
187 return 0;
188 }
189
190 # got this far, must have passed.
191 ok(1,$name);
192 return 1;
193}
194
195# historic name of method; deprecated
196sub is_dodgy_utf8
197{
198 # report errors not here but further up the calling stack
199 local $Test::Builder::Level = $Test::Builder::Level + 1;
200
201 # call without prototype with all args
202 &is_sane_utf8(@_);
203}
204
205=back
206
207=head2 Checking the Range of Characters in a String
208
209These routines allow you to check the range of characters in a string.
210Note that these routines are blind to the actual encoding perl
211internally uses to store the characters, they just check if the
212string contains only characters that can be represented in the named
213encoding.
214
215=over
216
217=item is_within_ascii
218
219Tests that a string only contains characters that are in the ASCII
220charecter set.
221
222=cut
223
224sub is_within_ascii($;$)
225{
226 my $string = shift;
227 my $name = shift || "within ascii";
228
229 # look for anything that isn't ascii or pass
230 $string =~ /([^\x{00}-\x{7f}])/ or return pass($name);
231
232 # explain why we failed
233 my $dec = ord($1);
234 my $hex = sprintf '%02x', $dec;
235
236 fail($name);
237 diag("Char $+[0] not ASCII (it's $dec dec / $hex hex)");
238
239 return 0;
240}
241
242=item is_within_latin_1
243
244Tests that a string only contains characters that are in latin-1.
245
246=cut
247
248sub is_within_latin_1($;$)
249{
250 my $string = shift;
251 my $name = shift || "within latin-1";
252
253 # look for anything that isn't ascii or pass
254 $string =~ /([^\x{00}-\x{ff}])/ or return pass($name);
255
256 # explain why we failed
257 my $dec = ord($1);
258 my $hex = sprintf '%x', $dec;
259
260 fail($name);
261 diag("Char $+[0] not Latin-1 (it's $dec dec / $hex hex)");
262
263 return 0;
264}
265
266sub is_within_latin1
267{
268 # report errors not here but further up the calling stack
269 local $Test::Builder::Level = $Test::Builder::Level + 1;
270
271 # call without prototype with all args
272 &is_within_latin_1(@_);
273}
274
275=back
276
277=head2 Simple utf8 Flag Tests
278
279Simply check if a scalar is or isn't flagged as utf8 by perl's
280internals.
281
282=over
283
284=item is_flagged_utf8($string, $name)
285
286Passes if the string is flagged by perl's internals as utf8, fails if
287it's not.
288
289=cut
290
291sub is_flagged_utf8
292{
293 my $string = shift;
294 my $name = shift || "flagged as utf8";
295 return ok(Encode::is_utf8($string),$name);
296}
297
298=item isnt_flagged_utf8($string,$name)
299
300The opposite of C<is_flagged_utf8>, passes if and only if the string
301isn't flagged as utf8 by perl's internals.
302
303Note: you can refer to this function as C<isn't_flagged_utf8> if you
304really want to.
305
306=cut
307
308sub isnt_flagged_utf8($;$)
309{
310 my $string = shift;
311 my $name = shift || "not flagged as utf8";
312 return ok(!Encode::is_utf8($string), $name);
313}
314
315sub isn::t_flagged_utf8($;$)
316{
317 my $string = shift;
318 my $name = shift || "not flagged as utf8";
319 return ok(!Encode::is_utf8($string), $name);
320}
321
322=back
323
324=head1 AUTHOR
325
326 Copyright Mark Fowler 2004. All rights reserved.
327
328 This program is free software; you can redistribute it
329 and/or modify it under the same terms as Perl itself.
330
331=head1 BUGS
332
333None known. Please report any to me via the CPAN RT system. See
334http://rt.cpan.org/ for more details.
335
336=head1 SEE ALSO
337
338L<Test::DoubleEncodedEntities> for testing for double encoded HTML
339entities.
340
341=cut
342
343##########
344
345# shortcuts for Test::Builder.
346
347use Test::Builder;
348my $Tester = Test::Builder->new();
349
350sub ok
351{
352 local $Test::Builder::Level = $Test::Builder::Level + 1;
353 $Tester->ok(@_)
354}
355sub diag
356{
357 local $Test::Builder::Level = $Test::Builder::Level + 1;
358 $Tester->diag(@_)
359}
360
361sub fail
362{
363 local $Test::Builder::Level = $Test::Builder::Level + 1;
364 ok(0,@_)
365}
366
367sub pass
368{
369 local $Test::Builder::Level = $Test::Builder::Level + 1;
370 ok(1,@_)
371}
372
373
3741;
375
376