local-lib5/lib/perl5/Test/utf8.pm

   1 package Test::utf8;
   2
   3 use 5.007003;
   4
   5 use strict;
   6 use warnings;
   7
   8 use Encode;
   9 use charnames ':full';
  10
  11 use vars qw(@ISA @EXPORT $VERSION %allowed $valid_utf8_regexp);
  12 $VERSION = "0.02";
  13
  14 require Exporter;
  15 @ISA = qw(Exporter);
  16 @EXPORT = qw(is_valid_string is_dodgy_utf8 is_sane_utf8
  17               is_within_ascii is_within_latin1 is_within_latin_1
  18               is_flagged_utf8 isnt_flagged_utf8);
  19
  20 # A Regexp string to match valid UTF8 bytes
  21 # this info comes from page 78 of "The Unicode Standard 4.0"
  22 # published by the Unicode Consortium
  23 $valid_utf8_regexp = <<'.' ;
  24         [\x{00}-\x{7f}]
  25       | [\x{c2}-\x{df}][\x{80}-\x{bf}]
  26       |         \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
  27       | [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
  28       |         \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
  29       | [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
  30       |         \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
  31       | [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
  32       |         \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
  33 .
  34
  35 =head1 NAME
  36
  37 Test::utf8 - handy utf8 tests
  38
  39 =head1 SYNOPSIS
  40
  41   is_valid_string($string);   # check the string is valid
  42   is_sane_utf8($string);      # check not double encoded
  43   is_flagged_utf8($string);   # has utf8 flag set
  44   is_within_latin_1($string); # but only has latin_1 chars in it
  45
  46 =head1 DESCRIPTION
  47
  48 This module is a collection of tests that's useful when dealing
  49 with utf8 strings in Perl.
  50
  51 =head2 Validity
  52
  53 These two tests check if a string is valid, and if you've probably
  54 made a mistake with your string
  55
  56 =over
  57
  58 =item is_valid_string($string, $testname)
  59
  60 This passes and returns true true if and only if the scalar isn't a
  61 invalid string; In short, it checks that the utf8 flag hasn't been set
  62 for a string that isn't a valid utf8 encoding.
  63
  64 =cut
  65
  66 sub is_valid_string($;$)
  67 {
  68   my $string = shift;
  69   my $name = shift || "valid string test";
  70
  71   # check we're a utf8 string - if not, we pass.
  72   unless (Encode::is_utf8($string))
  73     { return pass($name) }
  74
  75   # work out at what byte (if any) we have an invalid byte sequence
  76   # and return the correct test result
  77   my $pos = _invalid_sequence_at_byte($string);
  78   ok(!defined($pos), $name)
  79     or diag("malformed byte sequence starting at byte $pos");
  80 }
  81
  82 sub _invalid_sequence_at_byte($)
  83 {
  84   my $string = shift;
  85
  86   # examine the bytes that make up the string (not the chars)
  87   # by turning off the utf8 flag (no, use bytes doens't
  88   # work, we're dealing with a regexp)
  89   Encode::_utf8_off($string);
  90
  91   # work out the index of the first non matching byte
  92   my $result = $string =~ m/^($valid_utf8_regexp)*/ogx;
  93
  94   # if we matched all the string return the empty list
  95   my $pos = pos $string || 0;
  96   return if $pos == length($string);
  97
  98   # otherwise return the position we found
  99   return $pos
 100 }
 101
 102 =item is_sane_utf8($string, $name)
 103
 104 This test fails if the string contains something that looks like it
 105 might be dodgy utf8, i.e. containing something that looks like the
 106 multi-byte sequence for a latin-1 character but perl hasn't been
 107 instructed to treat as such.  Strings that are not utf8 always
 108 automatically pass.
 109
 110 Some examples may help:
 111
 112   # This will pass as it's a normal latin-1 string
 113   is_sane_utf8("Hello L\x{e9}eon");
 114
 115   # this will fail because the \x{c3}\x{a9} looks like the
 116   # utf8 byte sequence for e-acute
 117   my $string = "Hello L\x{c3}\x{a9}on";
 118   is_sane_utf8($string);
 119
 120   # this will pass because the utf8 is correctly interpreted as utf8
 121   Encode::_utf8_on($string)
 122   is_sane_utf8($string);
 123
 124 Obviously this isn't a hundred percent reliable.  The edge case where
 125 this will fail is where you have C<\x{c2}> (which is "LATIN CAPITAL
 126 LETTER WITH CIRCUMFLEX") or C<\x{c3}> (which is "LATIN CAPITAL LETTER
 127 WITH TILDE") followed by one of the latin-1 punctuation symbols.
 128
 129   # a capital letter A with tilde surrounded by smart quotes
 130   # this will fail because it'll see the "\x{c2}\x{94}" and think
 131   # it's actually the utf8 sequence for the end smart quote
 132   is_sane_utf8("\x{93}\x{c2}\x{94}");
 133
 134 However, since this hardly comes up this test is reasonably reliable
 135 in most cases.  Still, care should be applied in cases where dynamic
 136 data is placed next to latin-1 punctuation to avoid false negatives.
 137
 138 There exists two situations to cause this test to fail; The string
 139 contains utf8 byte sequences and the string hasn't been flagged as
 140 utf8 (this normally means that you got it from an external source like
 141 a C library; When Perl needs to store a string internally as utf8 it
 142 does it's own encoding and flagging transparently) or a utf8 flagged
 143 string contains byte sequences that when translated to characters
 144 themselves look like a utf8 byte sequence.  The test diagnostics tells
 145 you which is the case.
 146
 147 =cut
 148
 149 # build my regular expression out of the latin-1 bytes
 150 # NOTE: This won't work if our locale is nonstandard will it?
 151 my $re_bit = join "|", map { Encode::encode("utf8",chr($_)) } (127..255);
 152
 153 #binmode STDERR, ":utf8";
 154 #print STDERR $re_bit;
 155
 156 sub is_sane_utf8($;$)
 157 {
 158   my $string = shift;
 159   my $name   = shift || "sane utf8";
 160
 161   # regexp in scalar context with 'g', meaning this loop will run for
 162   # each match.  Should only have to run it once, but will redo if
 163   # the failing case turns out to be allowed in %allowed.
 164   while ($string =~ /($re_bit)/o)
 165   {
 166     # work out what the double encoded string was
 167     my $bytes = $1;
 168
 169     my $index = $+[0] - length($bytes);
 170     my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, $bytes;
 171
 172     # what charecter does that represent?
 173     my $char = Encode::decode("utf8",$bytes);
 174     my $ord  = ord($char);
 175     my $hex  = sprintf '%00x', $ord;
 176     $char = charnames::viacode($ord);
 177
 178     # print out diagnostic messages
 179     fail($name);
 180     diag(qq{Found dodgy chars "$codes" at char $index\n});
 181     if (Encode::is_utf8($string))
 182       { diag("Chars in utf8 string look like utf8 byte sequence.") }
 183     else
 184       { diag("String not flagged as utf8...was it meant to be?\n") }
 185     diag("Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n");
 186
 187     return 0;
 188   }
 189
 190   # got this far, must have passed.
 191   ok(1,$name);
 192   return 1;
 193 }
 194
 195 # historic name of method; deprecated
 196 sub is_dodgy_utf8
 197 {
 198   # report errors not here but further up the calling stack
 199   local $Test::Builder::Level = $Test::Builder::Level + 1;
 200
 201   # call without prototype with all args
 202   &is_sane_utf8(@_);
 203 }
 204
 205 =back
 206
 207 =head2 Checking the Range of Characters in a String
 208
 209 These routines allow you to check the range of characters in a string.
 210 Note that these routines are blind to the actual encoding perl
 211 internally uses to store the characters, they just check if the
 212 string contains only characters that can be represented in the named
 213 encoding.
 214
 215 =over
 216
 217 =item is_within_ascii
 218
 219 Tests that a string only contains characters that are in the ASCII
 220 charecter set.
 221
 222 =cut
 223
 224 sub is_within_ascii($;$)
 225 {
 226   my $string = shift;
 227   my $name   = shift || "within ascii";
 228
 229   # look for anything that isn't ascii or pass
 230   $string =~ /([^\x{00}-\x{7f}])/ or return pass($name);
 231
 232   # explain why we failed
 233   my $dec = ord($1);
 234   my $hex = sprintf '%02x', $dec;
 235
 236   fail($name);
 237   diag("Char $+[0] not ASCII (it's $dec dec / $hex hex)");
 238
 239   return 0;
 240 }
 241
 242 =item is_within_latin_1
 243
 244 Tests that a string only contains characters that are in latin-1.
 245
 246 =cut
 247
 248 sub is_within_latin_1($;$)
 249 {
 250   my $string = shift;
 251   my $name   = shift || "within latin-1";
 252
 253   # look for anything that isn't ascii or pass
 254   $string =~ /([^\x{00}-\x{ff}])/ or return pass($name);
 255
 256   # explain why we failed
 257   my $dec = ord($1);
 258   my $hex = sprintf '%x', $dec;
 259
 260   fail($name);
 261   diag("Char $+[0] not Latin-1 (it's $dec dec / $hex hex)");
 262
 263   return 0;
 264 }
 265
 266 sub is_within_latin1
 267 {
 268   # report errors not here but further up the calling stack
 269   local $Test::Builder::Level = $Test::Builder::Level + 1;
 270
 271   # call without prototype with all args
 272   &is_within_latin_1(@_);
 273 }
 274
 275 =back
 276
 277 =head2 Simple utf8 Flag Tests
 278
 279 Simply check if a scalar is or isn't flagged as utf8 by perl's
 280 internals.
 281
 282 =over
 283
 284 =item is_flagged_utf8($string, $name)
 285
 286 Passes if the string is flagged by perl's internals as utf8, fails if
 287 it's not.
 288
 289 =cut
 290
 291 sub is_flagged_utf8
 292 {
 293   my $string = shift;
 294   my $name = shift || "flagged as utf8";
 295   return ok(Encode::is_utf8($string),$name);
 296 }
 297
 298 =item isnt_flagged_utf8($string,$name)
 299
 300 The opposite of C<is_flagged_utf8>, passes if and only if the string
 301 isn't flagged as utf8 by perl's internals.
 302
 303 Note: you can refer to this function as C<isn't_flagged_utf8> if you
 304 really want to.
 305
 306 =cut
 307
 308 sub isnt_flagged_utf8($;$)
 309 {
 310   my $string = shift;
 311   my $name   = shift || "not flagged as utf8";
 312   return ok(!Encode::is_utf8($string), $name);
 313 }
 314
 315 sub isn::t_flagged_utf8($;$)
 316 {
 317   my $string = shift;
 318   my $name   = shift || "not flagged as utf8";
 319   return ok(!Encode::is_utf8($string), $name);
 320 }
 321
 322 =back
 323
 324 =head1 AUTHOR
 325
 326   Copyright Mark Fowler 2004.  All rights reserved.
 327
 328   This program is free software; you can redistribute it
 329   and/or modify it under the same terms as Perl itself.
 330
 331 =head1 BUGS
 332
 333 None known.  Please report any to me via the CPAN RT system.  See
 334 http://rt.cpan.org/ for more details.
 335
 336 =head1 SEE ALSO
 337
 338 L<Test::DoubleEncodedEntities> for testing for double encoded HTML
 339 entities.
 340
 341 =cut
 342
 343 ##########
 344
 345 # shortcuts for Test::Builder.
 346
 347 use Test::Builder;
 348 my $Tester = Test::Builder->new();
 349
 350 sub ok
 351 {
 352   local $Test::Builder::Level = $Test::Builder::Level + 1;
 353   $Tester->ok(@_)
 354 }
 355 sub diag
 356 {
 357   local $Test::Builder::Level = $Test::Builder::Level + 1;
 358    $Tester->diag(@_)
 359 }
 360
 361 sub fail
 362 {
 363   local $Test::Builder::Level = $Test::Builder::Level + 1;
 364   ok(0,@_)
 365 }
 366
 367 sub pass
 368 {
 369   local $Test::Builder::Level = $Test::Builder::Level + 1;
 370   ok(1,@_)
 371 }
 372
 373
 374 1;
 375
 376