Commit | Line | Data |
3fea05b9 |
1 | .\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.10) |
2 | .\" |
3 | .\" Standard preamble: |
4 | .\" ======================================================================== |
5 | .de Sp \" Vertical space (when we can't use .PP) |
6 | .if t .sp .5v |
7 | .if n .sp |
8 | .. |
9 | .de Vb \" Begin verbatim text |
10 | .ft CW |
11 | .nf |
12 | .ne \\$1 |
13 | .. |
14 | .de Ve \" End verbatim text |
15 | .ft R |
16 | .fi |
17 | .. |
18 | .\" Set up some character translations and predefined strings. \*(-- will |
19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left |
20 | .\" double quote, and \*(R" will give a right double quote. \*(C+ will |
21 | .\" give a nicer C++. Capital omega is used to do unbreakable dashes and |
22 | .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, |
23 | .\" nothing in troff, for use with C<>. |
24 | .tr \(*W- |
25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' |
26 | .ie n \{\ |
27 | . ds -- \(*W- |
28 | . ds PI pi |
29 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch |
30 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch |
31 | . ds L" "" |
32 | . ds R" "" |
33 | . ds C` "" |
34 | . ds C' "" |
35 | 'br\} |
36 | .el\{\ |
37 | . ds -- \|\(em\| |
38 | . ds PI \(*p |
39 | . ds L" `` |
40 | . ds R" '' |
41 | 'br\} |
42 | .\" |
43 | .\" Escape single quotes in literal strings from groff's Unicode transform. |
44 | .ie \n(.g .ds Aq \(aq |
45 | .el .ds Aq ' |
46 | .\" |
47 | .\" If the F register is turned on, we'll generate index entries on stderr for |
48 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index |
49 | .\" entries marked with X<> in POD. Of course, you'll have to process the |
50 | .\" output yourself in some meaningful fashion. |
51 | .ie \nF \{\ |
52 | . de IX |
53 | . tm Index:\\$1\t\\n%\t"\\$2" |
54 | .. |
55 | . nr % 0 |
56 | . rr F |
57 | .\} |
58 | .el \{\ |
59 | . de IX |
60 | .. |
61 | .\} |
62 | .\" |
63 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). |
64 | .\" Fear. Run. Save yourself. No user-serviceable parts. |
65 | . \" fudge factors for nroff and troff |
66 | .if n \{\ |
67 | . ds #H 0 |
68 | . ds #V .8m |
69 | . ds #F .3m |
70 | . ds #[ \f1 |
71 | . ds #] \fP |
72 | .\} |
73 | .if t \{\ |
74 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) |
75 | . ds #V .6m |
76 | . ds #F 0 |
77 | . ds #[ \& |
78 | . ds #] \& |
79 | .\} |
80 | . \" simple accents for nroff and troff |
81 | .if n \{\ |
82 | . ds ' \& |
83 | . ds ` \& |
84 | . ds ^ \& |
85 | . ds , \& |
86 | . ds ~ ~ |
87 | . ds / |
88 | .\} |
89 | .if t \{\ |
90 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" |
91 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' |
92 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' |
93 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' |
94 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' |
95 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' |
96 | .\} |
97 | . \" troff and (daisy-wheel) nroff accents |
98 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' |
99 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' |
100 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] |
101 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' |
102 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' |
103 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] |
104 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] |
105 | .ds ae a\h'-(\w'a'u*4/10)'e |
106 | .ds Ae A\h'-(\w'A'u*4/10)'E |
107 | . \" corrections for vroff |
108 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' |
109 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' |
110 | . \" for low resolution devices (crt and lpr) |
111 | .if \n(.H>23 .if \n(.V>19 \ |
112 | \{\ |
113 | . ds : e |
114 | . ds 8 ss |
115 | . ds o a |
116 | . ds d- d\h'-1'\(ga |
117 | . ds D- D\h'-1'\(hy |
118 | . ds th \o'bp' |
119 | . ds Th \o'LP' |
120 | . ds ae ae |
121 | . ds Ae AE |
122 | .\} |
123 | .rm #[ #] #H #V #F C |
124 | .\" ======================================================================== |
125 | .\" |
126 | .IX Title "Test::utf8 3" |
127 | .TH Test::utf8 3 "2004-09-10" "perl v5.8.8" "User Contributed Perl Documentation" |
128 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes |
129 | .\" way too many mistakes in technical documents. |
130 | .if n .ad l |
131 | .nh |
132 | .SH "NAME" |
133 | Test::utf8 \- handy utf8 tests |
134 | .SH "SYNOPSIS" |
135 | .IX Header "SYNOPSIS" |
136 | .Vb 4 |
137 | \& is_valid_string($string); # check the string is valid |
138 | \& is_sane_utf8($string); # check not double encoded |
139 | \& is_flagged_utf8($string); # has utf8 flag set |
140 | \& is_within_latin_1($string); # but only has latin_1 chars in it |
141 | .Ve |
142 | .SH "DESCRIPTION" |
143 | .IX Header "DESCRIPTION" |
144 | This module is a collection of tests that's useful when dealing |
145 | with utf8 strings in Perl. |
146 | .SS "Validity" |
147 | .IX Subsection "Validity" |
148 | These two tests check if a string is valid, and if you've probably |
149 | made a mistake with your string. |
150 | .ie n .IP "is_valid_string($string, $testname)" 4 |
151 | .el .IP "is_valid_string($string, \f(CW$testname\fR)" 4 |
152 | .IX Item "is_valid_string($string, $testname)" |
153 | This passes and returns true if and only if the scalar isn't an |
154 | invalid string; in short, it checks that the utf8 flag hasn't been set |
155 | for a string that isn't a valid utf8 encoding. |
156 | .ie n .IP "is_sane_utf8($string, $name)" 4 |
157 | .el .IP "is_sane_utf8($string, \f(CW$name\fR)" 4 |
158 | .IX Item "is_sane_utf8($string, $name)" |
159 | This test fails if the string contains something that looks like it |
160 | might be dodgy utf8, i.e. containing something that looks like the |
161 | multi-byte sequence for a latin\-1 character but perl hasn't been |
162 | instructed to treat as such. Strings that are not utf8 always |
163 | automatically pass. |
164 | .Sp |
165 | Some examples may help: |
166 | .Sp |
167 | .Vb 2 |
168 | \& # This will pass as it\*(Aqs a normal latin\-1 string |
169 | \& is_sane_utf8("Hello L\ex{e9}on"); |
170 | \& |
171 | \& # this will fail because the \ex{c3}\ex{a9} looks like the |
172 | \& # utf8 byte sequence for e\-acute |
173 | \& my $string = "Hello L\ex{c3}\ex{a9}on"; |
174 | \& is_sane_utf8($string); |
175 | \& |
176 | \& # this will pass because the utf8 is correctly interpreted as utf8 |
177 | \& Encode::_utf8_on($string); |
178 | \& is_sane_utf8($string); |
179 | .Ve |
180 | .Sp |
181 | Obviously this isn't a hundred percent reliable. The edge case where |
182 | this will fail is where you have \f(CW\*(C`\ex{c2}\*(C'\fR (which is \*(L"\s-1LATIN\s0 \s-1CAPITAL\s0 |
183 | \&\s-1LETTER\s0 A \s-1WITH\s0 \s-1CIRCUMFLEX\s0\*(R") or \f(CW\*(C`\ex{c3}\*(C'\fR (which is \*(L"\s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 A |
184 | \&\s-1WITH\s0 \s-1TILDE\s0\*(R") followed by one of the latin\-1 punctuation symbols. |
185 | .Sp |
186 | .Vb 4 |
187 | \& # a capital letter A with circumflex surrounded by smart quotes |
188 | \& # this will fail because it\*(Aqll see the "\ex{c2}\ex{94}" and think |
189 | \& # it\*(Aqs actually the utf8 sequence for the end smart quote |
190 | \& is_sane_utf8("\ex{93}\ex{c2}\ex{94}"); |
191 | .Ve |
192 | .Sp |
193 | However, since this hardly comes up this test is reasonably reliable |
194 | in most cases. Still, care should be applied in cases where dynamic |
195 | data is placed next to latin\-1 punctuation to avoid false negatives. |
196 | .Sp |
197 | There are two situations that cause this test to fail: the string |
198 | contains utf8 byte sequences and the string hasn't been flagged as |
199 | utf8 (this normally means that you got it from an external source like |
200 | a C library; when Perl needs to store a string internally as utf8 it |
201 | does its own encoding and flagging transparently) or a utf8 flagged |
202 | string contains byte sequences that when translated to characters |
203 | themselves look like a utf8 byte sequence. The test diagnostics tell |
204 | you which is the case. |
205 | .SS "Checking the Range of Characters in a String" |
206 | .IX Subsection "Checking the Range of Characters in a String" |
207 | These routines allow you to check the range of characters in a string. |
208 | Note that these routines are blind to the actual encoding perl |
209 | internally uses to store the characters, they just check if the |
210 | string contains only characters that can be represented in the named |
211 | encoding. |
212 | .IP "is_within_ascii" 4 |
213 | .IX Item "is_within_ascii" |
214 | Tests that a string only contains characters that are in the \s-1ASCII\s0 |
215 | character set. |
216 | .IP "is_within_latin_1" 4 |
217 | .IX Item "is_within_latin_1" |
218 | Tests that a string only contains characters that are in latin\-1. |
219 | .SS "Simple utf8 Flag Tests" |
220 | .IX Subsection "Simple utf8 Flag Tests" |
221 | Simply check if a scalar is or isn't flagged as utf8 by perl's |
222 | internals. |
223 | .ie n .IP "is_flagged_utf8($string, $name)" 4 |
224 | .el .IP "is_flagged_utf8($string, \f(CW$name\fR)" 4 |
225 | .IX Item "is_flagged_utf8($string, $name)" |
226 | Passes if the string is flagged by perl's internals as utf8, fails if |
227 | it's not. |
228 | .IP "isnt_flagged_utf8($string,$name)" 4 |
229 | .IX Item "isnt_flagged_utf8($string,$name)" |
230 | The opposite of \f(CW\*(C`is_flagged_utf8\*(C'\fR, passes if and only if the string |
231 | isn't flagged as utf8 by perl's internals. |
232 | .Sp |
233 | Note: you can refer to this function as \f(CW\*(C`isn\*(Aqt_flagged_utf8\*(C'\fR if you |
234 | really want to. |
235 | .SH "AUTHOR" |
236 | .IX Header "AUTHOR" |
237 | .Vb 1 |
238 | \& Copyright Mark Fowler 2004. All rights reserved. |
239 | \& |
240 | \& This program is free software; you can redistribute it |
241 | \& and/or modify it under the same terms as Perl itself. |
242 | .Ve |
243 | .SH "BUGS" |
244 | .IX Header "BUGS" |
245 | None known. Please report any to me via the \s-1CPAN\s0 \s-1RT\s0 system. See |
246 | http://rt.cpan.org/ for more details. |
247 | .SH "SEE ALSO" |
248 | .IX Header "SEE ALSO" |
249 | Test::DoubleEncodedEntities for testing for double encoded \s-1HTML\s0 |
250 | entities. |