Add built local::lib
[catagits/Gitalist.git] / local-lib5 / man / man3 / Test::utf8.3pm
CommitLineData
3fea05b9 1.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.10)
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sp \" Vertical space (when we can't use .PP)
6.if t .sp .5v
7.if n .sp
8..
9.de Vb \" Begin verbatim text
10.ft CW
11.nf
12.ne \\$1
13..
14.de Ve \" End verbatim text
15.ft R
16.fi
17..
18.\" Set up some character translations and predefined strings. \*(-- will
19.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
20.\" double quote, and \*(R" will give a right double quote. \*(C+ will
21.\" give a nicer C++. Capital omega is used to do unbreakable dashes and
22.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff,
23.\" nothing in troff, for use with C<>.
24.tr \(*W-
25.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
26.ie n \{\
27. ds -- \(*W-
28. ds PI pi
29. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
30. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
31. ds L" ""
32. ds R" ""
33. ds C` ""
34. ds C' ""
35'br\}
36.el\{\
37. ds -- \|\(em\|
38. ds PI \(*p
39. ds L" ``
40. ds R" ''
41'br\}
42.\"
43.\" Escape single quotes in literal strings from groff's Unicode transform.
44.ie \n(.g .ds Aq \(aq
45.el .ds Aq '
46.\"
47.\" If the F register is turned on, we'll generate index entries on stderr for
48.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
49.\" entries marked with X<> in POD. Of course, you'll have to process the
50.\" output yourself in some meaningful fashion.
51.ie \nF \{\
52. de IX
53. tm Index:\\$1\t\\n%\t"\\$2"
54..
55. nr % 0
56. rr F
57.\}
58.el \{\
59. de IX
60..
61.\}
62.\"
63.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
64.\" Fear. Run. Save yourself. No user-serviceable parts.
65. \" fudge factors for nroff and troff
66.if n \{\
67. ds #H 0
68. ds #V .8m
69. ds #F .3m
70. ds #[ \f1
71. ds #] \fP
72.\}
73.if t \{\
74. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
75. ds #V .6m
76. ds #F 0
77. ds #[ \&
78. ds #] \&
79.\}
80. \" simple accents for nroff and troff
81.if n \{\
82. ds ' \&
83. ds ` \&
84. ds ^ \&
85. ds , \&
86. ds ~ ~
87. ds /
88.\}
89.if t \{\
90. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
91. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
92. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
93. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
94. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
95. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
96.\}
97. \" troff and (daisy-wheel) nroff accents
98.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
99.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
100.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
101.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
102.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
103.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
104.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
105.ds ae a\h'-(\w'a'u*4/10)'e
106.ds Ae A\h'-(\w'A'u*4/10)'E
107. \" corrections for vroff
108.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
109.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
110. \" for low resolution devices (crt and lpr)
111.if \n(.H>23 .if \n(.V>19 \
112\{\
113. ds : e
114. ds 8 ss
115. ds o a
116. ds d- d\h'-1'\(ga
117. ds D- D\h'-1'\(hy
118. ds th \o'bp'
119. ds Th \o'LP'
120. ds ae ae
121. ds Ae AE
122.\}
123.rm #[ #] #H #V #F C
124.\" ========================================================================
125.\"
126.IX Title "Test::utf8 3"
127.TH Test::utf8 3 "2004-09-10" "perl v5.8.8" "User Contributed Perl Documentation"
128.\" For nroff, turn off justification. Always turn off hyphenation; it makes
129.\" way too many mistakes in technical documents.
130.if n .ad l
131.nh
132.SH "NAME"
133Test::utf8 \- handy utf8 tests
134.SH "SYNOPSIS"
135.IX Header "SYNOPSIS"
136.Vb 4
137\& is_valid_string($string); # check the string is valid
138\& is_sane_utf8($string); # check not double encoded
139\& is_flagged_utf8($string); # has utf8 flag set
140\& is_within_latin_1($string); # but only has latin_1 chars in it
141.Ve
142.SH "DESCRIPTION"
143.IX Header "DESCRIPTION"
144This module is a collection of tests that's useful when dealing
145with utf8 strings in Perl.
146.SS "Validity"
147.IX Subsection "Validity"
148These two tests check if a string is valid, and if you've probably
149made a mistake with your string.
150.ie n .IP "is_valid_string($string, $testname)" 4
151.el .IP "is_valid_string($string, \f(CW$testname\fR)" 4
152.IX Item "is_valid_string($string, $testname)"
153This passes and returns true if and only if the scalar isn't an
154invalid string; in short, it checks that the utf8 flag hasn't been set
155for a string that isn't a valid utf8 encoding.
156.ie n .IP "is_sane_utf8($string, $name)" 4
157.el .IP "is_sane_utf8($string, \f(CW$name\fR)" 4
158.IX Item "is_sane_utf8($string, $name)"
159This test fails if the string contains something that looks like it
160might be dodgy utf8, i.e. containing something that looks like the
161multi-byte sequence for a latin\-1 character but perl hasn't been
162instructed to treat as such. Strings that are not utf8 always
163automatically pass.
164.Sp
165Some examples may help:
166.Sp
167.Vb 2
168\& # This will pass as it\*(Aqs a normal latin\-1 string
169\& is_sane_utf8("Hello L\ex{e9}eon");
170\&
171\& # this will fail because the \ex{c3}\ex{a9} looks like the
172\& # utf8 byte sequence for e\-acute
173\& my $string = "Hello L\ex{c3}\ex{a9}on";
174\& is_sane_utf8($string);
175\&
176\& # this will pass because the utf8 is correctly interpreted as utf8
177\& Encode::_utf8_on($string);
178\& is_sane_utf8($string);
179.Ve
180.Sp
181Obviously this isn't a hundred percent reliable. The edge case where
182this will fail is where you have \f(CW\*(C`\ex{c2}\*(C'\fR (which is \*(L"\s-1LATIN\s0 \s-1CAPITAL\s0
183\&\s-1LETTER\s0 A \s-1WITH\s0 \s-1CIRCUMFLEX\s0\*(R") or \f(CW\*(C`\ex{c3}\*(C'\fR (which is \*(L"\s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 A
184\&\s-1WITH\s0 \s-1TILDE\s0\*(R") followed by one of the latin\-1 punctuation symbols.
185.Sp
186.Vb 4
187\& # a capital letter A with circumflex surrounded by smart quotes
188\& # this will fail because it\*(Aqll see the "\ex{c2}\ex{94}" and think
189\& # it\*(Aqs actually the utf8 sequence for the end smart quote
190\& is_sane_utf8("\ex{93}\ex{c2}\ex{94}");
191.Ve
192.Sp
193However, since this hardly ever comes up, this test is reasonably reliable
194in most cases. Still, care should be applied in cases where dynamic
195data is placed next to latin\-1 punctuation to avoid false negatives.
196.Sp
197There are two situations that cause this test to fail: the string
198contains utf8 byte sequences and the string hasn't been flagged as
199utf8 (this normally means that you got it from an external source like
200a C library; when Perl needs to store a string internally as utf8 it
201does its own encoding and flagging transparently) or a utf8 flagged
202string contains byte sequences that when translated to characters
203themselves look like a utf8 byte sequence. The test diagnostics tell
204you which is the case.
205.SS "Checking the Range of Characters in a String"
206.IX Subsection "Checking the Range of Characters in a String"
207These routines allow you to check the range of characters in a string.
208Note that these routines are blind to the actual encoding perl
209internally uses to store the characters; they just check if the
210string contains only characters that can be represented in the named
211encoding.
212.IP "is_within_ascii" 4
213.IX Item "is_within_ascii"
214Tests that a string only contains characters that are in the \s-1ASCII\s0
215character set.
216.IP "is_within_latin_1" 4
217.IX Item "is_within_latin_1"
218Tests that a string only contains characters that are in latin\-1.
219.SS "Simple utf8 Flag Tests"
220.IX Subsection "Simple utf8 Flag Tests"
221Simply check if a scalar is or isn't flagged as utf8 by perl's
222internals.
223.ie n .IP "is_flagged_utf8($string, $name)" 4
224.el .IP "is_flagged_utf8($string, \f(CW$name\fR)" 4
225.IX Item "is_flagged_utf8($string, $name)"
226Passes if the string is flagged by perl's internals as utf8, fails if
227it's not.
228.IP "isnt_flagged_utf8($string,$name)" 4
229.IX Item "isnt_flagged_utf8($string,$name)"
230The opposite of \f(CW\*(C`is_flagged_utf8\*(C'\fR, passes if and only if the string
231isn't flagged as utf8 by perl's internals.
232.Sp
233Note: you can refer to this function as \f(CW\*(C`isn\*(Aqt_flagged_utf8\*(C'\fR if you
234really want to.
235.SH "AUTHOR"
236.IX Header "AUTHOR"
237.Vb 1
238\& Copyright Mark Fowler 2004. All rights reserved.
239\&
240\& This program is free software; you can redistribute it
241\& and/or modify it under the same terms as Perl itself.
242.Ve
243.SH "BUGS"
244.IX Header "BUGS"
245None known. Please report any to me via the \s-1CPAN\s0 \s-1RT\s0 system. See
246http://rt.cpan.org/ for more details.
247.SH "SEE ALSO"
248.IX Header "SEE ALSO"
249Test::DoubleEncodedEntities for testing for double encoded \s-1HTML\s0
250entities.