[catagits/Gitalist.git] / local-lib5 / man / man3 / Test::utf8.3pm

.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.10)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.ie \nF \{\
.    de IX
.    tm Index:\\$1\t\\n%\t"\\$2"
..
.    nr % 0
.    rr F
.\}
.el \{\
.    de IX
..
.\}
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "Test::utf8 3"
.TH Test::utf8 3 "2004-09-10" "perl v5.8.8" "User Contributed Perl Documentation"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
Test::utf8 \- handy utf8 tests
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 4
\&  is_valid_string($string);   # check the string is valid
\&  is_sane_utf8($string);      # check not double encoded
\&  is_flagged_utf8($string);   # has utf8 flag set
\&  is_within_latin_1($string); # but only has latin_1 chars in it
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
This module is a collection of tests that are useful when dealing
with utf8 strings in Perl.
.SS "Validity"
.IX Subsection "Validity"
These two tests check if a string is valid, and if you've probably
made a mistake with your string.
.ie n .IP "is_valid_string($string, $testname)" 4
.el .IP "is_valid_string($string, \f(CW$testname\fR)" 4
.IX Item "is_valid_string($string, $testname)"
This passes and returns true if and only if the scalar isn't an
invalid string; in short, it checks that the utf8 flag hasn't been set
for a string that isn't a valid utf8 encoding.
.ie n .IP "is_sane_utf8($string, $name)" 4
.el .IP "is_sane_utf8($string, \f(CW$name\fR)" 4
.IX Item "is_sane_utf8($string, $name)"
This test fails if the string contains something that looks like it
might be dodgy utf8, i.e. containing something that looks like the
multi-byte sequence for a latin\-1 character but perl hasn't been
instructed to treat as such.  Strings that are not utf8 always
automatically pass.
.Sp
Some examples may help:
.Sp
.Vb 2
\&  # This will pass as it\*(Aqs a normal latin\-1 string
\&  is_sane_utf8("Hello L\ex{e9}on");
\&
\&  # this will fail because the \ex{c3}\ex{a9} looks like the
\&  # utf8 byte sequence for e\-acute
\&  my $string = "Hello L\ex{c3}\ex{a9}on";
\&  is_sane_utf8($string);
\&
\&  # this will pass because the utf8 is correctly interpreted as utf8
\&  Encode::_utf8_on($string);
\&  is_sane_utf8($string);
.Ve
.Sp
Obviously this isn't a hundred percent reliable.  The edge case where
this will fail is where you have \f(CW\*(C`\ex{c2}\*(C'\fR (which is \*(L"\s-1LATIN\s0 \s-1CAPITAL\s0
\&\s-1LETTER\s0 A \s-1WITH\s0 \s-1CIRCUMFLEX\s0\*(R") or \f(CW\*(C`\ex{c3}\*(C'\fR (which is \*(L"\s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 A
\&\s-1WITH\s0 \s-1TILDE\s0\*(R") followed by one of the latin\-1 punctuation symbols.
.Sp
.Vb 4
\&  # a capital letter A with circumflex surrounded by smart quotes
\&  # this will fail because it\*(Aqll see the "\ex{c2}\ex{94}" and think
\&  # it\*(Aqs actually the utf8 sequence for the end smart quote
\&  is_sane_utf8("\ex{93}\ex{c2}\ex{94}");
.Ve
.Sp
However, since this hardly comes up this test is reasonably reliable
in most cases.  Still, care should be applied in cases where dynamic
data is placed next to latin\-1 punctuation to avoid false negatives.
.Sp
There exist two situations that cause this test to fail: the string
contains utf8 byte sequences and the string hasn't been flagged as
utf8 (this normally means that you got it from an external source like
a C library; when Perl needs to store a string internally as utf8 it
does its own encoding and flagging transparently), or a utf8 flagged
string contains byte sequences that when translated to characters
themselves look like a utf8 byte sequence.  The test diagnostics tell
you which is the case.
.SS "Checking the Range of Characters in a String"
.IX Subsection "Checking the Range of Characters in a String"
These routines allow you to check the range of characters in a string.
Note that these routines are blind to the actual encoding perl
internally uses to store the characters, they just check if the
string contains only characters that can be represented in the named
encoding.
.IP "is_within_ascii" 4
.IX Item "is_within_ascii"
Tests that a string only contains characters that are in the \s-1ASCII\s0
character set.
.IP "is_within_latin_1" 4
.IX Item "is_within_latin_1"
Tests that a string only contains characters that are in latin\-1.
.SS "Simple utf8 Flag Tests"
.IX Subsection "Simple utf8 Flag Tests"
Simply check if a scalar is or isn't flagged as utf8 by perl's
internals.
.ie n .IP "is_flagged_utf8($string, $name)" 4
.el .IP "is_flagged_utf8($string, \f(CW$name\fR)" 4
.IX Item "is_flagged_utf8($string, $name)"
Passes if the string is flagged by perl's internals as utf8, fails if
it's not.
.IP "isnt_flagged_utf8($string,$name)" 4
.IX Item "isnt_flagged_utf8($string,$name)"
The opposite of \f(CW\*(C`is_flagged_utf8\*(C'\fR, passes if and only if the string
isn't flagged as utf8 by perl's internals.
.Sp
Note: you can refer to this function as \f(CW\*(C`isn\*(Aqt_flagged_utf8\*(C'\fR if you
really want to.
.SH "AUTHOR"
.IX Header "AUTHOR"
.Vb 1
\&  Copyright Mark Fowler 2004.  All rights reserved.
\&
\&  This program is free software; you can redistribute it
\&  and/or modify it under the same terms as Perl itself.
.Ve
.SH "BUGS"
.IX Header "BUGS"
None known.  Please report any to me via the \s-1CPAN\s0 \s-1RT\s0 system.  See
http://rt.cpan.org/ for more details.
.SH "SEE ALSO"
.IX Header "SEE ALSO"
Test::DoubleEncodedEntities for testing for double encoded \s-1HTML\s0
entities.
Commit	Line	Data
3fea05b9	1	.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.10)
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sp \" Vertical space (when we can't use .PP)
	6	.if t .sp .5v
	7	.if n .sp
	8	..
	9	.de Vb \" Begin verbatim text
	10	.ft CW
	11	.nf
	12	.ne \\$1
	13	..
	14	.de Ve \" End verbatim text
	15	.ft R
	16	.fi
	17	..
	18	.\" Set up some character translations and predefined strings. \*(-- will
	19	.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
	20	.\" double quote, and \*(R" will give a right double quote. \*(C+ will
	21	.\" give a nicer C++. Capital omega is used to do unbreakable dashes and
	22	.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff,
	23	.\" nothing in troff, for use with C<>.
	24	.tr \(*W-
	25	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	26	.ie n \{\
	27	. ds -- \(*W-
	28	. ds PI pi
	29	. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
	30	. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
	31	. ds L" ""
	32	. ds R" ""
	33	. ds C` ""
	34	. ds C' ""
	35	'br\}
	36	.el\{\
	37	. ds -- \|\(em\|
	38	. ds PI \(*p
	39	. ds L" ``
	40	. ds R" ''
	41	'br\}
	42	.\"
	43	.\" Escape single quotes in literal strings from groff's Unicode transform.
	44	.ie \n(.g .ds Aq \(aq
	45	.el .ds Aq '
	46	.\"
	47	.\" If the F register is turned on, we'll generate index entries on stderr for
	48	.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
	49	.\" entries marked with X<> in POD. Of course, you'll have to process the
	50	.\" output yourself in some meaningful fashion.
	51	.ie \nF \{\
	52	. de IX
	53	. tm Index:\\$1\t\\n%\t"\\$2"
	54	..
	55	. nr % 0
	56	. rr F
	57	.\}
	58	.el \{\
	59	. de IX
	60	..
	61	.\}
	62	.\"
	63	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
	64	.\" Fear. Run. Save yourself. No user-serviceable parts.
65	. \" fudge factors for nroff and troff
66	.if n \{\
67	. ds #H 0
68	. ds #V .8m
69	. ds #F .3m
70	. ds #[ \f1
71	. ds #] \fP
72	.\}
73	.if t \{\
74	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
75	. ds #V .6m
76	. ds #F 0
77	. ds #[ \&
78	. ds #] \&
79	.\}
80	. \" simple accents for nroff and troff
81	.if n \{\
82	. ds ' \&
83	. ds ` \&
84	. ds ^ \&
85	. ds , \&
86	. ds ~ ~
87	. ds /
88	.\}
89	.if t \{\
90	. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
91	. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
92	. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
93	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
94	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
95	. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
96	.\}
97	. \" troff and (daisy-wheel) nroff accents
98	.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
99	.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
100	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
101	.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
102	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
103	.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
104	.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
105	.ds ae a\h'-(\w'a'u*4/10)'e
106	.ds Ae A\h'-(\w'A'u*4/10)'E
107	. \" corrections for vroff
108	.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
109	.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
110	. \" for low resolution devices (crt and lpr)
111	.if \n(.H>23 .if \n(.V>19 \
112	\{\
113	. ds : e
114	. ds 8 ss
115	. ds o a
116	. ds d- d\h'-1'\(ga
117	. ds D- D\h'-1'\(hy
118	. ds th \o'bp'
119	. ds Th \o'LP'
120	. ds ae ae
121	. ds Ae AE
122	.\}
123	.rm #[ #] #H #V #F C
124	.\" ========================================================================
125	.\"
126	.IX Title "Test::utf8 3"
127	.TH Test::utf8 3 "2004-09-10" "perl v5.8.8" "User Contributed Perl Documentation"
128	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
129	.\" way too many mistakes in technical documents.
130	.if n .ad l
131	.nh
132	.SH "NAME"
133	Test::utf8 \- handy utf8 tests
134	.SH "SYNOPSIS"
135	.IX Header "SYNOPSIS"
136	.Vb 4
137	\& is_valid_string($string); # check the string is valid
138	\& is_sane_utf8($string); # check not double encoded
139	\& is_flagged_utf8($string); # has utf8 flag set
140	\& is_within_latin_1($string); # but only has latin_1 chars in it
141	.Ve
142	.SH "DESCRIPTION"
143	.IX Header "DESCRIPTION"
	144	This module is a collection of tests that are useful when dealing
	145	with utf8 strings in Perl.
146	.SS "Validity"
147	.IX Subsection "Validity"
	148	These two tests check if a string is valid, and if you've probably
	149	made a mistake with your string.
150	.ie n .IP "is_valid_string($string, $testname)" 4
151	.el .IP "is_valid_string($string, \f(CW$testname\fR)" 4
152	.IX Item "is_valid_string($string, $testname)"
	153	This passes and returns true if and only if the scalar isn't an
	154	invalid string; in short, it checks that the utf8 flag hasn't been set
	155	for a string that isn't a valid utf8 encoding.
156	.ie n .IP "is_sane_utf8($string, $name)" 4
157	.el .IP "is_sane_utf8($string, \f(CW$name\fR)" 4
158	.IX Item "is_sane_utf8($string, $name)"
159	This test fails if the string contains something that looks like it
160	might be dodgy utf8, i.e. containing something that looks like the
161	multi-byte sequence for a latin\-1 character but perl hasn't been
162	instructed to treat as such. Strings that are not utf8 always
163	automatically pass.
164	.Sp
165	Some examples may help:
166	.Sp
167	.Vb 2
168	\& # This will pass as it\*(Aqs a normal latin\-1 string
	169	\& is_sane_utf8("Hello L\ex{e9}on");
170	\&
171	\& # this will fail because the \ex{c3}\ex{a9} looks like the
172	\& # utf8 byte sequence for e\-acute
173	\& my $string = "Hello L\ex{c3}\ex{a9}on";
174	\& is_sane_utf8($string);
175	\&
176	\& # this will pass because the utf8 is correctly interpreted as utf8
	177	\& Encode::_utf8_on($string);
178	\& is_sane_utf8($string);
179	.Ve
180	.Sp
181	Obviously this isn't a hundred percent reliable. The edge case where
	182	this will fail is where you have \f(CW\*(C`\ex{c2}\*(C'\fR (which is \*(L"\s-1LATIN\s0 \s-1CAPITAL\s0
	183	\&\s-1LETTER\s0 A \s-1WITH\s0 \s-1CIRCUMFLEX\s0\*(R") or \f(CW\*(C`\ex{c3}\*(C'\fR (which is \*(L"\s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 A
	184	\&\s-1WITH\s0 \s-1TILDE\s0\*(R") followed by one of the latin\-1 punctuation symbols.
185	.Sp
186	.Vb 4
	187	\& # a capital letter A with circumflex surrounded by smart quotes
188	\& # this will fail because it\*(Aqll see the "\ex{c2}\ex{94}" and think
189	\& # it\*(Aqs actually the utf8 sequence for the end smart quote
190	\& is_sane_utf8("\ex{93}\ex{c2}\ex{94}");
191	.Ve
192	.Sp
193	However, since this hardly comes up this test is reasonably reliable
194	in most cases. Still, care should be applied in cases where dynamic
195	data is placed next to latin\-1 punctuation to avoid false negatives.
196	.Sp
	197	There exist two situations that cause this test to fail: the string
	198	contains utf8 byte sequences and the string hasn't been flagged as
	199	utf8 (this normally means that you got it from an external source like
	200	a C library; when Perl needs to store a string internally as utf8 it
	201	does its own encoding and flagging transparently), or a utf8 flagged
	202	string contains byte sequences that when translated to characters
	203	themselves look like a utf8 byte sequence. The test diagnostics tell
	204	you which is the case.
205	.SS "Checking the Range of Characters in a String"
206	.IX Subsection "Checking the Range of Characters in a String"
207	These routines allow you to check the range of characters in a string.
208	Note that these routines are blind to the actual encoding perl
209	internally uses to store the characters, they just check if the
210	string contains only characters that can be represented in the named
211	encoding.
212	.IP "is_within_ascii" 4
213	.IX Item "is_within_ascii"
214	Tests that a string only contains characters that are in the \s-1ASCII\s0
	215	character set.
216	.IP "is_within_latin_1" 4
217	.IX Item "is_within_latin_1"
218	Tests that a string only contains characters that are in latin\-1.
219	.SS "Simple utf8 Flag Tests"
220	.IX Subsection "Simple utf8 Flag Tests"
221	Simply check if a scalar is or isn't flagged as utf8 by perl's
222	internals.
223	.ie n .IP "is_flagged_utf8($string, $name)" 4
224	.el .IP "is_flagged_utf8($string, \f(CW$name\fR)" 4
225	.IX Item "is_flagged_utf8($string, $name)"
226	Passes if the string is flagged by perl's internals as utf8, fails if
227	it's not.
228	.IP "isnt_flagged_utf8($string,$name)" 4
229	.IX Item "isnt_flagged_utf8($string,$name)"
	230	The opposite of \f(CW\*(C`is_flagged_utf8\*(C'\fR, passes if and only if the string
231	isn't flagged as utf8 by perl's internals.
232	.Sp
	233	Note: you can refer to this function as \f(CW\*(C`isn\*(Aqt_flagged_utf8\*(C'\fR if you
234	really want to.
235	.SH "AUTHOR"
236	.IX Header "AUTHOR"
237	.Vb 1
238	\& Copyright Mark Fowler 2004. All rights reserved.
239	\&
240	\& This program is free software; you can redistribute it
241	\& and/or modify it under the same terms as Perl itself.
242	.Ve
243	.SH "BUGS"
244	.IX Header "BUGS"
245	None known. Please report any to me via the \s-1CPAN\s0 \s-1RT\s0 system. See
246	http://rt.cpan.org/ for more details.
247	.SH "SEE ALSO"
248	.IX Header "SEE ALSO"
249	Test::DoubleEncodedEntities for testing for double encoded \s-1HTML\s0
250	entities.