Add built local::lib
[catagits/Gitalist.git] / local-lib5 / man / man3 / XML::LibXML::Parser.3pm
CommitLineData
3fea05b9 1.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.10)
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sp \" Vertical space (when we can't use .PP)
6.if t .sp .5v
7.if n .sp
8..
9.de Vb \" Begin verbatim text
10.ft CW
11.nf
12.ne \\$1
13..
14.de Ve \" End verbatim text
15.ft R
16.fi
17..
18.\" Set up some character translations and predefined strings. \*(-- will
19.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
20.\" double quote, and \*(R" will give a right double quote. \*(C+ will
21.\" give a nicer C++. Capital omega is used to do unbreakable dashes and
22.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff,
23.\" nothing in troff, for use with C<>.
24.tr \(*W-
25.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
26.ie n \{\
27. ds -- \(*W-
28. ds PI pi
29. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
30. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
31. ds L" ""
32. ds R" ""
33. ds C` ""
34. ds C' ""
35'br\}
36.el\{\
37. ds -- \|\(em\|
38. ds PI \(*p
39. ds L" ``
40. ds R" ''
41'br\}
42.\"
43.\" Escape single quotes in literal strings from groff's Unicode transform.
44.ie \n(.g .ds Aq \(aq
45.el .ds Aq '
46.\"
47.\" If the F register is turned on, we'll generate index entries on stderr for
48.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
49.\" entries marked with X<> in POD. Of course, you'll have to process the
50.\" output yourself in some meaningful fashion.
51.ie \nF \{\
52. de IX
53. tm Index:\\$1\t\\n%\t"\\$2"
54..
55. nr % 0
56. rr F
57.\}
58.el \{\
59. de IX
60..
61.\}
62.\"
63.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
64.\" Fear. Run. Save yourself. No user-serviceable parts.
65. \" fudge factors for nroff and troff
66.if n \{\
67. ds #H 0
68. ds #V .8m
69. ds #F .3m
70. ds #[ \f1
71. ds #] \fP
72.\}
73.if t \{\
74. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
75. ds #V .6m
76. ds #F 0
77. ds #[ \&
78. ds #] \&
79.\}
80. \" simple accents for nroff and troff
81.if n \{\
82. ds ' \&
83. ds ` \&
84. ds ^ \&
85. ds , \&
86. ds ~ ~
87. ds /
88.\}
89.if t \{\
90. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
91. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
92. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
93. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
94. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
95. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
96.\}
97. \" troff and (daisy-wheel) nroff accents
98.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
99.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
100.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
101.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
102.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
103.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
104.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
105.ds ae a\h'-(\w'a'u*4/10)'e
106.ds Ae A\h'-(\w'A'u*4/10)'E
107. \" corrections for vroff
108.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
109.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
110. \" for low resolution devices (crt and lpr)
111.if \n(.H>23 .if \n(.V>19 \
112\{\
113. ds : e
114. ds 8 ss
115. ds o a
116. ds d- d\h'-1'\(ga
117. ds D- D\h'-1'\(hy
118. ds th \o'bp'
119. ds Th \o'LP'
120. ds ae ae
121. ds Ae AE
122.\}
123.rm #[ #] #H #V #F C
124.\" ========================================================================
125.\"
126.IX Title "XML::LibXML::Parser 3"
127.TH XML::LibXML::Parser 3 "2009-10-07" "perl v5.8.7" "User Contributed Perl Documentation"
128.\" For nroff, turn off justification. Always turn off hyphenation; it makes
129.\" way too many mistakes in technical documents.
130.if n .ad l
131.nh
132.SH "NAME"
133XML::LibXML::Parser \- Parsing XML Data with XML::LibXML
134.SH "SYNOPSIS"
135.IX Header "SYNOPSIS"
136.Vb 1
137\& use XML::LibXML 1.70;
138\&
139\& # Parser constructor
140\&
141\& $parser = XML::LibXML\->new();
142\& $parser = XML::LibXML\->new(option=>value, ...);
143\& $parser = XML::LibXML\->new({option=>value, ...});
144\&
145\& # Parsing XML
146\&
147\& $dom = XML::LibXML\->load_xml(
148\& location => $file_or_url
149\& # parser options ...
150\& );
151\& $dom = XML::LibXML\->load_xml(
152\& string => $xml_string
153\& # parser options ...
154\& );
155\& $dom = XML::LibXML\->load_xml({
156\& IO => $perl_file_handle
157\& # parser options ...
158\& });
159\& $dom = $parser\->load_xml(...);
160\&
161\& # Parsing HTML
162\&
163\& $dom = XML::LibXML\->load_html(...);
164\& $dom = $parser\->load_html(...);
165\&
166\& # Parsing well\-balanced XML chunks
167\&
168\& $fragment = $parser\->parse_balanced_chunk( $wbxmlstring, $encoding );
169\&
170\& # Processing XInclude
171\&
172\& $parser\->process_xincludes( $doc );
173\& $parser\->processXIncludes( $doc );
174\&
175\& # Old\-style parser interfaces
176\&
177\& $doc = $parser\->parse_file( $xmlfilename );
178\& $doc = $parser\->parse_fh( $io_fh );
179\& $doc = $parser\->parse_string( $xmlstring);
180\& $doc = $parser\->parse_html_file( $htmlfile, \e%opts );
181\& $doc = $parser\->parse_html_fh( $io_fh, \e%opts );
182\& $doc = $parser\->parse_html_string( $htmlstring, \e%opts );
183\&
184\& # Push parser
185\&
186\& $parser\->parse_chunk($string, $terminate);
187\& $parser\->init_push();
188\& $parser\->push(@data);
189\& $doc = $parser\->finish_push( $recover );
190\&
191\& # Set/query parser options
192\&
193\& $parser\->option_exists($name);
194\& $parser\->get_option($name);
195\& $parser\->set_option($name,$value);
196\& $parser\->set_options({$name=>$value,...});
197\&
198\& # XML catalogs
199\&
200\& $parser\->load_catalog( $catalog_file );
201.Ve
202.SH "PARSING"
203.IX Header "PARSING"
204A \s-1XML\s0 document is read into a data structure such as a \s-1DOM\s0 tree by a piece of
205software, called a parser. XML::LibXML currently provides four different parser
206interfaces:
207.IP "\(bu" 4
208A \s-1DOM\s0 Pull-Parser
209.IP "\(bu" 4
210A \s-1DOM\s0 Push-Parser
211.IP "\(bu" 4
212A \s-1SAX\s0 Parser
213.IP "\(bu" 4
214A \s-1DOM\s0 based \s-1SAX\s0 Parser.
215.SS "Creating a Parser Instance"
216.IX Subsection "Creating a Parser Instance"
217XML::LibXML provides an \s-1OO\s0 interface to the libxml2 parser functions. Thus you
218have to create a parser instance before you can parse any \s-1XML\s0 data.
219.IP "new" 4
220.IX Item "new"
221.Vb 3
222\& $parser = XML::LibXML\->new();
223\& $parser = XML::LibXML\->new(option=>value, ...);
224\& $parser = XML::LibXML\->new({option=>value, ...});
225.Ve
226.Sp
227Create a new \s-1XML\s0 and \s-1HTML\s0 parser instance. Each parser instance holds default
228values for various parser options. Optionally, one can pass a hash reference or
229a list of option => value pairs to set a different default set of options.
230Unless specified otherwise, the options \f(CW\*(C`load_ext_dtd\*(C'\fR, \f(CW\*(C`expand_entities\*(C'\fR, and \f(CW\*(C`huge\*(C'\fR are set to 1. See \*(L"Parser Options\*(R" for a list of libxml2 parser's options.
231.SS "\s-1DOM\s0 Parser"
232.IX Subsection "DOM Parser"
233One of the common parser interfaces of XML::LibXML is the \s-1DOM\s0 parser. This
234parser reads \s-1XML\s0 data into a \s-1DOM\s0 like data structure, so each tag can get
235accessed and transformed.
236.PP
237XML::LibXML's \s-1DOM\s0 parser is not only capable to parse \s-1XML\s0 data, but also
238(strict) \s-1HTML\s0 files. There are three ways to parse documents \- as a string, as
239a Perl filehandle, or as a filename/URL. The return value from each is a XML::LibXML::Document object, which is a \s-1DOM\s0 object.
240.PP
241All of the functions listed below will throw an exception if the document is
242invalid. To prevent this from causing your program to exit, wrap the call in an
243eval{} block.
244.IP "load_xml" 4
245.IX Item "load_xml"
246.Vb 10
247\& $dom = XML::LibXML\->load_xml(
248\& location => $file_or_url
249\& # parser options ...
250\& );
251\& $dom = XML::LibXML\->load_xml(
252\& string => $xml_string
253\& # parser options ...
254\& );
255\& $dom = XML::LibXML\->load_xml({
256\& IO => $perl_file_handle
257\& # parser options ...
258\& });
259\& $dom = $parser\->load_xml(...);
260.Ve
261.Sp
262This function is available since XML::LibXML 1.70. It provides easy to use
263interface to the \s-1XML\s0 parser that parses given file (or \s-1URL\s0), string, or input
264stream to a \s-1DOM\s0 tree. The arguments can be passed in a \s-1HASH\s0 reference or as
265name => value pairs. The function can be called as a class method or an object
266method. In both cases it internally creates a new parser instance passing the
267specified parser options; if called as an object method, it clones the original
268parser (preserving its settings) and additionally applies the specified options
269to the new parser. See the constructor \f(CW\*(C`new\*(C'\fR and \*(L"Parser Options\*(R" for more information.
270.IP "load_html" 4
271.IX Item "load_html"
272.Vb 2
273\& $dom = XML::LibXML\->load_html(...);
274\& $dom = $parser\->load_html(...);
275.Ve
276.Sp
277This function is available since XML::LibXML 1.70. It has the same usage as \f(CW\*(C`load_xml\*(C'\fR, providing interface to the \s-1HTML\s0 parser. See \f(CW\*(C`load_xml\*(C'\fR for more information.
278.Sp
279Parsing \s-1HTML\s0 may cause problems, especially if the ampersand ('&') is used.
280This is a common problem if \s-1HTML\s0 code is parsed that contains links to
281CGI-scripts. Such links cause the parser to throw errors. In such cases libxml2
282still parses the entire document as if there was no error, but the error causes
283XML::LibXML to stop the parsing process. However, the document is not lost.
284Such \s-1HTML\s0 documents should be parsed using the \fIrecover\fR flag. By default recovering is deactivated.
285.Sp
286The functions described above are implemented to parse well formed documents.
287In some cases a program gets well balanced \s-1XML\s0 instead of well formed documents
288(e.g. a \s-1XML\s0 fragment from a Database). With XML::LibXML it is not required to
289wrap such fragments in the code, because XML::LibXML is capable even to parse
290well balanced \s-1XML\s0 fragments.
291.RS 4
292.IP "parse_balanced_chunk" 4
293.IX Item "parse_balanced_chunk"
294.Vb 1
295\& $fragment = $parser\->parse_balanced_chunk( $wbxmlstring, $encoding );
296.Ve
297.Sp
298This function parses a well balanced \s-1XML\s0 string into a XML::LibXML::DocumentFragment. The first argument contains the input string, the optional second argument
299can be used to specify character encoding of the input (\s-1UTF\-8\s0 is assumed by
300default).
301.IP "parse_xml_chunk" 4
302.IX Item "parse_xml_chunk"
303This is the old name of \fIparse_balanced_chunk()\fR. Because it may cause confusion
304with the push parser interface, this function should not be used anymore.
305.RE
306.RS 4
307.Sp
308By default XML::LibXML does not process XInclude tags within a \s-1XML\s0 Document
309(see options section below). XML::LibXML allows to post process a document to
310expand XInclude tags.
311.IP "process_xincludes" 4
312.IX Item "process_xincludes"
313.Vb 1
314\& $parser\->process_xincludes( $doc );
315.Ve
316.Sp
317After a document is parsed into a \s-1DOM\s0 structure, you may want to expand the
318documents XInclude tags. This function processes the given document structure
319and expands all XInclude tags (or throws an error) by using the flags and
320callbacks of the given parser instance.
321.Sp
322Note that the resulting Tree contains some extra nodes (of type
323\&\s-1XML_XINCLUDE_START\s0 and \s-1XML_XINCLUDE_END\s0) after successfully processing the
324document. These nodes indicate where data was included into the original tree.
325If the document is serialized, these extra nodes will not show up.
326.Sp
327Remember: A Document with processed XIncludes differs from the original
328document after serialization, because the original XInclude tags will not get
329restored!
330.Sp
331If the parser flag \*(L"expand_xinclude\*(R" is set to 1, you do not need to post-process
332the parsed document.
333.IP "processXIncludes" 4
334.IX Item "processXIncludes"
335.Vb 1
336\& $parser\->processXIncludes( $doc );
337.Ve
338.Sp
339This is an alias to process_xincludes, but through a \s-1JAVA\s0 like function name.
340.IP "parse_file" 4
341.IX Item "parse_file"
342.Vb 1
343\& $doc = $parser\->parse_file( $xmlfilename );
344.Ve
345.Sp
346This function parses an \s-1XML\s0 document from a file or network; \f(CW$xmlfilename\fR can
347be either a filename or an \s-1URL\s0. Note that for parsing files, this function is
348the fastest choice, about 6\-8 times faster than \fIparse_fh()\fR.
349.IP "parse_fh" 4
350.IX Item "parse_fh"
351.Vb 1
352\& $doc = $parser\->parse_fh( $io_fh );
353.Ve
354.Sp
355\&\fIparse_fh()\fR parses a \s-1IOREF\s0 or a subclass of IO::Handle.
356.Sp
357Because the data comes from an open handle, libxml2's parser does not know
358about the base \s-1URI\s0 of the document. To set the base \s-1URI\s0 one should use
359\&\fIparse_fh()\fR as follows:
360.Sp
361.Vb 1
362\& my $doc = $parser\->parse_fh( $io_fh, $baseuri );
363.Ve
364.IP "parse_string" 4
365.IX Item "parse_string"
366.Vb 1
367\& $doc = $parser\->parse_string( $xmlstring);
368.Ve
369.Sp
370This function is similar to \fIparse_fh()\fR, but it parses a \s-1XML\s0 document that is
371available as a single string in memory. Again, you can pass an optional base
372\&\s-1URI\s0 to the function.
373.Sp
374.Vb 1
375\& my $doc = $parser\->parse_string( $xmlstring, $baseuri );
376.Ve
377.IP "parse_html_file" 4
378.IX Item "parse_html_file"
379.Vb 1
380\& $doc = $parser\->parse_html_file( $htmlfile, \e%opts );
381.Ve
382.Sp
383Similar to \fIparse_file()\fR but parses \s-1HTML\s0 (strict) documents; \f(CW$htmlfile\fR can be
384filename or \s-1URL\s0.
385.Sp
386An optional second argument can be used to pass some options to the \s-1HTML\s0 parser
387as a \s-1HASH\s0 reference. See options labeled with \s-1HTML\s0 in \*(L"Parser Options\*(R".
388.IP "parse_html_fh" 4
389.IX Item "parse_html_fh"
390.Vb 1
391\& $doc = $parser\->parse_html_fh( $io_fh, \e%opts );
392.Ve
393.Sp
394Similar to \fIparse_fh()\fR but parses \s-1HTML\s0 (strict) streams.
395.Sp
396An optional second argument can be used to pass some options to the \s-1HTML\s0 parser
397as a \s-1HASH\s0 reference. See options labeled with \s-1HTML\s0 in \*(L"Parser Options\*(R".
398.Sp
399Note: encoding option may not work correctly with this function in libxml2 <
4002.6.27 if the \s-1HTML\s0 file declares charset using a \s-1META\s0 tag.
401.IP "parse_html_string" 4
402.IX Item "parse_html_string"
403.Vb 1
404\& $doc = $parser\->parse_html_string( $htmlstring, \e%opts );
405.Ve
406.Sp
407Similar to \fIparse_string()\fR but parses \s-1HTML\s0 (strict) strings.
408.Sp
409An optional second argument can be used to pass some options to the \s-1HTML\s0 parser
410as a \s-1HASH\s0 reference. See options labeled with \s-1HTML\s0 in \*(L"Parser Options\*(R".
411.RE
412.RS 4
413.RE
414.SS "Push Parser"
415.IX Subsection "Push Parser"
416XML::LibXML provides a push parser interface. Rather than pulling the data from
417a given source the push parser waits for the data to be pushed into it.
418.PP
419This allows one to parse large documents without waiting for the parser to
420finish. The interface is especially useful if a program needs to pre-process
421the incoming pieces of \s-1XML\s0 (e.g. to detect document boundaries).
422.PP
423While XML::LibXML parse_*() functions force the data to be a well-formed \s-1XML\s0,
424the push parser will take any arbitrary string that contains some \s-1XML\s0 data. The
425only requirement is that all the pushed strings are together a well formed
426document. With the push parser interface a program can interrupt the parsing
427process as required, where the parse_*() functions give not enough flexibility.
428.PP
429Different to the pull parser implemented in \fIparse_fh()\fR or \fIparse_file()\fR, the
430push parser is not able to find out about the documents end itself. Thus the
431calling program needs to indicate explicitly when the parsing is done.
432.PP
433In XML::LibXML this is done by a single function:
434.IP "parse_chunk" 4
435.IX Item "parse_chunk"
436.Vb 1
437\& $parser\->parse_chunk($string, $terminate);
438.Ve
439.Sp
440\&\fIparse_chunk()\fR tries to parse a given chunk of data, which isn't necessarily
441well balanced data. The function takes two parameters: The chunk of data as a
442string and optionally a termination flag. If the termination flag is set to a
443true value (e.g. 1), the parsing will be stopped and the resulting document
444will be returned as the following example describes:
445.Sp
446.Vb 5
447\& my $parser = XML::LibXML\->new;
448\& for my $string ( "<", "foo", \*(Aq bar="hello world"\*(Aq, "/>") {
449\& $parser\->parse_chunk( $string );
450\& }
451\& my $doc = $parser\->parse_chunk("", 1); # terminate the parsing
452.Ve
453.PP
454Internally XML::LibXML provides three functions that control the push parser
455process:
456.IP "init_push" 4
457.IX Item "init_push"
458.Vb 1
459\& $parser\->init_push();
460.Ve
461.Sp
462Initializes the push parser.
463.IP "push" 4
464.IX Item "push"
465.Vb 1
466\& $parser\->push(@data);
467.Ve
468.Sp
469This function pushes the data stored inside the array to libxml2's parser. Each
470entry in \f(CW@data\fR must be a normal scalar! This method can be called repeatedly.
471.IP "finish_push" 4
472.IX Item "finish_push"
473.Vb 1
474\& $doc = $parser\->finish_push( $recover );
475.Ve
476.Sp
477This function returns the result of the parsing process. If this function is
478called without a parameter it will complain about non well-formed documents. If
479\&\f(CW$recover\fR is 1, the push parser can be used to restore broken or non well formed
480(\s-1XML\s0) documents as the following example shows:
481.Sp
482.Vb 7
483\& eval {
484\& $parser\->push( "<foo>", "bar" );
485\& $doc = $parser\->finish_push(); # will report broken XML
486\& };
487\& if ( $@ ) {
488\& # ...
489\& }
490.Ve
491.Sp
492This can be annoying if the closing tag is missed by accident. The following
493code will restore the document:
494.Sp
495.Vb 5
496\& eval {
497\& $parser\->push( "<foo>", "bar" );
498\& $doc = $parser\->finish_push(1); # will return the data parsed
499\& # unless an error happened
500\& };
501\&
502\& print $doc\->toString(); # returns "<foo>bar</foo>"
503.Ve
504.Sp
505Of course \fIfinish_push()\fR will return nothing if there was no data pushed to the
506parser before.
507.SS "Pull Parser (Reader)"
508.IX Subsection "Pull Parser (Reader)"
509XML::LibXML also provides a pull-parser interface similar to the XmlReader
510interface in .NET. This interface is almost streaming, and is usually faster
511and simpler to use than \s-1SAX\s0. See XML::LibXML::Reader.
512.SS "Direct \s-1SAX\s0 Parser"
513.IX Subsection "Direct SAX Parser"
514XML::LibXML provides a direct \s-1SAX\s0 parser in the XML::LibXML::SAX module.
515.SS "\s-1DOM\s0 based \s-1SAX\s0 Parser"
516.IX Subsection "DOM based SAX Parser"
517XML::LibXML also provides a \s-1DOM\s0 based \s-1SAX\s0 parser. The \s-1SAX\s0 parser is defined in
518the module XML::LibXML::SAX::Parser. As it is not a stream based parser, it
519parses documents into a \s-1DOM\s0 and traverses the \s-1DOM\s0 tree instead.
520.PP
521The \s-1API\s0 of this parser is exactly the same as any other Perl \s-1SAX2\s0 parser. See
522XML::SAX::Intro for details.
523.PP
524Aside from the regular parsing methods, you can access the \s-1DOM\s0 tree traverser
525directly, using the \fIgenerate()\fR method:
526.PP
527.Vb 3
528\& my $doc = build_yourself_a_document();
529\& my $saxparser = XML::LibXML::SAX::Parser\->new( ... );
530\& $saxparser\->generate( $doc );
531.Ve
532.PP
533This is useful for serializing \s-1DOM\s0 trees, for example that you might have done
534prior processing on, or that you have as a result of \s-1XSLT\s0 processing.
535.PP
536\&\fI\s-1WARNING\s0\fR
537.PP
538This is \s-1NOT\s0 a streaming \s-1SAX\s0 parser. As I said above, this parser reads the
539entire document into a \s-1DOM\s0 and serialises it. Some people couldn't read that in
540the paragraph above so I've added this warning. If you want a streaming \s-1SAX\s0
541parser look at the XML::LibXML::SAX man page
542.SH "SERIALIZATION"
543.IX Header "SERIALIZATION"
544XML::LibXML provides some functions to serialize nodes and documents. The
545serialization functions are described on the XML::LibXML::Node manpage or the XML::LibXML::Document manpage. XML::LibXML checks three global flags that alter the serialization
546process:
547.IP "\(bu" 4
548skipXMLDeclaration
549.IP "\(bu" 4
550skipDTD
551.IP "\(bu" 4
552setTagCompression
553.PP
554Of these three flags, only setTagCompression is available for all
555serialization functions.
556.PP
557Because XML::LibXML does not set these flags itself, one has to define them locally
558as the following example shows:
559.PP
560.Vb 3
561\& local $XML::LibXML::skipXMLDeclaration = 1;
562\& local $XML::LibXML::skipDTD = 1;
563\& local $XML::LibXML::setTagCompression = 1;
564.Ve
565.PP
566If skipXMLDeclaration is defined and not '0', the \s-1XML\s0 declaration is omitted
567during serialization.
568.PP
569If skipDTD is defined and not '0', an existing \s-1DTD\s0 would not be serialized with
570the document.
571.PP
572If setTagCompression is defined and not '0' empty tags are displayed as open
573and closing tags rather than the shortcut. For example the empty tag \fIfoo\fR will be rendered as \fI<foo></foo>\fR rather than \fI<foo/>\fR.
574.SH "PARSER OPTIONS"
575.IX Header "PARSER OPTIONS"
576Handling of libxml2 parser options has been unified and improved in XML::LibXML
5771.70. You can now set default options for a particular parser instance by
578passing them to the constructor as \f(CW\*(C`XML::LibXML\->new({name=>value, ...})\*(C'\fR or \f(CW\*(C`XML::LibXML\->new(name=>value,...)\*(C'\fR. The options can be queried and changed using the following methods (pre\-1.70
579interfaces such as \f(CW\*(C`$parser\->load_ext_dtd(0)\*(C'\fR also exist, see below):
580.IP "option_exists" 4
581.IX Item "option_exists"
582.Vb 1
583\& $parser\->option_exists($name);
584.Ve
585.Sp
586Returns 1 if the current XML::LibXML version supports the option \f(CW$name\fR, otherwise returns 0 (note that this does not necessarily mean that the option
587is supported by the underlying libxml2 library).
588.IP "get_option" 4
589.IX Item "get_option"
590.Vb 1
591\& $parser\->get_option($name);
592.Ve
593.Sp
594Returns the current value of the parser option \f(CW$name\fR.
595.IP "set_option" 4
596.IX Item "set_option"
597.Vb 1
598\& $parser\->set_option($name,$value);
599.Ve
600.Sp
601Sets option \f(CW$name\fR to value \f(CW$value\fR.
602.IP "set_options" 4
603.IX Item "set_options"
604.Vb 1
605\& $parser\->set_options({$name=>$value,...});
606.Ve
607.Sp
608Sets multiple parsing options at once.
609.PP
610\&\s-1IMPORTANT\s0 \s-1NOTE:\s0 This documentation reflects the parser flags available in
611libxml2 2.7.3. Some options have no effect if an older version of libxml2 is
612used.
613.PP
614Each of the flags listed below is labeled
615.IP "/parser/" 4
616.IX Item "/parser/"
617if it can be used with a \f(CW\*(C`XML::LibXML\*(C'\fR parser object (i.e. passed to \f(CW\*(C`XML::LibXML\->new\*(C'\fR, \f(CW\*(C`XML::LibXML\->set_option\*(C'\fR, etc.)
618.IP "/html/" 4
619.IX Item "/html/"
620if it can be passed to the \f(CW\*(C`parse_html_*\*(C'\fR methods
621.IP "/reader/" 4
622.IX Item "/reader/"
623if it can be used with the \f(CW\*(C`XML::LibXML::Reader\*(C'\fR.
624.PP
625Unless specified otherwise, the default for boolean valued options is 0
626(false).
627.PP
628The available options are:
629.IP "\s-1URI\s0" 4
630.IX Item "URI"
631/parser, html, reader/
632.Sp
633In case of parsing strings or file handles, XML::LibXML doesn't know about the
634base uri of the document. To make relative references such as XIncludes work,
635one has to set a base \s-1URI\s0, that is then used for the parsed document.
636.IP "line_numbers" 4
637.IX Item "line_numbers"
638/parser, html, reader/
639.Sp
640If this option is activated, libxml2 will store the line number of each element
641node in the parsed document. The line number can be obtained using the \f(CW\*(C`line_number()\*(C'\fR method of the \f(CW\*(C`XML::LibXML::Node\*(C'\fR class (for non-element nodes this may report the line number of the containing
642element). The line numbers are also used for reporting positions of validation
643errors.
644.Sp
645\&\s-1IMPORTANT:\s0 Due to limitations in the libxml2 library line numbers greater than
64665535 will be returned as 65535. Unfortunately, this is a long and sad story,
647please see <http://bugzilla.gnome.org/show_bug.cgi?id=325533> for more details.
648.IP "encoding" 4
649.IX Item "encoding"
650/html/
651.Sp
652character encoding of the input
653.IP "recover" 4
654.IX Item "recover"
655/parser, html, reader/
656.Sp
657recover from errors; possible values are 0, 1, and 2
658.Sp
659A true value turns on recovery mode which allows one to parse broken \s-1XML\s0 or
660\&\s-1HTML\s0 data. The recovery mode allows the parser to return the successfully
661parsed portion of the input document. This is useful for almost well-formed
662documents, where for example a closing tag is missing somewhere. Still,
663XML::LibXML will only parse until the first fatal (non-recoverable) error
664occurs, reporting recoverable parsing errors as warnings. To suppress even
665these warnings, use recover=>2.
666.Sp
667Note that validation is switched off automatically in recovery mode.
668.IP "expand_entities" 4
669.IX Item "expand_entities"
670/parser, reader/
671.Sp
672substitute entities; possible values are 0 and 1; default is 1
673.Sp
674Note that although this flag disables entity substitution, it does not prevent
675the parser from loading external entities; when substitution of an external
676entity is disabled, the entity will be represented in the document tree by a
677\&\s-1XML_ENTITY_REF_NODE\s0 node whose subtree will be the content obtained by parsing
678the external resource; Although this level of nesting is visible from the
679\&\s-1DOM\s0 it is transparent to XPath data model, so it is possible to match nodes in
680an unexpanded entity by the same XPath expression as if the entity was
681expanded. See also ext_ent_handler.
682.IP "ext_ent_handler" 4
683.IX Item "ext_ent_handler"
684/parser/
685.Sp
686Provide a custom external entity handler to be used when expand_entities is set
687to 1. Possible value is a subroutine reference.
688.Sp
689This feature does not work properly in libxml2 < 2.6.27!
690.Sp
691The subroutine provided is called whenever the parser needs to retrieve the
692content of an external entity. It is called with two arguments: the system \s-1ID\s0
693(\s-1URI\s0) and the public \s-1ID\s0. The value returned by the subroutine is parsed as the
694content of the entity.
695.Sp
696This method can be used to completely disable entity loading, e.g. to prevent
697exploits of the type described at (<http://searchsecuritychannel.techtarget.com/generic/0,295582,sid97_gci1304703,00.html>), where a service is tricked to expose its private data by letting it parse a
698remote file (\s-1RSS\s0 feed) that contains an entity reference to a local file (e.g. \f(CW\*(C`/etc/fstab\*(C'\fR).
699.Sp
700A more granular solution to this problem, however, is provided by custom \s-1URL\s0
701resolvers, as in
702.Sp
703.Vb 9
704\& my $c = XML::LibXML::InputCallback\->new();
705\& sub match { # accept file:/ URIs except for XML catalogs in /etc/xml/
706\& my ($uri) = @_;
707\& return ($uri=~m{^file:/}
708\& and $uri !~ m{^file:///etc/xml/})
709\& ? 1 : 0;
710\& }
711\& $c\->register_callbacks([ \e&match, sub{}, sub{}, sub{} ]);
712\& $parser\->input_callbacks($c);
713.Ve
714.IP "load_ext_dtd" 4
715.IX Item "load_ext_dtd"
716/parser, reader/
717.Sp
718load the external \s-1DTD\s0 subset while parsing; possible values are 0 and 1. Unless
719specified, XML::LibXML sets this option to 1.
720.Sp
721This flag is also required for \s-1DTD\s0 Validation, to provide complete attribute,
722and to expand entities, regardless if the document has an internal subset. Thus
723switching off external \s-1DTD\s0 loading, will disable entity expansion, validation,
724and complete attributes on internal subsets as well.
725.IP "complete_attributes" 4
726.IX Item "complete_attributes"
727/parser, reader/
728.Sp
729create default \s-1DTD\s0 attributes; possible values are 0 and 1
730.IP "validation" 4
731.IX Item "validation"
732/parser, reader/
733.Sp
734validate with the \s-1DTD\s0; possible values are 0 and 1
735.IP "suppress_errors" 4
736.IX Item "suppress_errors"
737/parser, html, reader/
738.Sp
739suppress error reports; possible values are 0 and 1
740.IP "suppress_warnings" 4
741.IX Item "suppress_warnings"
742/parser, html, reader/
743.Sp
744suppress warning reports; possible values are 0 and 1
745.IP "pedantic_parser" 4
746.IX Item "pedantic_parser"
747/parser, html, reader/
748.Sp
749pedantic error reporting; possible values are 0 and 1
750.IP "no_blanks" 4
751.IX Item "no_blanks"
752/parser, html, reader/
753.Sp
754remove blank nodes; possible values are 0 and 1
755.IP "expand_xinclude or xinclude" 4
756.IX Item "expand_xinclude or xinclude"
757/parser, reader/
758.Sp
759Implement XInclude substitution; possible values are 0 and 1
760.Sp
761Expands XInclude tags immediately while parsing the document. Note that the
762parser will use the \s-1URI\s0 resolvers installed via \f(CW\*(C`XML::LibXML::InputCallback\*(C'\fR to parse the included document (if any).
763.IP "no_xinclude_nodes" 4
764.IX Item "no_xinclude_nodes"
765/parser, reader/
766.Sp
767do not generate \s-1XINCLUDE\s0 \s-1START/END\s0 nodes; possible values are 0 and 1
768.IP "no_network" 4
769.IX Item "no_network"
770/parser, html, reader/
771.Sp
772Forbid network access; possible values are 0 and 1
773.Sp
774If set to true, all attempts to fetch non-local resources (such as \s-1DTD\s0 or
775external entities) will fail (unless custom callbacks are defined).
776.Sp
777It may be necessary to use the flag \f(CW\*(C`recover\*(C'\fR for processing documents requiring such resources while networking is off.
778.IP "clean_namespaces" 4
779.IX Item "clean_namespaces"
780/parser, reader/
781.Sp
782remove redundant namespaces declarations during parsing; possible values are 0
783and 1.
784.IP "no_cdata" 4
785.IX Item "no_cdata"
786/parser, html, reader/
787.Sp
788merge \s-1CDATA\s0 as text nodes; possible values are 0 and 1
789.IP "no_basefix" 4
790.IX Item "no_basefix"
791/parser, reader/
792.Sp
793do not fix up \s-1XINCLUDE\s0 xml:base URIs; possible values are 0 and 1
794.IP "huge" 4
795.IX Item "huge"
796/parser, html, reader/
797.Sp
798relax any hardcoded limit from the parser; possible values are 0 and 1. Unless
799specified, XML::LibXML sets this option to 1.
800.IP "gdome" 4
801.IX Item "gdome"
802/parser/
803.Sp
804\&\s-1THIS\s0 \s-1OPTION\s0 \s-1IS\s0 \s-1EXPERIMENTAL\s0!
805.Sp
806Although quite powerful, XML::LibXML's \s-1DOM\s0 implementation is incomplete with
807respect to the \s-1DOM\s0 level 2 or level 3 specifications. \s-1XML::GDOME\s0 is based on
808libxml2 as well and provides a rather complete \s-1DOM\s0 implementation by
809wrapping libgdome. This flag allows you to make use of XML::LibXML's full
810parser options and \s-1XML::GDOME\s0's \s-1DOM\s0 implementation at the same time.
811.Sp
812To make use of this function, one has to install libgdome and configure
813XML::LibXML to use this library. For this you need to rebuild XML::LibXML!
814.Sp
815Note: this feature was not seriously tested in recent XML::LibXML releases.
816.PP
817For compatibility with XML::LibXML versions prior to 1.70, the following
818methods are also supported for querying and setting the corresponding parser
819options (if called without arguments, the methods return the current value of
820the corresponding parser options; with an argument sets the option to a given
821value):
822.PP
823.Vb 10
824\& $parser\->validation();
825\& $parser\->recover();
826\& $parser\->pedantic_parser();
827\& $parser\->line_numbers();
828\& $parser\->load_ext_dtd();
829\& $parser\->complete_attributes();
830\& $parser\->expand_xinclude();
831\& $parser\->gdome_dom();
832\& $parser\->clean_namespaces();
833\& $parser\->no_network();
834.Ve
835.PP
836The following obsolete methods trigger parser options in some special way:
837.IP "recover_silently" 4
838.IX Item "recover_silently"
839.Vb 1
840\& $parser\->recover_silently(1);
841.Ve
842.Sp
843If called without an argument, returns true if the current value of the \f(CW\*(C`recover\*(C'\fR parser option is 2 and returns false otherwise. With a true argument sets the \f(CW\*(C`recover\*(C'\fR parser option to 2; with a false argument sets the \f(CW\*(C`recover\*(C'\fR parser option to 0.
844.IP "expand_entities" 4
845.IX Item "expand_entities"
846.Vb 1
847\& $parser\->expand_entities(0);
848.Ve
849.Sp
850Get/set the \f(CW\*(C`expand_entities\*(C'\fR option. If called with a true argument, also turns the \f(CW\*(C`load_ext_dtd\*(C'\fR option to 1.
851.IP "keep_blanks" 4
852.IX Item "keep_blanks"
853.Vb 1
854\& $parser\->keep_blanks(0);
855.Ve
856.Sp
857This is actually the opposite of the \f(CW\*(C`no_blanks\*(C'\fR parser option. If used without an argument retrieves the negated value of \f(CW\*(C`no_blanks\*(C'\fR. If used with an argument sets \f(CW\*(C`no_blanks\*(C'\fR to the opposite value.
858.IP "base_uri" 4
859.IX Item "base_uri"
860.Vb 1
861\& $parser\->base_uri( $your_base_uri );
862.Ve
863.Sp
864Get/set the \f(CW\*(C`URI\*(C'\fR option.
865.SH "XML CATALOGS"
866.IX Header "XML CATALOGS"
867\&\f(CW\*(C`libxml2\*(C'\fR supports \s-1XML\s0 catalogs. Catalogs are used to map remote resources to their local
868copies. Using catalogs can speed up parsing processes if many external
869resources from remote addresses are loaded into the parsed documents (such as
870DTDs or XIncludes).
871.PP
872Note that libxml2 has a global pool of loaded catalogs, so if you apply the
873method \f(CW\*(C`load_catalog\*(C'\fR to one parser instance, all parser instances will start using the catalog (in
874addition to other previously loaded catalogs).
875.PP
876Note also that catalogs are not used when a custom external entity handler is
877specified. At the current state it is not possible to make use of both types of
878resolving systems at the same time.
879.IP "load_catalog" 4
880.IX Item "load_catalog"
881.Vb 1
882\& $parser\->load_catalog( $catalog_file );
883.Ve
884.Sp
885Loads the \s-1XML\s0 catalog file \f(CW$catalog_file\fR.
886.SH "ERROR REPORTING"
887.IX Header "ERROR REPORTING"
888XML::LibXML throws exceptions during parsing, validation or XPath processing
889(and some other occasions). These errors can be caught by using \fIeval\fR blocks. The error is stored in \fI$@\fR. There are two implementations: the old one throws $@ which is just a message
890string; in the new one $@ is an object from the class XML::LibXML::Error. This
891class overrides the operator "" so that when printed, the object flattens to
892the usual error message.
893.PP
894XML::LibXML throws errors as they occur. This is a very common misunderstanding
895in the use of XML::LibXML. If the eval is omitted, XML::LibXML will always halt
896your script by \*(L"croaking\*(R" (see Carp man page for details).
897.PP
898Also note that an increasing number of functions throw errors if bad data is
899passed as arguments. If you cannot ensure that valid data is passed to XML::LibXML, you
900should wrap calls to these functions in an eval block.
901.PP
902Note: since version 1.59, \fIget_last_error()\fR is no longer available in
903XML::LibXML for thread-safety reasons.
904.SH "AUTHORS"
905.IX Header "AUTHORS"
906Matt Sergeant,
907Christian Glahn,
908Petr Pajas
909.SH "VERSION"
910.IX Header "VERSION"
9111.70
912.SH "COPYRIGHT"
913.IX Header "COPYRIGHT"
9142001\-2007, AxKit.com Ltd.
915.PP
9162002\-2006, Christian Glahn.
917.PP
9182006\-2009, Petr Pajas.