Commit | Line | Data |
3fea05b9 |
1 | .\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.10) |
2 | .\" |
3 | .\" Standard preamble: |
4 | .\" ======================================================================== |
5 | .de Sp \" Vertical space (when we can't use .PP) |
6 | .if t .sp .5v |
7 | .if n .sp |
8 | .. |
9 | .de Vb \" Begin verbatim text |
10 | .ft CW |
11 | .nf |
12 | .ne \\$1 |
13 | .. |
14 | .de Ve \" End verbatim text |
15 | .ft R |
16 | .fi |
17 | .. |
18 | .\" Set up some character translations and predefined strings. \*(-- will |
19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left |
20 | .\" double quote, and \*(R" will give a right double quote. \*(C+ will |
21 | .\" give a nicer C++. Capital omega is used to do unbreakable dashes and |
22 | .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, |
23 | .\" nothing in troff, for use with C<>. |
24 | .tr \(*W- |
25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' |
26 | .ie n \{\ |
27 | . ds -- \(*W- |
28 | . ds PI pi |
29 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch |
30 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch |
31 | . ds L" "" |
32 | . ds R" "" |
33 | . ds C` "" |
34 | . ds C' "" |
35 | 'br\} |
36 | .el\{\ |
37 | . ds -- \|\(em\| |
38 | . ds PI \(*p |
39 | . ds L" `` |
40 | . ds R" '' |
41 | 'br\} |
42 | .\" |
43 | .\" Escape single quotes in literal strings from groff's Unicode transform. |
44 | .ie \n(.g .ds Aq \(aq |
45 | .el .ds Aq ' |
46 | .\" |
47 | .\" If the F register is turned on, we'll generate index entries on stderr for |
48 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index |
49 | .\" entries marked with X<> in POD. Of course, you'll have to process the |
50 | .\" output yourself in some meaningful fashion. |
51 | .ie \nF \{\ |
52 | . de IX |
53 | . tm Index:\\$1\t\\n%\t"\\$2" |
54 | .. |
55 | . nr % 0 |
56 | . rr F |
57 | .\} |
58 | .el \{\ |
59 | . de IX |
60 | .. |
61 | .\} |
62 | .\" |
63 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). |
64 | .\" Fear. Run. Save yourself. No user-serviceable parts. |
65 | . \" fudge factors for nroff and troff |
66 | .if n \{\ |
67 | . ds #H 0 |
68 | . ds #V .8m |
69 | . ds #F .3m |
70 | . ds #[ \f1 |
71 | . ds #] \fP |
72 | .\} |
73 | .if t \{\ |
74 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) |
75 | . ds #V .6m |
76 | . ds #F 0 |
77 | . ds #[ \& |
78 | . ds #] \& |
79 | .\} |
80 | . \" simple accents for nroff and troff |
81 | .if n \{\ |
82 | . ds ' \& |
83 | . ds ` \& |
84 | . ds ^ \& |
85 | . ds , \& |
86 | . ds ~ ~ |
87 | . ds / |
88 | .\} |
89 | .if t \{\ |
90 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" |
91 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' |
92 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' |
93 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' |
94 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' |
95 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' |
96 | .\} |
97 | . \" troff and (daisy-wheel) nroff accents |
98 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' |
99 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' |
100 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] |
101 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' |
102 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' |
103 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] |
104 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] |
105 | .ds ae a\h'-(\w'a'u*4/10)'e |
106 | .ds Ae A\h'-(\w'A'u*4/10)'E |
107 | . \" corrections for vroff |
108 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' |
109 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' |
110 | . \" for low resolution devices (crt and lpr) |
111 | .if \n(.H>23 .if \n(.V>19 \ |
112 | \{\ |
113 | . ds : e |
114 | . ds 8 ss |
115 | . ds o a |
116 | . ds d- d\h'-1'\(ga |
117 | . ds D- D\h'-1'\(hy |
118 | . ds th \o'bp' |
119 | . ds Th \o'LP' |
120 | . ds ae ae |
121 | . ds Ae AE |
122 | .\} |
123 | .rm #[ #] #H #V #F C |
124 | .\" ======================================================================== |
125 | .\" |
126 | .IX Title "XML::LibXML::Parser 3" |
127 | .TH XML::LibXML::Parser 3 "2009-10-07" "perl v5.8.7" "User Contributed Perl Documentation" |
128 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes |
129 | .\" way too many mistakes in technical documents. |
130 | .if n .ad l |
131 | .nh |
132 | .SH "NAME" |
133 | XML::LibXML::Parser \- Parsing XML Data with XML::LibXML |
134 | .SH "SYNOPSIS" |
135 | .IX Header "SYNOPSIS" |
136 | .Vb 1 |
137 | \& use XML::LibXML 1.70; |
138 | \& |
139 | \& # Parser constructor |
140 | \& |
141 | \& $parser = XML::LibXML\->new(); |
142 | \& $parser = XML::LibXML\->new(option=>value, ...); |
143 | \& $parser = XML::LibXML\->new({option=>value, ...}); |
144 | \& |
145 | \& # Parsing XML |
146 | \& |
147 | \& $dom = XML::LibXML\->load_xml( |
148 | \& location => $file_or_url |
149 | \& # parser options ... |
150 | \& ); |
151 | \& $dom = XML::LibXML\->load_xml( |
152 | \& string => $xml_string |
153 | \& # parser options ... |
154 | \& ); |
155 | \& $dom = XML::LibXML\->load_xml({ |
156 | \& IO => $perl_file_handle |
157 | \& # parser options ... |
158 | \& }); |
159 | \& $dom = $parser\->load_xml(...); |
160 | \& |
161 | \& # Parsing HTML |
162 | \& |
163 | \& $dom = XML::LibXML\->load_html(...); |
164 | \& $dom = $parser\->load_html(...); |
165 | \& |
166 | \& # Parsing well\-balanced XML chunks |
167 | \& |
168 | \& $fragment = $parser\->parse_balanced_chunk( $wbxmlstring, $encoding ); |
169 | \& |
170 | \& # Processing XInclude |
171 | \& |
172 | \& $parser\->process_xincludes( $doc ); |
173 | \& $parser\->processXIncludes( $doc ); |
174 | \& |
175 | \& # Old\-style parser interfaces |
176 | \& |
177 | \& $doc = $parser\->parse_file( $xmlfilename ); |
178 | \& $doc = $parser\->parse_fh( $io_fh ); |
179 | \& $doc = $parser\->parse_string( $xmlstring); |
180 | \& $doc = $parser\->parse_html_file( $htmlfile, \e%opts ); |
181 | \& $doc = $parser\->parse_html_fh( $io_fh, \e%opts ); |
182 | \& $doc = $parser\->parse_html_string( $htmlstring, \e%opts ); |
183 | \& |
184 | \& # Push parser |
185 | \& |
186 | \& $parser\->parse_chunk($string, $terminate); |
187 | \& $parser\->init_push(); |
188 | \& $parser\->push(@data); |
189 | \& $doc = $parser\->finish_push( $recover ); |
190 | \& |
191 | \& # Set/query parser options |
192 | \& |
193 | \& $parser\->option_exists($name); |
194 | \& $parser\->get_option($name); |
195 | \& $parser\->set_option($name,$value); |
196 | \& $parser\->set_options({$name=>$value,...}); |
197 | \& |
198 | \& # XML catalogs |
199 | \& |
200 | \& $parser\->load_catalog( $catalog_file ); |
201 | .Ve |
202 | .SH "PARSING" |
203 | .IX Header "PARSING" |
204 | An \s-1XML\s0 document is read into a data structure such as a \s-1DOM\s0 tree by a piece of |
205 | software, called a parser. XML::LibXML currently provides four different parser |
206 | interfaces: |
207 | .IP "\(bu" 4 |
208 | A \s-1DOM\s0 Pull-Parser |
209 | .IP "\(bu" 4 |
210 | A \s-1DOM\s0 Push-Parser |
211 | .IP "\(bu" 4 |
212 | A \s-1SAX\s0 Parser |
213 | .IP "\(bu" 4 |
214 | A \s-1DOM\s0 based \s-1SAX\s0 Parser. |
215 | .SS "Creating a Parser Instance" |
216 | .IX Subsection "Creating a Parser Instance" |
217 | XML::LibXML provides an \s-1OO\s0 interface to the libxml2 parser functions. Thus you |
218 | have to create a parser instance before you can parse any \s-1XML\s0 data. |
219 | .IP "new" 4 |
220 | .IX Item "new" |
221 | .Vb 3 |
222 | \& $parser = XML::LibXML\->new(); |
223 | \& $parser = XML::LibXML\->new(option=>value, ...); |
224 | \& $parser = XML::LibXML\->new({option=>value, ...}); |
225 | .Ve |
226 | .Sp |
227 | Create a new \s-1XML\s0 and \s-1HTML\s0 parser instance. Each parser instance holds default |
228 | values for various parser options. Optionally, one can pass a hash reference or |
229 | a list of option => value pairs to set a different default set of options. |
230 | Unless specified otherwise, the options \f(CW\*(C`load_ext_dtd\*(C'\fR, \f(CW\*(C`expand_entities\*(C'\fR, and \f(CW\*(C`huge\*(C'\fR are set to 1. See \*(L"Parser Options\*(R" for a list of libxml2 parser's options. |
231 | .SS "\s-1DOM\s0 Parser" |
232 | .IX Subsection "DOM Parser" |
233 | One of the common parser interfaces of XML::LibXML is the \s-1DOM\s0 parser. This |
234 | parser reads \s-1XML\s0 data into a \s-1DOM\s0 like data structure, so each tag can get |
235 | accessed and transformed. |
236 | .PP |
237 | XML::LibXML's \s-1DOM\s0 parser is capable of parsing not only \s-1XML\s0 data, but also |
238 | (strict) \s-1HTML\s0 files. There are three ways to parse documents \- as a string, as |
239 | a Perl filehandle, or as a filename/URL. The return value from each is a XML::LibXML::Document object, which is a \s-1DOM\s0 object. |
240 | .PP |
241 | All of the functions listed below will throw an exception if the document is |
242 | invalid. To prevent this from causing your program to exit, wrap the call in an |
243 | eval{} block. |
244 | .IP "load_xml" 4 |
245 | .IX Item "load_xml" |
246 | .Vb 10 |
247 | \& $dom = XML::LibXML\->load_xml( |
248 | \& location => $file_or_url |
249 | \& # parser options ... |
250 | \& ); |
251 | \& $dom = XML::LibXML\->load_xml( |
252 | \& string => $xml_string |
253 | \& # parser options ... |
254 | \& ); |
255 | \& $dom = XML::LibXML\->load_xml({ |
256 | \& IO => $perl_file_handle |
257 | \& # parser options ... |
258 | \& }); |
259 | \& $dom = $parser\->load_xml(...); |
260 | .Ve |
261 | .Sp |
262 | This function is available since XML::LibXML 1.70. It provides an easy to use |
263 | interface to the \s-1XML\s0 parser that parses a given file (or \s-1URL\s0), string, or input |
264 | stream to a \s-1DOM\s0 tree. The arguments can be passed in a \s-1HASH\s0 reference or as |
265 | name => value pairs. The function can be called as a class method or an object |
266 | method. In both cases it internally creates a new parser instance passing the |
267 | specified parser options; if called as an object method, it clones the original |
268 | parser (preserving its settings) and additionally applies the specified options |
269 | to the new parser. See the constructor \f(CW\*(C`new\*(C'\fR and \*(L"Parser Options\*(R" for more information. |
270 | .IP "load_html" 4 |
271 | .IX Item "load_html" |
272 | .Vb 2 |
273 | \& $dom = XML::LibXML\->load_html(...); |
274 | \& $dom = $parser\->load_html(...); |
275 | .Ve |
276 | .Sp |
277 | This function is available since XML::LibXML 1.70. It has the same usage as \f(CW\*(C`load_xml\*(C'\fR, providing an interface to the \s-1HTML\s0 parser. See \f(CW\*(C`load_xml\*(C'\fR for more information. |
278 | .Sp |
279 | Parsing \s-1HTML\s0 may cause problems, especially if the ampersand ('&') is used. |
280 | This is a common problem if \s-1HTML\s0 code is parsed that contains links to |
281 | CGI-scripts. Such links cause the parser to throw errors. In such cases libxml2 |
282 | still parses the entire document as if there was no error, but the error causes |
283 | XML::LibXML to stop the parsing process. However, the document is not lost. |
284 | Such \s-1HTML\s0 documents should be parsed using the \fIrecover\fR flag. By default recovering is deactivated. |
285 | .Sp |
286 | The functions described above are implemented to parse well formed documents. |
287 | In some cases a program gets well balanced \s-1XML\s0 instead of well formed documents |
288 | (e.g. a \s-1XML\s0 fragment from a Database). With XML::LibXML it is not required to |
289 | wrap such fragments in the code, because XML::LibXML is capable even to parse |
290 | well balanced \s-1XML\s0 fragments. |
291 | .RS 4 |
292 | .IP "parse_balanced_chunk" 4 |
293 | .IX Item "parse_balanced_chunk" |
294 | .Vb 1 |
295 | \& $fragment = $parser\->parse_balanced_chunk( $wbxmlstring, $encoding ); |
296 | .Ve |
297 | .Sp |
298 | This function parses a well balanced \s-1XML\s0 string into a XML::LibXML::DocumentFragment. The first argument contains the input string, the optional second argument |
299 | can be used to specify character encoding of the input (\s-1UTF\-8\s0 is assumed by |
300 | default). |
301 | .IP "parse_xml_chunk" 4 |
302 | .IX Item "parse_xml_chunk" |
303 | This is the old name of \fIparse_balanced_chunk()\fR. Because it may cause confusion |
304 | with the push parser interface, this function should not be used anymore. |
305 | .RE |
306 | .RS 4 |
307 | .Sp |
308 | By default XML::LibXML does not process XInclude tags within a \s-1XML\s0 Document |
309 | (see options section below). XML::LibXML allows to post process a document to |
310 | expand XInclude tags. |
311 | .IP "process_xincludes" 4 |
312 | .IX Item "process_xincludes" |
313 | .Vb 1 |
314 | \& $parser\->process_xincludes( $doc ); |
315 | .Ve |
316 | .Sp |
317 | After a document is parsed into a \s-1DOM\s0 structure, you may want to expand the |
318 | document's XInclude tags. This function processes the given document structure |
319 | and expands all XInclude tags (or throws an error) by using the flags and |
320 | callbacks of the given parser instance. |
321 | .Sp |
322 | Note that the resulting Tree contains some extra nodes (of type |
323 | \&\s-1XML_XINCLUDE_START\s0 and \s-1XML_XINCLUDE_END\s0) after successfully processing the |
324 | document. These nodes indicate where data was included into the original tree. |
325 | If the document is serialized, these extra nodes will not show up. |
326 | .Sp |
327 | Remember: A Document with processed XIncludes differs from the original |
328 | document after serialization, because the original XInclude tags will not get |
329 | restored! |
330 | .Sp |
331 | If the parser flag \*(L"expand_xinclude\*(R" is set to 1, you do not need to post-process |
332 | the parsed document. |
333 | .IP "processXIncludes" 4 |
334 | .IX Item "processXIncludes" |
335 | .Vb 1 |
336 | \& $parser\->processXIncludes( $doc ); |
337 | .Ve |
338 | .Sp |
339 | This is an alias to process_xincludes, but through a \s-1JAVA\s0 like function name. |
340 | .IP "parse_file" 4 |
341 | .IX Item "parse_file" |
342 | .Vb 1 |
343 | \& $doc = $parser\->parse_file( $xmlfilename ); |
344 | .Ve |
345 | .Sp |
346 | This function parses an \s-1XML\s0 document from a file or network; \f(CW$xmlfilename\fR can |
347 | be either a filename or an \s-1URL\s0. Note that for parsing files, this function is |
348 | the fastest choice, about 6\-8 times faster than \fIparse_fh()\fR. |
349 | .IP "parse_fh" 4 |
350 | .IX Item "parse_fh" |
351 | .Vb 1 |
352 | \& $doc = $parser\->parse_fh( $io_fh ); |
353 | .Ve |
354 | .Sp |
355 | \&\fIparse_fh()\fR parses a \s-1IOREF\s0 or a subclass of IO::Handle. |
356 | .Sp |
357 | Because the data comes from an open handle, libxml2's parser does not know |
358 | about the base \s-1URI\s0 of the document. To set the base \s-1URI\s0 one should use |
359 | \&\fIparse_fh()\fR as follows: |
360 | .Sp |
361 | .Vb 1 |
362 | \& my $doc = $parser\->parse_fh( $io_fh, $baseuri ); |
363 | .Ve |
364 | .IP "parse_string" 4 |
365 | .IX Item "parse_string" |
366 | .Vb 1 |
367 | \& $doc = $parser\->parse_string( $xmlstring); |
368 | .Ve |
369 | .Sp |
370 | This function is similar to \fIparse_fh()\fR, but it parses a \s-1XML\s0 document that is |
371 | available as a single string in memory. Again, you can pass an optional base |
372 | \&\s-1URI\s0 to the function. |
373 | .Sp |
374 | .Vb 1 |
375 | \& my $doc = $parser\->parse_string( $xmlstring, $baseuri ); |
376 | .Ve |
377 | .IP "parse_html_file" 4 |
378 | .IX Item "parse_html_file" |
379 | .Vb 1 |
380 | \& $doc = $parser\->parse_html_file( $htmlfile, \e%opts ); |
381 | .Ve |
382 | .Sp |
383 | Similar to \fIparse_file()\fR but parses \s-1HTML\s0 (strict) documents; \f(CW$htmlfile\fR can be |
384 | filename or \s-1URL\s0. |
385 | .Sp |
386 | An optional second argument can be used to pass some options to the \s-1HTML\s0 parser |
387 | as a \s-1HASH\s0 reference. See options labeled with \s-1HTML\s0 in \*(L"Parser Options\*(R". |
388 | .IP "parse_html_fh" 4 |
389 | .IX Item "parse_html_fh" |
390 | .Vb 1 |
391 | \& $doc = $parser\->parse_html_fh( $io_fh, \e%opts ); |
392 | .Ve |
393 | .Sp |
394 | Similar to \fIparse_fh()\fR but parses \s-1HTML\s0 (strict) streams. |
395 | .Sp |
396 | An optional second argument can be used to pass some options to the \s-1HTML\s0 parser |
397 | as a \s-1HASH\s0 reference. See options labeled with \s-1HTML\s0 in \*(L"Parser Options\*(R". |
398 | .Sp |
399 | Note: encoding option may not work correctly with this function in libxml2 < |
400 | 2.6.27 if the \s-1HTML\s0 file declares charset using a \s-1META\s0 tag. |
401 | .IP "parse_html_string" 4 |
402 | .IX Item "parse_html_string" |
403 | .Vb 1 |
404 | \& $doc = $parser\->parse_html_string( $htmlstring, \e%opts ); |
405 | .Ve |
406 | .Sp |
407 | Similar to \fIparse_string()\fR but parses \s-1HTML\s0 (strict) strings. |
408 | .Sp |
409 | An optional second argument can be used to pass some options to the \s-1HTML\s0 parser |
410 | as a \s-1HASH\s0 reference. See options labeled with \s-1HTML\s0 in \*(L"Parser Options\*(R". |
411 | .RE |
412 | .RS 4 |
413 | .RE |
414 | .SS "Push Parser" |
415 | .IX Subsection "Push Parser" |
416 | XML::LibXML provides a push parser interface. Rather than pulling the data from |
417 | a given source the push parser waits for the data to be pushed into it. |
418 | .PP |
419 | This allows one to parse large documents without waiting for the parser to |
420 | finish. The interface is especially useful if a program needs to pre-process |
421 | the incoming pieces of \s-1XML\s0 (e.g. to detect document boundaries). |
422 | .PP |
423 | While XML::LibXML parse_*() functions force the data to be a well-formed \s-1XML\s0, |
424 | the push parser will take any arbitrary string that contains some \s-1XML\s0 data. The |
425 | only requirement is that all the pushed strings are together a well formed |
426 | document. With the push parser interface a program can interrupt the parsing |
427 | process as required, where the parse_*() functions do not give enough flexibility. |
428 | .PP |
429 | Unlike the pull parser implemented in \fIparse_fh()\fR or \fIparse_file()\fR, the |
430 | push parser is not able to find out about the document's end itself. Thus the |
431 | calling program needs to indicate explicitly when the parsing is done. |
432 | .PP |
433 | In XML::LibXML this is done by a single function: |
434 | .IP "parse_chunk" 4 |
435 | .IX Item "parse_chunk" |
436 | .Vb 1 |
437 | \& $parser\->parse_chunk($string, $terminate); |
438 | .Ve |
439 | .Sp |
440 | \&\fIparse_chunk()\fR tries to parse a given chunk of data, which isn't necessarily |
441 | well balanced data. The function takes two parameters: The chunk of data as a |
442 | string and optionally a termination flag. If the termination flag is set to a |
443 | true value (e.g. 1), the parsing will be stopped and the resulting document |
444 | will be returned as the following example describes: |
445 | .Sp |
446 | .Vb 5 |
447 | \& my $parser = XML::LibXML\->new; |
448 | \& for my $string ( "<", "foo", \*(Aq bar="hello world"\*(Aq, "/>") { |
449 | \& $parser\->parse_chunk( $string ); |
450 | \& } |
451 | \& my $doc = $parser\->parse_chunk("", 1); # terminate the parsing |
452 | .Ve |
453 | .PP |
454 | Internally XML::LibXML provides three functions that control the push parser |
455 | process: |
456 | .IP "init_push" 4 |
457 | .IX Item "init_push" |
458 | .Vb 1 |
459 | \& $parser\->init_push(); |
460 | .Ve |
461 | .Sp |
462 | Initializes the push parser. |
463 | .IP "push" 4 |
464 | .IX Item "push" |
465 | .Vb 1 |
466 | \& $parser\->push(@data); |
467 | .Ve |
468 | .Sp |
469 | This function pushes the data stored inside the array to libxml2's parser. Each |
470 | entry in \f(CW@data\fR must be a normal scalar! This method can be called repeatedly. |
471 | .IP "finish_push" 4 |
472 | .IX Item "finish_push" |
473 | .Vb 1 |
474 | \& $doc = $parser\->finish_push( $recover ); |
475 | .Ve |
476 | .Sp |
477 | This function returns the result of the parsing process. If this function is |
478 | called without a parameter it will complain about non well-formed documents. If |
479 | \&\f(CW$recover\fR is 1, the push parser can be used to restore broken or non well formed |
480 | (\s-1XML\s0) documents as the following example shows: |
481 | .Sp |
482 | .Vb 7 |
483 | \& eval { |
484 | \& $parser\->push( "<foo>", "bar" ); |
485 | \& $doc = $parser\->finish_push(); # will report broken XML |
486 | \& }; |
487 | \& if ( $@ ) { |
488 | \& # ... |
489 | \& } |
490 | .Ve |
491 | .Sp |
492 | This can be annoying if the closing tag is missed by accident. The following |
493 | code will restore the document: |
494 | .Sp |
495 | .Vb 5 |
496 | \& eval { |
497 | \& $parser\->push( "<foo>", "bar" ); |
498 | \& $doc = $parser\->finish_push(1); # will return the data parsed |
499 | \& # unless an error happened |
500 | \& }; |
501 | \& |
502 | \& print $doc\->toString(); # returns "<foo>bar</foo>" |
503 | .Ve |
504 | .Sp |
505 | Of course \fIfinish_push()\fR will return nothing if there was no data pushed to the |
506 | parser before. |
507 | .SS "Pull Parser (Reader)" |
508 | .IX Subsection "Pull Parser (Reader)" |
509 | XML::LibXML also provides a pull-parser interface similar to the XmlReader |
510 | interface in .NET. This interface is almost streaming, and is usually faster |
511 | and simpler to use than \s-1SAX\s0. See XML::LibXML::Reader. |
512 | .SS "Direct \s-1SAX\s0 Parser" |
513 | .IX Subsection "Direct SAX Parser" |
514 | XML::LibXML provides a direct \s-1SAX\s0 parser in the XML::LibXML::SAX module. |
515 | .SS "\s-1DOM\s0 based \s-1SAX\s0 Parser" |
516 | .IX Subsection "DOM based SAX Parser" |
517 | XML::LibXML also provides a \s-1DOM\s0 based \s-1SAX\s0 parser. The \s-1SAX\s0 parser is defined in |
518 | the module XML::LibXML::SAX::Parser. As it is not a stream based parser, it |
519 | parses documents into a \s-1DOM\s0 and traverses the \s-1DOM\s0 tree instead. |
520 | .PP |
521 | The \s-1API\s0 of this parser is exactly the same as any other Perl \s-1SAX2\s0 parser. See |
522 | XML::SAX::Intro for details. |
523 | .PP |
524 | Aside from the regular parsing methods, you can access the \s-1DOM\s0 tree traverser |
525 | directly, using the \fIgenerate()\fR method: |
526 | .PP |
527 | .Vb 3 |
528 | \& my $doc = build_yourself_a_document(); |
529 | \& my $saxparser = XML::LibXML::SAX::Parser\->new( ... ); |
530 | \& $saxparser\->generate( $doc ); |
531 | .Ve |
532 | .PP |
533 | This is useful for serializing \s-1DOM\s0 trees, for example that you might have done |
534 | prior processing on, or that you have as a result of \s-1XSLT\s0 processing. |
535 | .PP |
536 | \&\fI\s-1WARNING\s0\fR |
537 | .PP |
538 | This is \s-1NOT\s0 a streaming \s-1SAX\s0 parser. As I said above, this parser reads the |
539 | entire document into a \s-1DOM\s0 and serialises it. Some people couldn't read that in |
540 | the paragraph above so I've added this warning. If you want a streaming \s-1SAX\s0 |
541 | parser look at the XML::LibXML::SAX man page. |
542 | .SH "SERIALIZATION" |
543 | .IX Header "SERIALIZATION" |
544 | XML::LibXML provides some functions to serialize nodes and documents. The |
545 | serialization functions are described on the XML::LibXML::Node manpage or the XML::LibXML::Document manpage. XML::LibXML checks three global flags that alter the serialization |
546 | process: |
547 | .IP "\(bu" 4 |
548 | skipXMLDeclaration |
549 | .IP "\(bu" 4 |
550 | skipDTD |
551 | .IP "\(bu" 4 |
552 | setTagCompression |
553 | .PP |
554 | Of these three flags, only setTagCompression is available for all |
555 | serialization functions. |
556 | .PP |
557 | Because XML::LibXML does not set these flags itself, one has to define them locally |
558 | as the following example shows: |
559 | .PP |
560 | .Vb 3 |
561 | \& local $XML::LibXML::skipXMLDeclaration = 1; |
562 | \& local $XML::LibXML::skipDTD = 1; |
563 | \& local $XML::LibXML::setTagCompression = 1; |
564 | .Ve |
565 | .PP |
566 | If skipXMLDeclaration is defined and not '0', the \s-1XML\s0 declaration is omitted |
567 | during serialization. |
568 | .PP |
569 | If skipDTD is defined and not '0', an existing \s-1DTD\s0 would not be serialized with |
570 | the document. |
571 | .PP |
572 | If setTagCompression is defined and not '0' empty tags are displayed as open |
573 | and closing tags rather than the shortcut. For example the empty tag \fIfoo\fR will be rendered as \fI<foo></foo>\fR rather than \fI<foo/>\fR. |
574 | .SH "PARSER OPTIONS" |
575 | .IX Header "PARSER OPTIONS" |
576 | Handling of libxml2 parser options has been unified and improved in XML::LibXML |
577 | 1.70. You can now set default options for a particular parser instance by |
578 | passing them to the constructor as \f(CW\*(C`XML::LibXML\->new({name=>value, ...})\*(C'\fR or \f(CW\*(C`XML::LibXML\->new(name=>value,...)\*(C'\fR. The options can be queried and changed using the following methods (pre\-1.70 |
579 | interfaces such as \f(CW\*(C`$parser\->load_ext_dtd(0)\*(C'\fR also exist, see below): |
580 | .IP "option_exists" 4 |
581 | .IX Item "option_exists" |
582 | .Vb 1 |
583 | \& $parser\->option_exists($name); |
584 | .Ve |
585 | .Sp |
586 | Returns 1 if the current XML::LibXML version supports the option \f(CW$name\fR, otherwise returns 0 (note that this does not necessarily mean that the option |
587 | is supported by the underlying libxml2 library). |
588 | .IP "get_option" 4 |
589 | .IX Item "get_option" |
590 | .Vb 1 |
591 | \& $parser\->get_option($name); |
592 | .Ve |
593 | .Sp |
594 | Returns the current value of the parser option \f(CW$name\fR. |
595 | .IP "set_option" 4 |
596 | .IX Item "set_option" |
597 | .Vb 1 |
598 | \& $parser\->set_option($name,$value); |
599 | .Ve |
600 | .Sp |
601 | Sets option \f(CW$name\fR to value \f(CW$value\fR. |
602 | .IP "set_options" 4 |
603 | .IX Item "set_options" |
604 | .Vb 1 |
605 | \& $parser\->set_options({$name=>$value,...}); |
606 | .Ve |
607 | .Sp |
608 | Sets multiple parsing options at once. |
609 | .PP |
610 | \&\s-1IMPORTANT\s0 \s-1NOTE:\s0 This documentation reflects the parser flags available in |
611 | libxml2 2.7.3. Some options have no effect if an older version of libxml2 is |
612 | used. |
613 | .PP |
614 | Each of the flags listed below is labeled |
615 | .IP "/parser/" 4 |
616 | .IX Item "/parser/" |
617 | if it can be used with a \f(CW\*(C`XML::LibXML\*(C'\fR parser object (i.e. passed to \f(CW\*(C`XML::LibXML\->new\*(C'\fR, \f(CW\*(C`XML::LibXML\->set_option\*(C'\fR, etc.) |
618 | .IP "/html/" 4 |
619 | .IX Item "/html/" |
620 | if it can be passed to the \f(CW\*(C`parse_html_*\*(C'\fR methods |
621 | .IP "/reader/" 4 |
622 | .IX Item "/reader/" |
623 | if it can be used with the \f(CW\*(C`XML::LibXML::Reader\*(C'\fR. |
624 | .PP |
625 | Unless specified otherwise, the default for boolean valued options is 0 |
626 | (false). |
627 | .PP |
628 | The available options are: |
629 | .IP "\s-1URI\s0" 4 |
630 | .IX Item "URI" |
631 | /parser, html, reader/ |
632 | .Sp |
633 | In case of parsing strings or file handles, XML::LibXML doesn't know about the |
634 | base uri of the document. To make relative references such as XIncludes work, |
635 | one has to set a base \s-1URI\s0, that is then used for the parsed document. |
636 | .IP "line_numbers" 4 |
637 | .IX Item "line_numbers" |
638 | /parser, html, reader/ |
639 | .Sp |
640 | If this option is activated, libxml2 will store the line number of each element |
641 | node in the parsed document. The line number can be obtained using the \f(CW\*(C`line_number()\*(C'\fR method of the \f(CW\*(C`XML::LibXML::Node\*(C'\fR class (for non-element nodes this may report the line number of the containing |
642 | element). The line numbers are also used for reporting positions of validation |
643 | errors. |
644 | .Sp |
645 | \&\s-1IMPORTANT:\s0 Due to limitations in the libxml2 library line numbers greater than |
646 | 65535 will be returned as 65535. Unfortunately, this is a long and sad story, |
647 | please see <http://bugzilla.gnome.org/show_bug.cgi?id=325533> for more details. |
648 | .IP "encoding" 4 |
649 | .IX Item "encoding" |
650 | /html/ |
651 | .Sp |
652 | character encoding of the input |
653 | .IP "recover" 4 |
654 | .IX Item "recover" |
655 | /parser, html, reader/ |
656 | .Sp |
657 | recover from errors; possible values are 0, 1, and 2 |
658 | .Sp |
659 | A true value turns on recovery mode which allows one to parse broken \s-1XML\s0 or |
660 | \&\s-1HTML\s0 data. The recovery mode allows the parser to return the successfully |
661 | parsed portion of the input document. This is useful for almost well-formed |
662 | documents, where for example a closing tag is missing somewhere. Still, |
663 | XML::LibXML will only parse until the first fatal (non-recoverable) error |
664 | occurs, reporting recoverable parsing errors as warnings. To suppress even |
665 | these warnings, use recover=>2. |
666 | .Sp |
667 | Note that validation is switched off automatically in recovery mode. |
668 | .IP "expand_entities" 4 |
669 | .IX Item "expand_entities" |
670 | /parser, reader/ |
671 | .Sp |
672 | substitute entities; possible values are 0 and 1; default is 1 |
673 | .Sp |
674 | Note that although this flag disables entity substitution, it does not prevent |
675 | the parser from loading external entities; when substitution of an external |
676 | entity is disabled, the entity will be represented in the document tree by a |
677 | \&\s-1XML_ENTITY_REF_NODE\s0 node whose subtree will be the content obtained by parsing |
678 | the external resource; although this level of nesting is visible from the |
679 | \&\s-1DOM\s0 it is transparent to XPath data model, so it is possible to match nodes in |
680 | an unexpanded entity by the same XPath expression as if the entity was |
681 | expanded. See also ext_ent_handler. |
682 | .IP "ext_ent_handler" 4 |
683 | .IX Item "ext_ent_handler" |
684 | /parser/ |
685 | .Sp |
686 | Provide a custom external entity handler to be used when expand_entities is set |
687 | to 1. Possible value is a subroutine reference. |
688 | .Sp |
689 | This feature does not work properly in libxml2 < 2.6.27! |
690 | .Sp |
691 | The subroutine provided is called whenever the parser needs to retrieve the |
692 | content of an external entity. It is called with two arguments: the system \s-1ID\s0 |
693 | (\s-1URI\s0) and the public \s-1ID\s0. The value returned by the subroutine is parsed as the |
694 | content of the entity. |
695 | .Sp |
696 | This method can be used to completely disable entity loading, e.g. to prevent |
697 | exploits of the type described at (<http://searchsecuritychannel.techtarget.com/generic/0,295582,sid97_gci1304703,00.html>), where a service is tricked to expose its private data by letting it parse a |
698 | remote file (\s-1RSS\s0 feed) that contains an entity reference to a local file (e.g. \f(CW\*(C`/etc/fstab\*(C'\fR). |
699 | .Sp |
700 | A more granular solution to this problem, however, is provided by custom \s-1URL\s0 |
701 | resolvers, as in |
702 | .Sp |
703 | .Vb 9 |
704 | \& my $c = XML::LibXML::InputCallback\->new(); |
705 | \& sub match { # accept file:/ URIs except for XML catalogs in /etc/xml/ |
706 | \& my ($uri) = @_; |
707 | \& return ($uri=~m{^file:/} |
708 | \& and $uri !~ m{^file:///etc/xml/}) |
709 | \& ? 1 : 0; |
710 | \& } |
711 | \& $c\->register_callbacks([ \e&match, sub{}, sub{}, sub{} ]); |
712 | \& $parser\->input_callbacks($c); |
713 | .Ve |
714 | .IP "load_ext_dtd" 4 |
715 | .IX Item "load_ext_dtd" |
716 | /parser, reader/ |
717 | .Sp |
718 | load the external \s-1DTD\s0 subset while parsing; possible values are 0 and 1. Unless |
719 | specified, XML::LibXML sets this option to 1. |
720 | .Sp |
721 | This flag is also required for \s-1DTD\s0 validation, to provide complete attributes, |
722 | and to expand entities, regardless of whether the document has an internal subset. Thus |
723 | switching off external \s-1DTD\s0 loading will disable entity expansion, validation, |
724 | and complete attributes on internal subsets as well. |
725 | .IP "complete_attributes" 4 |
726 | .IX Item "complete_attributes" |
727 | /parser, reader/ |
728 | .Sp |
729 | create default \s-1DTD\s0 attributes; possible values are 0 and 1 |
730 | .IP "validation" 4 |
731 | .IX Item "validation" |
732 | /parser, reader/ |
733 | .Sp |
734 | validate with the \s-1DTD\s0; possible values are 0 and 1 |
735 | .IP "suppress_errors" 4 |
736 | .IX Item "suppress_errors" |
737 | /parser, html, reader/ |
738 | .Sp |
739 | suppress error reports; possible values are 0 and 1 |
740 | .IP "suppress_warnings" 4 |
741 | .IX Item "suppress_warnings" |
742 | /parser, html, reader/ |
743 | .Sp |
744 | suppress warning reports; possible values are 0 and 1 |
745 | .IP "pedantic_parser" 4 |
746 | .IX Item "pedantic_parser" |
747 | /parser, html, reader/ |
748 | .Sp |
749 | pedantic error reporting; possible values are 0 and 1 |
750 | .IP "no_blanks" 4 |
751 | .IX Item "no_blanks" |
752 | /parser, html, reader/ |
753 | .Sp |
754 | remove blank nodes; possible values are 0 and 1 |
755 | .IP "expand_xinclude or xinclude" 4 |
756 | .IX Item "expand_xinclude or xinclude" |
757 | /parser, reader/ |
758 | .Sp |
759 | Implement XInclude substitution; possible values are 0 and 1 |
760 | .Sp |
761 | Expands XInclude tags immediately while parsing the document. Note that the |
762 | parser will use the \s-1URI\s0 resolvers installed via \f(CW\*(C`XML::LibXML::InputCallback\*(C'\fR to parse the included document (if any). |
763 | .IP "no_xinclude_nodes" 4 |
764 | .IX Item "no_xinclude_nodes" |
765 | /parser, reader/ |
766 | .Sp |
767 | do not generate \s-1XINCLUDE\s0 \s-1START/END\s0 nodes; possible values are 0 and 1 |
768 | .IP "no_network" 4 |
769 | .IX Item "no_network" |
770 | /parser, html, reader/ |
771 | .Sp |
772 | Forbid network access; possible values are 0 and 1 |
773 | .Sp |
774 | If set to true, all attempts to fetch non-local resources (such as \s-1DTD\s0 or |
775 | external entities) will fail (unless custom callbacks are defined). |
776 | .Sp |
777 | It may be necessary to use the flag \f(CW\*(C`recover\*(C'\fR for processing documents requiring such resources while networking is off. |
778 | .IP "clean_namespaces" 4 |
779 | .IX Item "clean_namespaces" |
780 | /parser, reader/ |
781 | .Sp |
782 | remove redundant namespace declarations during parsing; possible values are 0 |
783 | and 1. |
784 | .IP "no_cdata" 4 |
785 | .IX Item "no_cdata" |
786 | /parser, html, reader/ |
787 | .Sp |
788 | merge \s-1CDATA\s0 as text nodes; possible values are 0 and 1 |
789 | .IP "no_basefix" 4 |
790 | .IX Item "no_basefix" |
791 | /parser, reader/ |
792 | .Sp |
793 | do not fixup \s-1XINCLUDE\s0 xml:base \s-1URIS\s0; possible values are 0 and 1 |
794 | .IP "huge" 4 |
795 | .IX Item "huge" |
796 | /parser, html, reader/ |
797 | .Sp |
798 | relax any hardcoded limit from the parser; possible values are 0 and 1. Unless |
799 | specified, XML::LibXML sets this option to 1. |
800 | .IP "gdome" 4 |
801 | .IX Item "gdome" |
802 | /parser/ |
803 | .Sp |
804 | \&\s-1THIS\s0 \s-1OPTION\s0 \s-1IS\s0 \s-1EXPERIMENTAL\s0! |
805 | .Sp |
806 | Although quite powerful, XML::LibXML's \s-1DOM\s0 implementation is incomplete with |
807 | respect to the \s-1DOM\s0 level 2 or level 3 specifications. \s-1XML::GDOME\s0 is based on |
808 | libxml2 as well and provides a rather complete \s-1DOM\s0 implementation by |
809 | wrapping libgdome. This flag allows you to make use of XML::LibXML's full |
810 | parser options and \s-1XML::GDOME\s0's \s-1DOM\s0 implementation at the same time. |
811 | .Sp |
812 | To make use of this function, one has to install libgdome and configure |
813 | XML::LibXML to use this library. For this you need to rebuild XML::LibXML! |
814 | .Sp |
815 | Note: this feature was not seriously tested in recent XML::LibXML releases. |
816 | .PP |
817 | For compatibility with XML::LibXML versions prior to 1.70, the following |
818 | methods are also supported for querying and setting the corresponding parser |
819 | options (if called without arguments, the methods return the current value of |
820 | the corresponding parser options; with an argument sets the option to a given |
821 | value): |
822 | .PP |
823 | .Vb 10 |
824 | \& $parser\->validation(); |
825 | \& $parser\->recover(); |
826 | \& $parser\->pedantic_parser(); |
827 | \& $parser\->line_numbers(); |
828 | \& $parser\->load_ext_dtd(); |
829 | \& $parser\->complete_attributes(); |
830 | \& $parser\->expand_xinclude(); |
831 | \& $parser\->gdome_dom(); |
832 | \& $parser\->clean_namespaces(); |
833 | \& $parser\->no_network(); |
834 | .Ve |
835 | .PP |
836 | The following obsolete methods trigger parser options in some special way: |
837 | .IP "recover_silently" 4 |
838 | .IX Item "recover_silently" |
839 | .Vb 1 |
840 | \& $parser\->recover_silently(1); |
841 | .Ve |
842 | .Sp |
843 | If called without an argument, returns true if the current value of the \f(CW\*(C`recover\*(C'\fR parser option is 2 and returns false otherwise. With a true argument sets the \f(CW\*(C`recover\*(C'\fR parser option to 2; with a false argument sets the \f(CW\*(C`recover\*(C'\fR parser option to 0. |
844 | .IP "expand_entities" 4 |
845 | .IX Item "expand_entities" |
846 | .Vb 1 |
847 | \& $parser\->expand_entities(0); |
848 | .Ve |
849 | .Sp |
850 | Get/set the \f(CW\*(C`expand_entities\*(C'\fR option. If called with a true argument, also sets the \f(CW\*(C`load_ext_dtd\*(C'\fR option to 1. |
851 | .IP "keep_blanks" 4 |
852 | .IX Item "keep_blanks" |
853 | .Vb 1 |
854 | \& $parser\->keep_blanks(0); |
855 | .Ve |
856 | .Sp |
857 | This is actually the opposite of the \f(CW\*(C`no_blanks\*(C'\fR parser option. If used without an argument retrieves the negated value of \f(CW\*(C`no_blanks\*(C'\fR. If used with an argument sets \f(CW\*(C`no_blanks\*(C'\fR to the opposite value. |
858 | .IP "base_uri" 4 |
859 | .IX Item "base_uri" |
860 | .Vb 1 |
861 | \& $parser\->base_uri( $your_base_uri ); |
862 | .Ve |
863 | .Sp |
864 | Get/set the \f(CW\*(C`URI\*(C'\fR option. |
865 | .SH "XML CATALOGS" |
866 | .IX Header "XML CATALOGS" |
867 | \&\f(CW\*(C`libxml2\*(C'\fR supports \s-1XML\s0 catalogs. Catalogs are used to map remote resources to their local |
868 | copies. Using catalogs can speed up parsing processes if many external |
869 | resources from remote addresses are loaded into the parsed documents (such as |
870 | DTDs or XIncludes). |
871 | .PP |
872 | Note that libxml2 has a global pool of loaded catalogs, so if you apply the |
873 | method \f(CW\*(C`load_catalog\*(C'\fR to one parser instance, all parser instances will start using the catalog (in |
874 | addition to other previously loaded catalogs). |
875 | .PP |
876 | Note also that catalogs are not used when a custom external entity handler is |
877 | specified. At the current state it is not possible to make use of both types of |
878 | resolving systems at the same time. |
879 | .IP "load_catalog" 4 |
880 | .IX Item "load_catalog" |
881 | .Vb 1 |
882 | \& $parser\->load_catalog( $catalog_file ); |
883 | .Ve |
884 | .Sp |
885 | Loads the \s-1XML\s0 catalog file \f(CW$catalog_file\fR. |
886 | .SH "ERROR REPORTING" |
887 | .IX Header "ERROR REPORTING" |
888 | XML::LibXML throws exceptions during parsing, validation or XPath processing |
889 | (and some other occasions). These errors can be caught by using \fIeval\fR blocks. The error is stored in \fI$@\fR. There are two implementations: the old one throws $@ which is just a message |
890 | string, in the new one $@ is an object from the class XML::LibXML::Error; this |
891 | class overrides the operator "" so that when printed, the object flattens to |
892 | the usual error message. |
893 | .PP |
894 | XML::LibXML throws errors as they occur. This is a very common misunderstanding |
895 | in the use of XML::LibXML. If the eval is omitted, XML::LibXML will always halt |
896 | your script by \*(L"croaking\*(R" (see Carp man page for details). |
897 | .PP |
898 | Also note that an increasing number of functions throw errors if bad data is |
899 | passed as arguments. If you cannot ensure that valid data is passed to XML::LibXML, you |
900 | should eval these functions. |
901 | .PP |
902 | Note: since version 1.59, \fIget_last_error()\fR is no longer available in |
903 | XML::LibXML for thread-safety reasons. |
904 | .SH "AUTHORS" |
905 | .IX Header "AUTHORS" |
906 | Matt Sergeant, |
907 | Christian Glahn, |
908 | Petr Pajas |
909 | .SH "VERSION" |
910 | .IX Header "VERSION" |
911 | 1.70 |
912 | .SH "COPYRIGHT" |
913 | .IX Header "COPYRIGHT" |
914 | 2001\-2007, AxKit.com Ltd. |
915 | .PP |
916 | 2002\-2006, Christian Glahn. |
917 | .PP |
918 | 2006\-2009, Petr Pajas. |