local-lib5/lib/perl5/i486-linux-gnu-thread-multi/XML/LibXML/Reader.pod

   1 =head1 NAME
   2
   3 XML::LibXML::Reader - interface to libxml2 pull parser
   4
   5 =head1 SYNOPSIS
   6
   7
   8
   9   use XML::LibXML::Reader;
  10
  11
  12
  13   my $reader = new XML::LibXML::Reader(location => "file.xml")
  14          or die "cannot read file.xml\n";
  15   while ($reader->read) {
  16     processNode($reader);
  17   }
  18
  19
  20
  21   sub processNode {
  22       $reader = shift;
  23       printf "%d %d %s %d\n", ($reader->depth,
  24                                $reader->nodeType,
  25                                $reader->name,
  26                                $reader->isEmptyElement);
  27   }
  28
  29 or
  30
  31
  32
  33   $reader = new XML::LibXML::Reader(location => "file.xml")
  34          or die "cannot read file.xml\n";
  35     $reader->preservePattern('//table/tr');
  36     $reader->finish;
  37     print $reader->document->toString(1);
  38
  39
  40 =head1 DESCRIPTION
  41
  42 This is a Perl interface to libxml2's pull-parser implementation xmlTextReader I<<<<<< http://xmlsoft.org/html/libxml-xmlreader.html >>>>>>. This feature requires at least libxml2-2.6.21. Pull-parsers (StAX in Java,
  43 XmlReader in C#) use an iterator approach to parse an XML file. They are easier
  44 to program than event-based parsers (SAX) and much more lightweight than
  45 tree-based parsers (DOM), which load the complete tree into memory.
  46
  47 The Reader acts as a cursor going forward on the document stream and stopping
  48 at each node on the way. At every point, DOM-like methods of the Reader object
  49 allow you to examine the current node (name, namespace, attributes, etc.).
  50
  51 The user's code keeps control of the progress and simply calls the C<<<<<< read() >>>>>> function repeatedly to progress to the next node in the document order. Other
  52 functions provide means for skipping complete sub-trees, or nodes until a
  53 specific element, etc.
  54
  55 At any time, only a very limited portion of the document is kept in
  56 memory, which makes the API more memory-efficient than using DOM. However, it
  57 is also possible to mix Reader with DOM. At every point the user may copy the
  58 current node (optionally expanded into a complete sub-tree) from the processed
  59 document to another DOM tree, or instruct the Reader to collect a sub-document
  60 in the form of a DOM tree consisting of selected nodes.
  61
  62 Reader API also supports namespaces, xml:base, entity handling, and DTD
  63 validation. Schema and RelaxNG validation support will probably be added in
  64 some later revision of the Perl interface.
  65
  66 The naming of methods compared to libxml2 and C# XmlTextReader has been changed
  67 slightly to match the conventions of XML::LibXML. Some functions have been
  68 changed or added with respect to the C interface.
  69
  70
  71 =head1 CONSTRUCTOR
  72
  73 Depending on the XML source, the Reader object can be created with either of:
  74
  75
  76
  77   my $reader = XML::LibXML::Reader->new( location => "file.xml", ... );
  78     my $reader = XML::LibXML::Reader->new( string => $xml_string, ... );
  79     my $reader = XML::LibXML::Reader->new( IO => $file_handle, ... );
  80     my $reader = XML::LibXML::Reader->new( FD => fileno(STDIN), ... );
  81     my $reader = XML::LibXML::Reader->new( DOM => $dom, ... );
  82
  83 where ... are (optional) reader options described below in L<<<<<< Reader options >>>>>> or various parser options described in L<<<<<< XML::LibXML::Parser >>>>>>. The constructor recognizes the following XML sources:
  84
  85
  86 =head2 Source specification
  87
  88 =over 4
  89
  90 =item location
  91
  92 Read XML from a local file or URL.
  93
  94
  95 =item string
  96
  97 Read XML from a string.
  98
  99
 100 =item IO
 101
 102 Read XML from a Perl IO filehandle.
 103
 104
 105 =item FD
 106
 107 Read XML from a file descriptor (bypasses Perl I/O layer, only applicable to
 108 filehandles for regular files or pipes). Possibly faster than IO.
 109
 110
 111 =item DOM
 112
 113 Use reader API to walk through a pre-parsed L<<<<<< XML::LibXML::Document >>>>>>.
 114
 115
 116
 117 =back
 118
 119
 120 =head2 Reader options
 121
 122 =over 4
 123
 124 =item encoding => $encoding
 125
 126 override document encoding.
 127
 128
 129 =item RelaxNG => $rng_schema
 130
 131 can be used to pass either a L<<<<<< XML::LibXML::RelaxNG >>>>>> object or a filename or URL of a RelaxNG schema to the constructor. The schema
 132 is then used to validate the document as it is processed.
 133
 134
 135 =item Schema => $xsd_schema
 136
 137 can be used to pass either a L<<<<<< XML::LibXML::Schema >>>>>> object or a filename or URL of a W3C XSD schema to the constructor. The schema
 138 is then used to validate the document as it is processed.
 139
 140
 141 =item ...
 142
 143 the reader further supports various parser options described in L<<<<<< XML::LibXML::Parser >>>>>> (specifically those labeled by /reader/).
 144
 145
 146
 147 =back
 148
 149
 150 =head1 METHODS CONTROLLING PARSING PROGRESS
 151
 152 =over 4
 153
 154 =item read ()
 155
 156 Moves the position to the next node in the stream, exposing its properties.
 157
 158 Returns 1 if the node was read successfully, 0 if there are no more nodes to
 159 read, or -1 in case of error.
 160
 161
 162 =item readAttributeValue ()
 163
 164 Parses an attribute value into one or more Text and EntityReference nodes.
 165
 166 Returns 1 in case of success, 0 if the reader was not positioned on an
 167 attribute node or all the attribute values have been read, or -1 in case of
 168 error.
 169
 170
 171 =item readState ()
 172
 173 Gets the read state of the reader. Returns the state value, or -1 in case of
 174 error. The module exports constants for the Reader states, see STATES below.
 175
 176
 177 =item depth ()
 178
 179 The depth of the node in the tree, starts at 0 for the root node.
 180
 181
 182 =item next ()
 183
 184 Skip to the node following the current one in the document order while avoiding
 185 the sub-tree if any. Returns 1 if the node was read successfully, 0 if there are
 186 no more nodes to read, or -1 in case of error.
 187
 188
 189 =item nextElement (localname?,nsURI?)
 190
 191 Skip nodes following the current one in the document order until a specific
 192 element is reached. The element's name must be equal to a given localname if
 193 defined, and its namespace must be equal to a given nsURI if defined. Either of
 194 the arguments can be undefined (or omitted, in case of the latter or both).
 195
 196 Returns 1 if the element was found, 0 if there are no more nodes to read, or -1
 197 in case of error.
 198
 199
 200 =item nextPatternMatch (compiled_pattern)
 201
 202 Skip nodes following the current one in the document order until an element
 203 matching a given compiled pattern is reached. See L<<<<<< XML::LibXML::Pattern >>>>>> for information on compiled patterns. See also the C<<<<<< matchesPattern >>>>>> method.
 204
 205 Returns 1 if the element was found, 0 if there are no more nodes to read, or -1
 206 in case of error.
 207
 208
 209 =item skipSiblings ()
 210
 211 Skip all nodes on the same or lower level until the first node on a higher
 212 level is reached. In particular, if the current node occurs in an element, the
 213 reader stops at the end tag of the parent element, otherwise it stops at a node
 214 immediately following the parent node.
 215
 216 Returns 1 if successful, 0 if end of the document is reached, or -1 in case of
 217 error.
 218
 219
 220 =item nextSibling ()
 221
 222 It skips to the node following the current one in the document order while
 223 avoiding the sub-tree if any.
 224
 225 Returns 1 if the node was read successfully, 0 if there are no more nodes to
 226 read, or -1 in case of error.
 227
 228
 229 =item nextSiblingElement (name?,nsURI?)
 230
 231 Like nextElement but only processes sibling elements of the current node
 232 (moving forward using C<<<<<< nextSibling () >>>>>> rather than C<<<<<< read () >>>>>>, internally).
 233
 234 Returns 1 if the element was found, 0 if there are no more sibling nodes, or -1
 235 in case of error.
 236
 237
 238 =item finish ()
 239
 240 Skip all remaining nodes in the document, reaching end of the document.
 241
 242 Returns 1 if successful, 0 in case of error.
 243
 244
 245 =item close ()
 246
 247 This method releases any resources allocated by the current instance and closes
 248 any underlying input. It returns 0 on failure and 1 on success. This method is
 249 automatically called by the destructor when the reader is forgotten, therefore
 250 you do not have to call it directly.
 251
 252
 253
 254 =back
 255
 256
 257 =head1 METHODS EXTRACTING INFORMATION
 258
 259 =over 4
 260
 261 =item name ()
 262
 263 Returns the qualified name of the current node, equal to (Prefix:)LocalName.
 264
 265
 266 =item nodeType ()
 267
 268 Returns the type of the current node. See NODE TYPES below.
 269
 270
 271 =item localName ()
 272
 273 Returns the local name of the node.
 274
 275
 276 =item prefix ()
 277
 278 Returns the prefix of the namespace associated with the node.
 279
 280
 281 =item namespaceURI ()
 282
 283 Returns the URI defining the namespace associated with the node.
 284
 285
 286 =item isEmptyElement ()
 287
 288 Check if the current node is empty. This is a bit bizarre in the sense that
 289 <a/> will be considered empty while <a></a> will not.
 290
 291
 292 =item hasValue ()
 293
 294 Returns true if the node can have a text value.
 295
 296
 297 =item value ()
 298
 299 Provides the text value of the node if present or undef if not available.
 300
 301
 302 =item readInnerXml ()
 303
 304 Reads the contents of the current node, including child nodes and markup.
 305 Returns a string containing the XML of the node's content, or undef if the
 306 current node is neither an element nor attribute, or has no child nodes.
 307
 308
 309 =item readOuterXml ()
 310
 311 Reads the contents of the current node, including child nodes and markup.
 312
 313 Returns a string containing the XML of the node including its content, or undef
 314 if the current node is neither an element nor attribute.
 315
 316
 317 =item nodePath()
 318
 319 Returns a canonical location path to the current element from the root node to
 320 the current node. Namespaced elements are matched by '*', because there is no
 321 way to declare prefixes within XPath patterns. Unlike C<<<<<< XML::LibXML::Node::nodePath() >>>>>>, this function does not provide sibling counts (i.e. instead of e.g. '/a/b[1]'
 322 and '/a/b[2]' you get '/a/b' for both matches).
 323
 324
 325 =item matchesPattern(compiled_pattern)
 326
 327 Returns a true value if the current node matches a compiled pattern. See L<<<<<< XML::LibXML::Pattern >>>>>> for information on compiled patterns. See also the C<<<<<< nextPatternMatch >>>>>> method.
 328
 329
 330
 331 =back
 332
 333
 334 =head1 METHODS EXTRACTING DOM NODES
 335
 336 =over 4
 337
 338 =item document ()
 339
 340 Provides access to the document tree built by the reader. This function can be
 341 used to collect the preserved nodes (see C<<<<<< preserveNode() >>>>>> and preservePattern).
 342
 343 CAUTION: Never use this function to modify the tree unless reading of the whole
 344 document is completed!
 345
 346
 347 =item copyCurrentNode (deep)
 348
 349 This function is similar to the DOM function C<<<<<< copyNode() >>>>>>. It returns a copy of the currently processed node as a corresponding DOM
 350 object. Use deep = 1 to obtain the full sub-tree.
 351
 352
 353 =item preserveNode ()
 354
 355 This tells the XML Reader to preserve the current node in the document tree. A
 356 document tree consisting of the preserved nodes and their content can be
 357 obtained using the method C<<<<<< document() >>>>>> once parsing is finished.
 358
 359 Returns the node or NULL in case of error.
 360
 361
 362 =item preservePattern (pattern,\%ns_map)
 363
 364 This tells the XML Reader to preserve all nodes matched by the pattern (which
 365 is a streaming XPath subset). A document tree consisting of the preserved nodes
 366 and their content can be obtained using the method C<<<<<< document() >>>>>> once parsing is finished.
 367
 368 An optional second argument can be used to provide a HASH reference mapping
 369 prefixes used by the XPath to namespace URIs.
 370
 371 The XPath subset available with this function is described at
 372
 373
 374
 375   http://www.w3.org/TR/xmlschema-1/#Selector
 376
 377 and matches the production
 378
 379
 380
 381   Path ::= ('.//')? ( Step '/' )* ( Step | '@' NameTest )
 382
 383 Returns a positive number in case of success and -1 in case of error
 384
 385
 386
 387 =back
 388
 389
 390 =head1 METHODS PROCESSING ATTRIBUTES
 391
 392 =over 4
 393
 394 =item attributeCount ()
 395
 396 Provides the number of attributes of the current node.
 397
 398
 399 =item hasAttributes ()
 400
 401 Whether the node has attributes.
 402
 403
 404 =item getAttribute (name)
 405
 406 Provides the value of the attribute with the specified qualified name.
 407
 408 Returns a string containing the value of the specified attribute, or undef in
 409 case of error.
 410
 411
 412 =item getAttributeNs (localName, namespaceURI)
 413
 414 Provides the value of the specified attribute.
 415
 416 Returns a string containing the value of the specified attribute, or undef in
 417 case of error.
 418
 419
 420 =item getAttributeNo (no)
 421
 422 Provides the value of the attribute with the specified index relative to the
 423 containing element.
 424
 425 Returns a string containing the value of the specified attribute, or undef in
 426 case of error.
 427
 428
 429 =item isDefault ()
 430
 431 Returns true if the current attribute node was generated from the default value
 432 defined in the DTD.
 433
 434
 435 =item moveToAttribute (name)
 436
 437 Moves the position to the attribute with the specified qualified
 438 name.
 439
 440 Returns 1 in case of success, -1 in case of error, 0 if not found
 441
 442
 443 =item moveToAttributeNo (no)
 444
 445 Moves the position to the attribute with the specified index relative to the
 446 containing element.
 447
 448 Returns 1 in case of success, -1 in case of error, 0 if not found
 449
 450
 451 =item moveToAttributeNs (localName,namespaceURI)
 452
 453 Moves the position to the attribute with the specified local name and namespace
 454 URI.
 455
 456 Returns 1 in case of success, -1 in case of error, 0 if not found
 457
 458
 459 =item moveToFirstAttribute ()
 460
 461 Moves the position to the first attribute associated with the current node.
 462
 463 Returns 1 in case of success, -1 in case of error, 0 if not found
 464
 465
 466 =item moveToNextAttribute ()
 467
 468 Moves the position to the next attribute associated with the current node.
 469
 470 Returns 1 in case of success, -1 in case of error, 0 if not found
 471
 472
 473 =item moveToElement ()
 474
 475 Moves the position to the node that contains the current attribute node.
 476
 477 Returns 1 in case of success, -1 in case of error, 0 if not moved
 478
 479
 480 =item isNamespaceDecl ()
 481
 482 Determine whether the current node is a namespace declaration rather than a
 483 regular attribute.
 484
 485 Returns 1 if the current node is a namespace declaration, 0 if it is a regular
 486 attribute or other type of node, or -1 in case of error.
 487
 488
 489
 490 =back
 491
 492
 493 =head1 OTHER METHODS
 494
 495 =over 4
 496
 497 =item lookupNamespace (prefix)
 498
 499 Resolves a namespace prefix in the scope of the current element.
 500
 501 Returns a string containing the namespace URI to which the prefix maps or undef
 502 in case of error.
 503
 504
 505 =item encoding ()
 506
 507 Returns a string containing the encoding of the document or undef in case of
 508 error.
 509
 510
 511 =item standalone ()
 512
 513 Determine the standalone status of the document being read. Returns 1 if the
 514 document was declared to be standalone, 0 if it was declared to be not
 515 standalone, or -1 if the document did not specify its standalone status or in
 516 case of error.
 517
 518
 519 =item xmlVersion ()
 520
 521 Determine the XML version of the document being read. Returns a string
 522 containing the XML version of the document or undef in case of error.
 523
 524
 525 =item baseURI ()
 526
 527 Returns the base URI of a given node.
 528
 529
 530 =item isValid ()
 531
 532 Retrieve the validity status from the parser.
 533
 534 Returns 1 if valid, 0 if not, and -1 in case of error.
 535
 536
 537 =item xmlLang ()
 538
 539 The xml:lang scope within which the node resides.
 540
 541
 542 =item lineNumber ()
 543
 544 Provide the line number of the current parsing point.
 545
 546
 547 =item columnNumber ()
 548
 549 Provide the column number of the current parsing point.
 550
 551
 552 =item byteConsumed ()
 553
 554 This function provides the current index of the parser relative to the start of
 555 the current entity. This function is computed in bytes from the beginning
 556 starting at zero and finishing at the size in bytes of the file if parsing a
 557 file. The function is of constant cost if the input is UTF-8 but can be costly
 558 if run on non-UTF-8 input.
 559
 560
 561 =item setParserProp (prop => value, ...)
 562
 563 Change the parser processing behaviour by changing some of its internal
 564 properties. The following properties are available with this function:
 565 ``load_ext_dtd'', ``complete_attributes'', ``validation'', ``expand_entities''.
 566
 567 Since some of the properties can only be changed before any read has been done,
 568 it is best to set the parsing properties at the constructor.
 569
 570 Returns 0 if the call was successful, or -1 in case of error
 571
 572
 573 =item getParserProp (prop)
 574
 575 Get the value of an internal parser property. The following property names can be
 576 used: ``load_ext_dtd'', ``complete_attributes'', ``validation'',
 577 ``expand_entities''.
 578
 579 Returns the value, usually 0 or 1, or -1 in case of error.
 580
 581
 582
 583 =back
 584
 585
 586 =head1 DESTRUCTION
 587
 588 XML::LibXML takes care of the reader object destruction when the last reference
 589 to the reader object goes out of scope. The document tree is preserved, though,
 590 if either of $reader->document or $reader->preserveNode was used and references
 591 to the document tree exist.
 592
 593
 594 =head1 NODE TYPES
 595
 596 The reader interface provides the following constants for node types (the
 597 constant symbols are exported by default or if tag C<<<<<< :types >>>>>> is used).
 598
 599
 600
 601   XML_READER_TYPE_NONE                    => 0
 602   XML_READER_TYPE_ELEMENT                 => 1
 603   XML_READER_TYPE_ATTRIBUTE               => 2
 604   XML_READER_TYPE_TEXT                    => 3
 605   XML_READER_TYPE_CDATA                   => 4
 606   XML_READER_TYPE_ENTITY_REFERENCE        => 5
 607   XML_READER_TYPE_ENTITY                  => 6
 608   XML_READER_TYPE_PROCESSING_INSTRUCTION  => 7
 609   XML_READER_TYPE_COMMENT                 => 8
 610   XML_READER_TYPE_DOCUMENT                => 9
 611   XML_READER_TYPE_DOCUMENT_TYPE           => 10
 612   XML_READER_TYPE_DOCUMENT_FRAGMENT       => 11
 613   XML_READER_TYPE_NOTATION                => 12
 614   XML_READER_TYPE_WHITESPACE              => 13
 615   XML_READER_TYPE_SIGNIFICANT_WHITESPACE  => 14
 616   XML_READER_TYPE_END_ELEMENT             => 15
 617   XML_READER_TYPE_END_ENTITY              => 16
 618   XML_READER_TYPE_XML_DECLARATION         => 17
 619
 620
 621 =head1 STATES
 622
 623 The following constants represent the values returned by C<<<<<< readState() >>>>>>. They are exported by default, or if tag C<<<<<< :states >>>>>> is used:
 624
 625
 626
 627   XML_READER_NONE      => -1
 628   XML_READER_START     =>  0
 629   XML_READER_ELEMENT   =>  1
 630   XML_READER_END       =>  2
 631   XML_READER_EMPTY     =>  3
 632   XML_READER_BACKTRACK =>  4
 633   XML_READER_DONE      =>  5
 634   XML_READER_ERROR     =>  6
 635
 636
 637 =head1 SEE ALSO
 638
 639 L<<<<<< XML::LibXML::Pattern >>>>>> for information about compiled patterns.
 640
 641 http://xmlsoft.org/html/libxml-xmlreader.html
 642
 643 http://dotgnu.org/pnetlib-doc/System/Xml/XmlTextReader.html
 644
 645
 646 =head1 ORIGINAL IMPLEMENTATION
 647
 648 Heiko Klein, E<lt>H.Klein@gmx.netE<gt> and Petr Pajas
 649
 650 =head1 AUTHORS
 651
 652 Matt Sergeant,
 653 Christian Glahn,
 654 Petr Pajas
 655
 656
 657 =head1 VERSION
 658
 659 1.70
 660
 661 =head1 COPYRIGHT
 662
 663 2001-2007, AxKit.com Ltd.
 664
 665 2002-2006, Christian Glahn.
 666
 667 2006-2009, Petr Pajas.
 668
 669 =cut