3 # Copyright (c) 1998-2000 Larry Wall and Clark Cooper
6 # This program is free software; you can redistribute it and/or
7 # modify it under the same terms as Perl itself.
14 require XML::Parser::Expat;
16 die "Parser.pm and Expat.pm versions don't match"
17 unless $VERSION eq $XML::Parser::Expat::VERSION;
22 use vars qw($VERSION $LWP_load_failed);
27 my ($class, %args) = @_;
28 my $style = $args{Style};
30 my $nonexopt = $args{Non_Expat_Options} ||= {};
32 $nonexopt->{Style} = 1;
33 $nonexopt->{Non_Expat_Options} = 1;
34 $nonexopt->{Handlers} = 1;
35 $nonexopt->{_HNDL_TYPES} = 1;
36 $nonexopt->{NoLWP} = 1;
38 $args{_HNDL_TYPES} = {%XML::Parser::Expat::Handler_Setters};
39 $args{_HNDL_TYPES}->{Init} = 1;
40 $args{_HNDL_TYPES}->{Final} = 1;
42 $args{Handlers} ||= {};
43 my $handlers = $args{Handlers};
45 if (defined($style)) {
46 my $stylepkg = $style;
48 if ($stylepkg !~ /::/) {
49 $stylepkg = "\u$style";
52 my $fullpkg = 'XML::Parser::Style::' . $stylepkg;
53 my $stylefile = $fullpkg;
54 $stylefile =~ s/::/\//g;
55 require "$stylefile.pm";
59 # fallback to old behaviour
60 $stylepkg = 'XML::Parser::' . $stylepkg;
65 foreach $htype (keys %{$args{_HNDL_TYPES}}) {
66 # Handlers explicitly given override
67 # handlers from the Style package
68 unless (defined($handlers->{$htype})) {
70 # A handler in the style package must either have
71 # exactly the right case as the type name or a
72 # completely lower case version of it.
74 my $hname = "${stylepkg}::$htype";
75 if (defined(&$hname)) {
76 $handlers->{$htype} = \&$hname;
80 $hname = "${stylepkg}::\L$htype";
81 if (defined(&$hname)) {
82 $handlers->{$htype} = \&$hname;
89 unless (defined($handlers->{ExternEnt})
90 or defined ($handlers->{ExternEntFin})) {
92 if ($args{NoLWP} or $LWP_load_failed) {
93 $handlers->{ExternEnt} = \&file_ext_ent_handler;
94 $handlers->{ExternEntFin} = \&file_ext_ent_cleanup;
97 # The following just bootstraps the real LWP external entity
100 $handlers->{ExternEnt} = \&initial_ext_ent_handler;
102 # No cleanup function available until LWPExternEnt.pl loaded
106 $args{Pkg} ||= caller;
107 bless \%args, $class;
111 my ($self, @handler_pairs) = @_;
113 croak("Uneven number of arguments to setHandlers method")
114 if (int(@handler_pairs) & 1);
117 while (@handler_pairs) {
118 my $type = shift @handler_pairs;
119 my $handler = shift @handler_pairs;
120 unless (defined($self->{_HNDL_TYPES}->{$type})) {
121 my @types = sort keys %{$self->{_HNDL_TYPES}};
123 croak("Unknown Parser handler type: $type\n Valid types: @types");
125 push(@ret, $type, $self->{Handlers}->{$type});
126 $self->{Handlers}->{$type} = $handler;
134 my @expat_options = ();
137 while (($key, $val) = each %{$self}) {
138 push (@expat_options, $key, $val)
139 unless exists $self->{Non_Expat_Options}->{$key};
142 my %handlers = %{$self->{Handlers}};
143 my $init = delete $handlers{Init};
144 my $final = delete $handlers{Final};
146 my $expatnb = new XML::Parser::ExpatNB(@expat_options, @_);
147 $expatnb->setHandlers(%handlers);
152 $expatnb->{_State_} = 1;
154 $expatnb->{FinalHandler} = $final
163 my @expat_options = ();
165 while (($key, $val) = each %{$self}) {
166 push(@expat_options, $key, $val)
167 unless exists $self->{Non_Expat_Options}->{$key};
170 my $expat = new XML::Parser::Expat(@expat_options, @_);
171 my %handlers = %{$self->{Handlers}};
172 my $init = delete $handlers{Init};
173 my $final = delete $handlers{Final};
175 $expat->setHandlers(%handlers);
178 $expat->base($self->{Base});
187 $result = $expat->parse($arg);
195 if ($result and defined($final)) {
197 @result = &$final($expat);
200 $result = &$final($expat);
206 return unless defined wantarray;
207 return wantarray ? @result : $result;
219 open(FILE, $file) or croak "Couldn't open $file:\n$!";
224 $self->{Base} = $file;
228 @ret = $self->parse(*FILE, @_);
233 $ret = $self->parse(*FILE, @_);
240 return unless defined wantarray;
241 return wantarray ? @ret : $ret;
244 sub initial_ext_ent_handler {
245 # This just bootstraps in the real lwp_ext_ent_handler which
246 # also loads the URI and LWP modules.
248 unless ($LWP_load_failed) {
253 require('XML/Parser/LWPExternEnt.pl');
257 $_[0]->setHandlers(ExternEnt => \&lwp_ext_ent_handler,
258 ExternEntFin => \&lwp_ext_ent_cleanup);
260 goto &lwp_ext_ent_handler;
263 # Failed to load lwp handler, act as if NoLWP
265 $LWP_load_failed = 1;
267 my $cmsg = "Couldn't load LWP based external entity handler\n";
268 $cmsg .= "Switching to file-based external entity handler\n";
269 $cmsg .= " (To avoid this message, use NoLWP option to XML::Parser)\n";
273 $_[0]->setHandlers(ExternEnt => \&file_ext_ent_handler,
274 ExternEntFin => \&file_ext_ent_cleanup);
275 goto &file_ext_ent_handler;
279 sub file_ext_ent_handler {
280 my ($xp, $base, $path) = @_;
282 # Prepend base only for relative paths
285 and not ($path =~ m!^(?:[\\/]|\w+:)!))
288 $newpath =~ s![^\\/:]*$!$path!;
292 if ($path =~ /^\s*[|>+]/
293 or $path =~ /\|\s*$/) {
295 .= "System ID ($path) contains Perl IO control characters";
300 my $fh = new IO::File($path);
301 unless (defined $fh) {
303 .= "Failed to open $path:\n$!";
307 $xp->{_BaseStack} ||= [];
308 $xp->{_FhStack} ||= [];
310 push(@{$xp->{_BaseStack}}, $base);
311 push(@{$xp->{_FhStack}}, $fh);
318 sub file_ext_ent_cleanup {
321 my $fh = pop(@{$xp->{_FhStack}});
324 my $base = pop(@{$xp->{_BaseStack}});
334 XML::Parser - A perl module for parsing XML documents
340 $p1 = new XML::Parser(Style => 'Debug');
341 $p1->parsefile('REC-xml-19980210.xml');
342 $p1->parse('<foo id="me">Hello World</foo>');
345 $p2 = new XML::Parser(Handlers => {Start => \&handle_start,
347 Char => \&handle_char});
350 # Another alternative
351 $p3 = new XML::Parser(ErrorContext => 2);
353 $p3->setHandlers(Char => \&text,
356 open(FOO, 'xmlgenerator |');
357 $p3->parse(*FOO, ProtocolEncoding => 'ISO-8859-1');
360 $p3->parsefile('junk.xml', ErrorContext => 3);
369 This module provides ways to parse XML documents. It is built on top of
370 L<XML::Parser::Expat>, which is a lower level interface to James Clark's
371 expat library. Each call to one of the parsing methods creates a new
372 instance of XML::Parser::Expat which is then used to parse the document.
373 Expat options may be provided when the XML::Parser object is created.
374 These options are then passed on to the Expat object on each parse call.
375 They can also be given as extra arguments to the parse methods, in which
376 case they override options given at XML::Parser creation time.
378 The behavior of the parser is controlled either by the C<L</Style>> and/or
379 C<L</Handlers>> options, or by the L</setHandlers> method. These all provide
380 mechanisms for XML::Parser to set the handlers needed by XML::Parser::Expat.
381 If neither C<Style> nor C<Handlers> is specified, then parsing just
382 checks that the document is well-formed.
384 When underlying handlers get called, they receive as their first parameter
385 the I<Expat> object, not the Parser object.
393 This is a class method, the constructor for XML::Parser. Options are passed
394 as keyword value pairs. Recognized options are:
400 This option provides an easy way to create a given style of parser. The
401 built in styles are: L<"Debug">, L<"Subs">, L<"Tree">, L<"Objects">,
402 and L<"Stream">. These are all defined in separate packages under
403 C<XML::Parser::Style::*>, and you can find further documentation for
404 each style both below, and in those packages.
406 Custom styles can be provided by giving a full package name containing
407 at least one '::'. This package should then have subs defined for each
408 handler it wishes to have installed. See L<"STYLES"> below
409 for a discussion of each built in style.
413 When provided, this option should be an anonymous hash containing as
414 keys the type of handler and as values a sub reference to handle that
415 type of event. All the handlers get passed as their 1st parameter the
416 instance of expat that is parsing the document. Further details on
417 handlers can be found in L<"HANDLERS">. Any handler set here
418 overrides the corresponding handler set with the Style option.
422 Some styles will refer to subs defined in this package. If not provided,
423 it defaults to the package which called the constructor.
427 This is an Expat option. When this option is defined, errors are reported
428 in context. The value should be the number of lines to show on either side
429 of the line in which the error occurred.
431 =item * ProtocolEncoding
433 This is an Expat option. This sets the protocol encoding name. It defaults
434 to none. The built-in encodings are: C<UTF-8>, C<ISO-8859-1>, C<UTF-16>, and
435 C<US-ASCII>. Other encodings may be used if they have encoding maps in one
436 of the directories in the @Encoding_Path list. Check L<"ENCODINGS"> for
437 more information on encoding maps. Setting the protocol encoding overrides
438 any encoding in the XML declaration.
442 This is an Expat option. If this is set to a true value, then namespace
443 processing is done during the parse. See L<XML::Parser::Expat/"Namespaces">
444 for further discussion of namespace processing.
448 This is an Expat option. Normally, the parser will try to expand references
449 to entities defined in the internal subset. If this option is set to a true
450 value, and a default handler is also set, then the default handler will be
451 called when an entity reference is seen in text. This has no effect if a
452 default handler has not been registered, and it has no effect on the expansion
453 of entity references inside attribute values.
455 =item * Stream_Delimiter
457 This is an Expat option. It takes a string value. When this string is found
458 alone on a line while parsing from a stream, then the parse is ended as if it
459 saw an end of file. The intended use is with a stream of xml documents in a
460 MIME multipart format. The string should not contain a trailing newline.
462 =item * ParseParamEnt
464 This is an Expat option. Unless standalone is set to "yes" in the XML
465 declaration, setting this to a true value allows the external DTD to be read,
466 and parameter entities to be parsed and expanded.
470 This option has no effect if the ExternEnt or ExternEntFin handlers are
471 directly set. Otherwise, if true, it forces the use of a file based external
474 =item * Non_Expat_Options
476 If provided, this should be an anonymous hash whose keys are options that
477 shouldn't be passed to Expat. This should only be of concern to those
478 subclassing XML::Parser.
482 =item setHandlers(TYPE, HANDLER [, TYPE, HANDLER [...]])
484 This method registers handlers for various parser events. It overrides any
485 previous handlers registered through the Style or Handlers options or through
486 earlier calls to setHandlers. By providing a false or undefined value as
487 the handler, the existing handler can be unset.
489 This method returns a list of type, handler pairs corresponding to the
490 input. The handlers returned are the ones that were in effect prior to
493 See a description of the handler types in L<"HANDLERS">.
495 =item parse(SOURCE [, OPT => OPT_VALUE [...]])
497 The SOURCE parameter should either be a string containing the whole XML
498 document, or it should be an open IO::Handle. Constructor options to
499 XML::Parser::Expat given as keyword-value pairs may follow the SOURCE
500 parameter. These override, for this call, any options or attributes passed
501 through from the XML::Parser instance.
503 This method dies if a parse error occurs. Otherwise it will return 1
504 or whatever is returned from the B<Final> handler, if one is installed.
505 In other words, what parse may return depends on the style.
509 This is just an alias for parse for backwards compatibility.
511 =item parsefile(FILE [, OPT => OPT_VALUE [...]])
513 Open FILE for reading, then call parse with the open handle. The file
514 is closed no matter how parse returns. Returns what parse returns.
516 =item parse_start([ OPT => OPT_VALUE [...]])
518 Create and return a new instance of XML::Parser::ExpatNB. Constructor
519 options may be provided. If an init handler has been provided, it is
520 called before returning the ExpatNB object. Documents are parsed by
521 making incremental calls to the parse_more method of this object, which
522 takes a string. A single call to the parse_done method of this object,
523 which takes no arguments, indicates that the document is finished.
525 If there is a final handler installed, it is executed by the parse_done
526 method before returning and the parse_done method returns whatever is
527 returned by the final handler.
533 Expat is an event based parser. As the parser recognizes parts of the
534 document (say the start or end tag for an XML element), then any handlers
535 registered for that type of an event are called with suitable parameters.
536 All handlers receive an instance of XML::Parser::Expat as their first
537 argument. See L<XML::Parser::Expat/"METHODS"> for a discussion of the
538 methods that can be called on this object.
542 This is called just before the parsing of the document starts.
546 This is called just after parsing has finished, but only if no errors
547 occurred during the parse. Parse returns what this returns.
549 =head2 Start (Expat, Element [, Attr, Val [,...]])
551 This event is generated when an XML start tag is recognized. Element is the
552 name of the XML element type that is opened with the start tag. The Attr &
553 Val pairs are generated for each attribute in the start tag.
555 =head2 End (Expat, Element)
557 This event is generated when an XML end tag is recognized. Note that
558 an XML empty tag (<foo/>) generates both a start and an end event.
560 =head2 Char (Expat, String)
562 This event is generated when non-markup is recognized. The non-markup
563 sequence of characters is in String. A single non-markup sequence of
564 characters may generate multiple calls to this handler. Whatever the
565 encoding of the string in the original document, this is given to the
568 =head2 Proc (Expat, Target, Data)
570 This event is generated when a processing instruction is recognized.
572 =head2 Comment (Expat, Data)
574 This event is generated when a comment is recognized.
576 =head2 CdataStart (Expat)
578 This is called at the start of a CDATA section.
580 =head2 CdataEnd (Expat)
582 This is called at the end of a CDATA section.
584 =head2 Default (Expat, String)
586 This is called for any characters that don't have a registered handler.
587 This includes both characters that are part of markup for which no
588 events are generated (markup declarations) and characters that
589 could generate events, but for which no handler has been registered.
591 Whatever the encoding in the original document, the string is returned to
592 the handler in UTF-8.
594 =head2 Unparsed (Expat, Entity, Base, Sysid, Pubid, Notation)
596 This is called for a declaration of an unparsed entity. Entity is the name
597 of the entity. Base is the base to be used for resolving a relative URI.
598 Sysid is the system id. Pubid is the public id. Notation is the notation
599 name. Base and Pubid may be undefined.
601 =head2 Notation (Expat, Notation, Base, Sysid, Pubid)
603 This is called for a declaration of notation. Notation is the notation name.
604 Base is the base to be used for resolving a relative URI. Sysid is the system
605 id. Pubid is the public id. Base, Sysid, and Pubid may all be undefined.
607 =head2 ExternEnt (Expat, Base, Sysid, Pubid)
609 This is called when an external entity is referenced. Base is the base to be
610 used for resolving a relative URI. Sysid is the system id. Pubid is the public
611 id. Base and Pubid may be undefined.
613 This handler should either return a string, which represents the contents of
614 the external entity, or return an open filehandle that can be read to obtain
615 the contents of the external entity, or return undef, which indicates the
616 external entity couldn't be found and will generate a parse error.
618 If an open filehandle is returned, it must be returned as either a glob
619 (*FOO) or as a reference to a glob (e.g. an instance of IO::Handle).
621 A default handler is installed for this event. The default handler is
622 XML::Parser::lwp_ext_ent_handler unless the NoLWP option was provided with
623 a true value, otherwise XML::Parser::file_ext_ent_handler is the default
624 handler for external entities. Even without the NoLWP option, if the
625 URI or LWP modules are missing, the file based handler ends up being used
626 after giving a warning on the first external entity reference.
628 The LWP external entity handler will use proxies defined in the environment
629 (http_proxy, ftp_proxy, etc.).
631 Please note that the LWP external entity handler reads the entire
632 entity into a string and returns it, whereas the file handler opens a
635 Also note that the file external entity handler will likely choke on
636 absolute URIs or file names that don't fit the conventions of the local
639 The expat base method can be used to set a basename for
640 relative pathnames. If no basename is given, or if the basename is itself
641 a relative name, then it is relative to the current working directory.
643 =head2 ExternEntFin (Expat)
645 This is called after parsing an external entity. It's not called unless
646 an ExternEnt handler is also set. There is a default handler installed
647 that pairs with the default ExternEnt handler.
649 If you're going to install your own ExternEnt handler, then you should
650 set (or unset) this handler too.
652 =head2 Entity (Expat, Name, Val, Sysid, Pubid, Ndata, IsParam)
654 This is called when an entity is declared. For internal entities, the Val
655 parameter will contain the value and the remaining three parameters will be
656 undefined. For external entities, the Val parameter will be undefined, the
657 Sysid parameter will have the system id, the Pubid parameter will have the
658 public id if it was provided (it will be undefined otherwise), the Ndata
659 parameter will contain the notation for unparsed entities. If this is a
660 parameter entity declaration, then the IsParam parameter is true.
662 Note that this handler and the Unparsed handler above overlap. If both are
663 set, then this handler will not be called for unparsed entities.
665 =head2 Element (Expat, Name, Model)
667 The element handler is called when an element declaration is found. Name
668 is the element name, and Model is the content model as an XML::Parser::ContentModel
669 object. See L<XML::Parser::Expat/"XML::Parser::ContentModel Methods">
670 for methods available for this class.
672 =head2 Attlist (Expat, Elname, Attname, Type, Default, Fixed)
674 This handler is called for each attribute in an ATTLIST declaration.
675 So an ATTLIST declaration that has multiple attributes will generate multiple
676 calls to this handler. The Elname parameter is the name of the element with
677 which the attribute is being associated. The Attname parameter is the name
678 of the attribute. Type is the attribute type, given as a string. Default is
679 the default value, which will either be "#REQUIRED", "#IMPLIED" or a quoted
680 string (i.e. the returned string will begin and end with a quote character).
681 If Fixed is true, then this is a fixed attribute.
683 =head2 Doctype (Expat, Name, Sysid, Pubid, Internal)
685 This handler is called for DOCTYPE declarations. Name is the document type
686 name. Sysid is the system id of the document type, if it was provided,
687 otherwise it's undefined. Pubid is the public id of the document type,
688 which will be undefined if no public id was given. Internal is the internal
689 subset, given as a string. If there was no internal subset, it will be
690 undefined. Internal will contain all whitespace, comments, processing
691 instructions, and declarations seen in the internal subset. The declarations
692 will be there whether or not they have been processed by another handler
693 (except for unparsed entities processed by the Unparsed handler). However,
694 comments and processing instructions will not appear if they've been processed
695 by their respective handlers.
697 =head2 DoctypeFin (Expat)
699 This handler is called after parsing of the DOCTYPE declaration has finished,
700 including any internal or external DTD declarations.
702 =head2 XMLDecl (Expat, Version, Encoding, Standalone)
704 This handler is called for XML declarations. Version is a string containing
705 the version. Encoding is either undefined or contains an encoding string.
706 Standalone will be true, false, or undefined if the standalone attribute
707 is yes, no, or not given, respectively.
713 This just prints out the document in outline form. Nothing special is
718 Each time an element starts, a sub by that name in the package specified
719 by the Pkg option is called with the same parameters that the Start
720 handler gets called with.
722 Each time an element ends, a sub with that name appended with an underscore
723 ("_"), is called with the same parameters that the End handler gets called
726 Nothing special is returned by parse.
730 Parse will return a parse tree for the document. Each node in the tree
731 takes the form of a tag, content pair. Text nodes are represented with
732 a pseudo-tag of "0" and the string that is their content. For elements,
733 the content is an array reference. The first item in the array is a
734 (possibly empty) hash reference containing attributes. The remainder of
735 the array is a sequence of tag-content pairs representing the content
738 So for example the result of parsing:
740 <foo><head id="a">Hello <em>there</em></head><bar>Howdy<ref/></bar>do</foo>
745 ==================================================================
746 [foo, [{}, head, [{id => "a"}, 0, "Hello ", em, [{}, 0, "there"]],
747 bar, [ {}, 0, "Howdy", ref, [{}]],
752 The root document "foo", has 3 children: a "head" element, a "bar"
753 element and the text "do". After the empty attribute hash, these are
754 represented in its contents by 3 tag-content pairs.
758 This is similar to the Tree style, except that a hash object is created for
759 each element. The corresponding object will be in the class whose name
760 is created by appending "::" and the element name to the package set with
761 the Pkg option. Non-markup text will be in the ::Characters class. The
762 contents of the corresponding object will be in an anonymous array that
763 is the value of the Kids property for that object.
767 This style also uses the Pkg package. If none of the subs that this
768 style looks for is there, then the effect of parsing with this style is
769 to print a canonical copy of the document without comments or declarations.
770 All the subs receive as their 1st parameter the Expat instance for the
771 document they're parsing.
773 It looks for the following routines:
777 =item * StartDocument
779 Called at the start of the parse.
783 Called for every start tag with a second parameter of the element type. The $_
784 variable will contain a copy of the tag and the %_ variable will contain
785 attribute values supplied for that element.
789 Called for every end tag with a second parameter of the element type. The $_
790 variable will contain a copy of the end tag.
794 Called just before start or end tags with accumulated non-markup text in
799 Called for processing instructions. The $_ variable will contain a copy of
800 the PI and the target and data are sent as 2nd and 3rd parameters
805 Called at conclusion of the parse.
811 XML documents may be encoded in character sets other than Unicode as
812 long as they may be mapped into the Unicode character set. Expat has
813 further restrictions on encodings. Read the xmlparse.h header file in
814 the expat distribution to see details on these restrictions.
816 Expat has built-in encodings for: C<UTF-8>, C<ISO-8859-1>, C<UTF-16>, and
817 C<US-ASCII>. Encodings are set either through the XML declaration
818 encoding attribute or through the ProtocolEncoding option to XML::Parser
819 or XML::Parser::Expat.
821 For encodings other than the built-ins, expat calls the function
822 load_encoding in the Expat package with the encoding name. This function
823 looks for a file in the path list @XML::Parser::Expat::Encoding_Path, that
824 matches the lower-cased name with a '.enc' extension. The first one it
827 If you wish to build your own encoding maps, check out the XML::Encoding
832 Larry Wall <F<larry@wall.org>> wrote version 1.0.
834 Clark Cooper <F<coopercc@netheaven.com>> picked up support, changed the API
835 for this version (2.x), provided documentation,
836 and added some standard package features.
838 Matt Sergeant <F<matt@sergeant.org>> is now maintaining XML::Parser