1 # $Id: PurePerl.pm,v 1.28 2008-08-05 12:36:51 grant Exp $
3 package XML::SAX::PurePerl;
10 use XML::SAX::PurePerl::Productions qw($NameChar $SingleChar);
11 use XML::SAX::PurePerl::Reader;
12 use XML::SAX::PurePerl::EncodingDetect ();
13 use XML::SAX::Exception;
14 use XML::SAX::PurePerl::DocType ();
15 use XML::SAX::PurePerl::DTDDecls ();
16 use XML::SAX::PurePerl::XMLDecl ();
17 use XML::SAX::DocumentLocator ();
18 use XML::SAX::Base ();
19 use XML::SAX qw(Namespaces);
20 use XML::NamespaceSupport ();
24 require XML::SAX::PurePerl::NoUnicodeExt;
27 require XML::SAX::PurePerl::UnicodeExt;
31 @ISA = ('XML::SAX::Base');
41 my $xmlns_ns = "http://www.w3.org/2000/xmlns/";
42 my $xml_ns = "http://www.w3.org/XML/1998/namespace";
45 sub _parse_characterstream {
48 confess("CharacterStream is not yet correctly implemented");
49 my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh);
50 return $self->_parse($reader);
53 sub _parse_bytestream {
56 my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh);
57 return $self->_parse($reader);
63 my $reader = XML::SAX::PurePerl::Reader::String->new($str);
64 return $self->_parse($reader);
70 my $reader = XML::SAX::PurePerl::Reader::URI->new($uri);
71 return $self->_parse($reader);
75 my ($self, $reader) = @_;
77 $reader->public_id($self->{ParseOptions}{Source}{PublicId});
78 $reader->system_id($self->{ParseOptions}{Source}{SystemId});
80 $self->{NSHelper} = XML::NamespaceSupport->new({xmlns => 1});
82 $self->set_document_locator(
83 XML::SAX::DocumentLocator->new(
84 sub { $reader->public_id },
85 sub { $reader->system_id },
86 sub { $reader->line },
87 sub { $reader->column },
88 sub { $reader->get_encoding },
89 sub { $reader->get_xml_version },
93 $self->start_document({});
95 if (defined $self->{ParseOptions}{Source}{Encoding}) {
96 $reader->set_encoding($self->{ParseOptions}{Source}{Encoding});
99 $self->encoding_detect($reader);
103 $self->document($reader);
105 return $self->end_document({});
110 my ($error, $reader) = @_;
112 # warn("parser error: $error from ", $reader->line, " : ", $reader->column, "\n");
113 my $exception = XML::SAX::Exception::Parse->new(
115 ColumnNumber => $reader->column,
116 LineNumber => $reader->line,
117 PublicId => $reader->public_id,
118 SystemId => $reader->system_id,
121 $self->fatal_error($exception);
126 my ($self, $reader) = @_;
128 # document ::= prolog element Misc*
130 $self->prolog($reader);
131 $self->element($reader) ||
132 $self->parser_error("Document requires an element", $reader);
134 while(length($reader->data)) {
135 $self->Misc($reader) ||
136 $self->parser_error("Only Comments, PIs and whitespace allowed at end of document", $reader);
141 my ($self, $reader) = @_;
143 $self->XMLDecl($reader);
145 # consume all misc bits
146 1 while($self->Misc($reader));
148 if ($self->doctypedecl($reader)) {
149 while (length($reader->data)) {
150 $self->Misc($reader) || last;
156 my ($self, $reader) = @_;
158 return 0 unless $reader->match('<');
160 my $name = $self->Name($reader) || $self->parser_error("Invalid element name", $reader);
164 while( my ($k, $v) = $self->Attribute($reader) ) {
168 my $have_namespaces = $self->get_feature(Namespaces);
170 # Namespace processing
171 $self->{NSHelper}->push_context;
173 # my %attrs = @attribs;
174 # while (my ($k,$v) = each %attrs) {
175 if ($have_namespaces) {
176 while ( my ($k, $v) = each %attribs ) {
177 if ($k =~ m/^xmlns(:(.*))?$/) {
178 my $prefix = $2 || '';
179 $self->{NSHelper}->declare_prefix($prefix, $v);
186 $self->SUPER::start_prefix_mapping($ns);
191 # Create element object and fire event
193 while (my ($name, $value) = each %attribs ) {
194 # TODO normalise value here
195 my ($ns, $prefix, $lname);
196 if ($have_namespaces) {
197 ($ns, $prefix, $lname) = $self->{NSHelper}->process_attribute_name($name);
199 $ns ||= ''; $prefix ||= ''; $lname ||= '';
200 $attrib_hash{"{$ns}$lname"} = {
209 %attribs = (); # lose the memory since we recurse deep
211 my ($ns, $prefix, $lname);
212 if ($self->get_feature(Namespaces)) {
213 ($ns, $prefix, $lname) = $self->{NSHelper}->process_element_name($name);
218 $ns ||= ''; $prefix ||= ''; $lname ||= '';
220 # Process remainder of start_element
221 $self->skip_whitespace($reader);
223 my $data = $reader->data(2);
224 if ($data =~ /^\/>/) {
225 $reader->move_along(2);
228 $data =~ /^>/ or $self->parser_error("No close element tag", $reader);
229 $reader->move_along(1);
239 Attributes => \%attrib_hash,
241 $self->start_element($el);
246 $self->content($reader);
248 my $data = $reader->data(2);
249 $data =~ /^<\// or $self->parser_error("No close tag marker", $reader);
250 $reader->move_along(2);
251 my $end_name = $self->Name($reader);
252 $end_name eq $name || $self->parser_error("End tag mismatch ($end_name != $name)", $reader);
253 $self->skip_whitespace($reader);
254 $reader->match('>') or $self->parser_error("No close '>' on end tag", $reader);
258 delete $end_el{Attributes};
259 $self->end_element(\%end_el);
261 for my $ns (@new_ns) {
262 $self->end_prefix_mapping($ns);
264 $self->{NSHelper}->pop_context;
270 my ($self, $reader) = @_;
273 $self->CharData($reader);
275 my $data = $reader->data(2);
277 if ($data =~ /^<\//) {
280 elsif ($data =~ /^&/) {
281 $self->Reference($reader) or $self->parser_error("bare & not allowed in content", $reader);
284 elsif ($data =~ /^<!/) {
285 ($self->CDSect($reader)
287 $self->Comment($reader))
290 elsif ($data =~ /^<\?/) {
291 $self->PI($reader) and next;
293 elsif ($data =~ /^</) {
294 $self->element($reader) and next;
303 my ($self, $reader) = @_;
305 my $data = $reader->data(9);
306 return 0 unless $data =~ /^<!\[CDATA\[/;
307 $reader->move_along(9);
309 $self->start_cdata({});
311 $data = $reader->data;
313 $self->parser_error("EOF looking for CDATA section end", $reader)
314 unless length($data);
316 if ($data =~ /^(.*?)\]\]>/s) {
318 $reader->move_along(length($chars) + 3);
319 $self->characters({Data => $chars});
323 $self->characters({Data => $data});
324 $reader->move_along(length($data));
325 $data = $reader->data;
328 $self->end_cdata({});
333 my ($self, $reader) = @_;
335 my $data = $reader->data;
338 return unless length($data);
340 if ($data =~ /^([^<&]*)[<&]/s) {
342 $self->parser_error("String ']]>' not allowed in character data", $reader)
343 if $chars =~ /\]\]>/;
344 $reader->move_along(length($chars));
345 $self->characters({Data => $chars}) if length($chars);
349 $self->characters({Data => $data});
350 $reader->move_along(length($data));
351 $data = $reader->data;
357 my ($self, $reader) = @_;
358 if ($self->Comment($reader)) {
361 elsif ($self->PI($reader)) {
364 elsif ($self->skip_whitespace($reader)) {
372 my ($self, $reader) = @_;
374 return 0 unless $reader->match('&');
376 my $data = $reader->data;
378 # Fetch more data if we have an incomplete numeric reference
379 if ($data =~ /^(#\d*|#x[0-9a-fA-F]*)$/) {
380 $data = $reader->data(length($data) + 6);
383 if ($data =~ /^#x([0-9a-fA-F]+);/) {
385 $reader->move_along(length($ref) + 3);
386 my $char = chr_ref(hex($ref));
387 $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader)
388 unless $char =~ /$SingleChar/o;
389 $self->characters({ Data => $char });
392 elsif ($data =~ /^#([0-9]+);/) {
394 $reader->move_along(length($ref) + 2);
395 my $char = chr_ref($ref);
396 $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader)
397 unless $char =~ /$SingleChar/o;
398 $self->characters({ Data => $char });
403 my $name = $self->Name($reader)
404 || $self->parser_error("Invalid name in entity", $reader);
405 $reader->match(';') or $self->parser_error("No semi-colon found after entity name", $reader);
407 # warn("got entity: \&$name;\n");
410 if ($self->_is_entity($name)) {
412 if ($self->_is_external($name)) {
413 my $value = $self->_get_entity($name);
414 my $ent_reader = XML::SAX::PurePerl::Reader::URI->new($value);
415 $self->encoding_detect($ent_reader);
416 $self->extParsedEnt($ent_reader);
419 my $value = $self->_stringify_entity($name);
420 my $ent_reader = XML::SAX::PurePerl::Reader::String->new($value);
421 $self->content($ent_reader);
425 elsif ($name =~ /^(?:amp|gt|lt|quot|apos)$/) {
426 $self->characters({ Data => $int_ents{$name} });
430 $self->parser_error("Undeclared entity", $reader);
436 my ($self, $name, $reader) = @_;
437 if ($name =~ /^#x([0-9a-fA-F]+)$/) {
438 my $chr = chr_ref(hex($1));
439 $chr =~ /$SingleChar/o or $self->parser_error("Character reference '&$name;' refers to an illegal XML character", $reader);
442 elsif ($name =~ /^#([0-9]+)$/) {
443 my $chr = chr_ref($1);
444 $chr =~ /$SingleChar/o or $self->parser_error("Character reference '&$name;' refers to an illegal XML character", $reader);
448 if ($self->_is_entity($name)) {
449 if ($self->_is_external($name)) {
450 $self->parser_error("No external entity references allowed in attribute values", $reader);
453 my $value = $self->_stringify_entity($name);
457 elsif ($name =~ /^(?:amp|lt|gt|quot|apos)$/) {
458 return $int_ents{$name};
461 $self->parser_error("Undeclared entity '$name'", $reader);
467 my ($self, $reader) = @_;
469 $self->TextDecl($reader);
470 $self->content($reader);
474 my ($self, $name) = @_;
475 # TODO: Fix this to use $reader to store the entities perhaps.
476 if ($self->{ParseOptions}{external_entities}{$name}) {
483 my ($self, $name) = @_;
485 if (exists $self->{ParseOptions}{entities}{$name}) {
491 sub _stringify_entity {
492 my ($self, $name) = @_;
494 if (exists $self->{ParseOptions}{expanded_entity}{$name}) {
495 return $self->{ParseOptions}{expanded_entity}{$name};
498 my $reader = XML::SAX::PurePerl::Reader::URI->new($self->{ParseOptions}{entities}{$name});
501 my $data = $reader->data;
503 $reader->move_along(length($data)) or last;
505 return $self->{ParseOptions}{expanded_entity}{$name} = $ent;
509 my ($self, $name) = @_;
511 return $self->{ParseOptions}{entities}{$name};
514 sub skip_whitespace {
515 my ($self, $reader) = @_;
517 my $data = $reader->data;
520 while ($data =~ s/^([\x20\x0A\x0D\x09]*)//) {
521 last unless length($1);
523 $reader->move_along(length($1));
524 $data = $reader->data;
531 my ($self, $reader) = @_;
533 $self->skip_whitespace($reader) || return;
535 my $data = $reader->data(2);
536 return if $data =~ /^\/?>/;
538 if (my $name = $self->Name($reader)) {
539 $self->skip_whitespace($reader);
540 $reader->match('=') or $self->parser_error("No '=' in Attribute", $reader);
541 $self->skip_whitespace($reader);
542 my $value = $self->AttValue($reader);
544 if (!$self->cdata_attrib($name)) {
545 $value =~ s/^\x20*//; # discard leading spaces
546 $value =~ s/\x20*$//; # discard trailing spaces
547 $value =~ s/ {1,}/ /g; # all >1 space to single space
550 return $name, $value;
557 # TODO implement this!
562 my ($self, $reader) = @_;
564 my $quote = $self->quote($reader);
569 my $data = $reader->data;
570 $self->parser_error("EOF found while looking for the end of attribute value", $reader)
571 unless length($data);
572 if ($data =~ /^([^$quote]*)$quote/) {
573 $reader->move_along(length($1) + 1);
579 $reader->move_along(length($data));
584 $self->parser_error("< character not allowed in attribute values", $reader);
587 $value =~ s/[\x09\x0A\x0D]/\x20/g;
588 $value =~ s/&(#(x[0-9a-fA-F]+)|#([0-9]+)|\w+);/$self->AttReference($1, $reader)/geo;
594 my ($self, $reader) = @_;
596 my $data = $reader->data(4);
597 if ($data =~ /^<!--/) {
598 $reader->move_along(4);
599 my $comment_str = '';
601 my $data = $reader->data;
602 $self->parser_error("End of data seen while looking for close comment marker", $reader)
603 unless length($data);
604 if ($data =~ /^(.*?)-->/s) {
606 $self->parser_error("Invalid comment (dash)", $reader) if $comment_str =~ /-$/;
607 $reader->move_along(length($1) + 3);
611 $comment_str .= $data;
612 $reader->move_along(length($data));
616 $self->comment({ Data => $comment_str });
624 my ($self, $reader) = @_;
626 my $data = $reader->data(2);
628 if ($data =~ /^<\?/) {
629 $reader->move_along(2);
631 $target = $self->Name($reader) ||
632 $self->parser_error("PI has no target", $reader);
635 if ($self->skip_whitespace($reader)) {
637 my $data = $reader->data;
638 $self->parser_error("End of data seen while looking for close PI marker", $reader)
639 unless length($data);
640 if ($data =~ /^(.*?)\?>/s) {
642 $reader->move_along(length($1) + 2);
647 $reader->move_along(length($data));
652 my $data = $reader->data(2);
653 $data =~ /^\?>/ or $self->parser_error("PI closing sequence not found", $reader);
654 $reader->move_along(2);
657 $self->processing_instruction({ Target => $target, Data => $pi_data });
665 my ($self, $reader) = @_;
669 my $data = $reader->data;
670 return unless length($data);
671 $data =~ /^([^\s>\/&\?;=<\)\(\[\],\%\#\!\*\|]*)/ or return;
673 my $len = length($1);
674 $reader->move_along($len);
675 last if ($len != length($data));
678 return unless length($name);
680 $name =~ /$NameChar/o or $self->parser_error("Name <$name> does not match NameChar production", $reader);
686 my ($self, $reader) = @_;
688 my $data = $reader->data;
690 $data =~ /^(['"])/ or $self->parser_error("Invalid quote token", $reader);
691 $reader->move_along(1);
700 XML::SAX::PurePerl - Pure Perl XML Parser with SAX2 interface
704 use XML::Handler::Foo;
705 use XML::SAX::PurePerl;
706 my $handler = XML::Handler::Foo->new();
707 my $parser = XML::SAX::PurePerl->new(Handler => $handler);
708 $parser->parse_uri("myfile.xml");
712 This module implements an XML parser in pure perl. It is written around the
713 upcoming perl 5.8's unicode support and support for multiple document
714 encodings (using the PerlIO layer), however it has been ported to work with
715 ASCII/UTF8 documents under lower perl versions.
717 The SAX2 API is described in detail at http://sourceforge.net/projects/perl-xml/, in
718 the CVS archive, under libxml-perl/docs. Hopefully those documents will be in a
719 better location soon.
721 Please refer to the SAX2 documentation for how to use this module - it is merely a
722 front end to SAX2, and implements nothing that is not in that spec (or at least tries
723 not to - please email me if you find errors in this implementation).
727 XML::SAX::PurePerl is B<slow>. Very slow. I suggest you use something else
728 in fact. However it is great as a fallback parser for XML::SAX, where the
729 user might not be able to install an XS based parser or C library.
731 Currently lots, probably. At the moment the weakest area is parsing DOCTYPE declarations,
732 though the code is in place to start doing this. Also parsing parameter entity
733 references is causing me much confusion, since it's not exactly what I would call
734 trivial, or well documented in the XML grammar. XML documents with internal subsets
737 I am however trying to work towards full conformance using the Oasis test suite.
741 Matt Sergeant, matt@sergeant.org. Copyright 2001.
743 Please report all bugs to the Perl-XML mailing list at perl-xml@listserv.activestate.com.
747 This is free software. You may use it or redistribute it under the same terms as