Commit | Line | Data |
3fea05b9 |
1 | package XML::Parser::Expat; |
2 | |
3 | require 5.004; |
4 | |
5 | use strict; |
6 | use vars qw($VERSION @ISA %Handler_Setters %Encoding_Table @Encoding_Path |
7 | $have_File_Spec); |
8 | use Carp; |
9 | |
10 | require DynaLoader; |
11 | |
12 | @ISA = qw(DynaLoader); |
13 | $VERSION = "2.36" ; |
14 | |
# Use File::Spec for portable path handling when it is already loaded or
# can be loaded now; otherwise fall back to plain "/" joined paths.
$have_File_Spec = $INC{'File/Spec.pm'} || do 'File/Spec.pm';

%Encoding_Table = ();

# Encoding maps are searched for in every existing XML/Parser/Encodings
# directory below an @INC entry, and finally in the current directory.
{
    my @candidates;
    my $current_dir;

    if ($have_File_Spec) {
        @candidates
            = map { File::Spec->catdir($_, qw(XML Parser Encodings)) } @INC;
        $current_dir = File::Spec->curdir;
    }
    else {
        @candidates  = map { $_ . '/XML/Parser/Encodings' } @INC;
        $current_dir = '.';
    }

    @Encoding_Path = ((grep { -d $_ } @candidates), $current_dir);
}

# Load the compiled (XS) half of the module.
XML::Parser::Expat->bootstrap($VERSION);
30 | |
# Dispatch table used by setHandlers: maps each handler type name accepted
# by setHandlers to the function that installs a handler of that type.
%Handler_Setters = (
    # Document content events
    'Start'        => \&SetStartElementHandler,
    'End'          => \&SetEndElementHandler,
    'Char'         => \&SetCharacterDataHandler,
    'Proc'         => \&SetProcessingInstructionHandler,
    'Comment'      => \&SetCommentHandler,
    'CdataStart'   => \&SetStartCdataHandler,
    'CdataEnd'     => \&SetEndCdataHandler,
    'Default'      => \&SetDefaultHandler,

    # External entity handling
    'ExternEnt'    => \&SetExternalEntityRefHandler,
    'ExternEntFin' => \&SetExtEntFinishHandler,

    # Declarations
    'XMLDecl'      => \&SetXMLDeclHandler,
    'Doctype'      => \&SetDoctypeHandler,
    'DoctypeFin'   => \&SetEndDoctypeHandler,
    'Entity'       => \&SetEntityDeclHandler,
    'Unparsed'     => \&SetUnparsedEntityDeclHandler,
    'Notation'     => \&SetNotationDeclHandler,
    'Element'      => \&SetElementDeclHandler,
    'Attlist'      => \&SetAttListDeclHandler,
);
51 | |
# Constructor.  The option => value pairs given by the caller become the
# fields of the new object (a blessed hash).  The parser bookkeeping
# fields are then initialised and the underlying parser is created with
# ParserCreate and stored in the Parser field.
sub new {
    my ($class, %args) = @_;
    my $self = bless \%args, $class;

    $self->{_State_} = 0;     # parse() sets this to 1 while running, 2 when done
    $self->{Context} = [];    # stack of element names reported by context()

    $self->{Namespaces}   ||= 0;
    $self->{ErrorMessage} ||= '';

    # The namespace bookkeeping only exists when namespace processing
    # was requested.
    if ($self->{Namespaces}) {
        $self->{Namespace_Table} = {};
        $self->{Namespace_List}  = [undef];
        $self->{Prefix_Table}    = {};
        $self->{New_Prefixes}    = [];
    }

    $self->{_Setters} = \%Handler_Setters;
    $self->{Parser}   = ParserCreate($self, $self->{ProtocolEncoding},
                                     $self->{Namespaces});

    return $self;
}
70 | |
# Load an encoding map and hand its contents to LoadEncoding.
#
# $file is an encoding name or the path of an encmap file.  The last path
# component is lower-cased and ".enc" is appended when it is missing.
# Unless the name starts with "/", the directories in @Encoding_Path are
# searched in order and the first existing match is used.
#
# Returns the name reported by LoadEncoding.  Croaks when the file cannot
# be opened or read, or when LoadEncoding does not accept its contents.
sub load_encoding {
    my ($file) = @_;

    $file =~ s!([^/]+)$!\L$1\E!;
    $file .= '.enc' unless $file =~ /\.enc$/;
    unless ($file =~ m!^/!) {
        foreach my $dir (@Encoding_Path) {
            my $candidate = ($have_File_Spec
                             ? File::Spec->catfile($dir, $file)
                             : "$dir/$file");
            if (-e $candidate) {
                $file = $candidate;
                last;
            }
        }
    }

    local(*ENC);

    # Give the read mode explicitly.  With a bare two-argument open, a name
    # starting with ">" or "|" would be treated as an output file or a
    # command instead of a file to read.  (The two-argument form is kept
    # because this module still declares "require 5.004".)
    open(ENC, "< $file") or croak("Couldn't open encmap $file:\n$!\n");
    binmode(ENC);

    # Size the read from the handle that was actually opened.
    my $data;
    my $br = sysread(ENC, $data, -s ENC);
    croak("Trouble reading $file:\n$!\n")
        unless defined($br);
    close(ENC);

    my $name = LoadEncoding($data, $br);
    croak("$file isn't an encmap file")
        unless defined($name);

    return $name;
}    # End load_encoding
103 | |
# Install handlers given as (TYPE, HANDLER) pairs.
#
# A HANDLER must be a code reference or a false value.  TYPE must be a key
# of the object's _Setters table; the matching setter is called with the
# parser and the handler.
#
# Returns a list of (TYPE, value returned by the setter) pairs, in the
# order given.  Croaks on an odd number of arguments, on a true handler
# that is not a code reference, or on an unknown TYPE.
sub setHandlers {
    my ($self, @pairs) = @_;

    croak("Uneven number of arguments to setHandlers method")
        if @pairs % 2;

    my @previous;

    while (my ($type, $handler) = splice(@pairs, 0, 2)) {
        croak "Handler for $type not a Code ref"
            if $handler and ref($handler) ne 'CODE';

        my $setter = $self->{_Setters}->{$type};

        if (!defined($setter)) {
            my @types = sort keys %{$self->{_Setters}};
            croak("Unknown Expat handler type: $type\n Valid types: @types");
        }

        my $old = $setter->($self->{Parser}, $handler);
        push @previous, $type, $old;
    }

    return @previous;
}
131 | |
# Croak with $message followed by " at line N", where N comes from
# GetCurrentLineNumber.  When the ErrorContext option is defined, the
# output of position_in_context is appended as well.
sub xpcroak {
    my $self    = shift;
    my $message = shift;

    my $context_lines = $self->{ErrorContext};

    $message .= ' at line ' . GetCurrentLineNumber($self->{Parser});
    if (defined($context_lines)) {
        $message .= ":\n" . $self->position_in_context($context_lines);
    }

    croak $message;
}
143 | |
# Carp (warn) with $message followed by " at line N", where N comes from
# GetCurrentLineNumber.  When the ErrorContext option is defined, the
# output of position_in_context is appended as well.
sub xpcarp {
    my $self    = shift;
    my $message = shift;

    my $context_lines = $self->{ErrorContext};

    $message .= ' at line ' . GetCurrentLineNumber($self->{Parser});
    if (defined($context_lines)) {
        $message .= ":\n" . $self->position_in_context($context_lines);
    }

    carp $message;
}
154 | |
# While a parse is running (_State_ == 1), call DefaultCurrent on the
# parser and return its result.  Outside a parse, nothing is called and a
# false value is returned.
sub default_current {
    my ($self) = @_;
    return DefaultCurrent($self->{Parser}) if $self->{_State_} == 1;
}
161 | |
# While a parse is running (_State_ == 1), return the result of
# RecognizedString for the parser.  Outside a parse, a false value is
# returned.
sub recognized_string {
    my ($self) = @_;
    return RecognizedString($self->{Parser}) if $self->{_State_} == 1;
}
168 | |
# While a parse is running (_State_ == 1), return the result of
# OriginalString for the parser.  Outside a parse, a false value is
# returned.
sub original_string {
    my ($self) = @_;
    return OriginalString($self->{Parser}) if $self->{_State_} == 1;
}
175 | |
# While a parse is running (_State_ == 1), return the line number reported
# by GetCurrentLineNumber.  Outside a parse, a false value is returned.
sub current_line {
    my ($self) = @_;
    return GetCurrentLineNumber($self->{Parser}) if $self->{_State_} == 1;
}
182 | |
# While a parse is running (_State_ == 1), return the column number
# reported by GetCurrentColumnNumber.  Outside a parse, a false value is
# returned.
sub current_column {
    my ($self) = @_;
    return GetCurrentColumnNumber($self->{Parser}) if $self->{_State_} == 1;
}
189 | |
# While a parse is running (_State_ == 1), return the position reported by
# GetCurrentByteIndex.  Outside a parse, a false value is returned.
sub current_byte {
    my ($self) = @_;
    return GetCurrentByteIndex($self->{Parser}) if $self->{_State_} == 1;
}
196 | |
# Get and optionally set the parser's base.  Always returns the value
# GetBase reported before any change.  SetBase is called whenever an
# argument was passed, even an undefined one.
sub base {
    my ($self, $newbase) = @_;

    my $parser   = $self->{Parser};
    my $previous = GetBase($parser);

    if (@_ > 1) {
        SetBase($parser, $newbase);
    }

    return $previous;
}
204 | |
# Return the element names held in the Context stack, outermost first
# (their count in scalar context).
sub context {
    my ($self) = @_;
    return @{ $self->{Context} };
}
209 | |
# Return the last (innermost) name on the Context stack, or undef when
# the stack is empty.
sub current_element {
    my $open_elements = $_[0]->{Context};

    return undef unless @$open_elements;
    return $open_elements->[-1];
}
214 | |
# Compare $element (using eq_name) with the innermost name on the Context
# stack and return the result.  Returns undef when the stack is empty.
sub in_element {
    my ($self, $element) = @_;
    my $open_elements = $self->{Context};

    return undef unless @$open_elements;
    return $self->eq_name($open_elements->[-1], $element);
}
220 | |
# Return how many names on the Context stack are equal to $element
# according to eq_name.
sub within_element {
    my ($self, $element) = @_;

    my $matches = grep { $self->eq_name($_, $element) } @{ $self->{Context} };

    return $matches;
}
229 | |
# Return the number of names currently on the Context stack.
sub depth {
    my $open_elements = $_[0]->{Context};
    return scalar @$open_elements;
}
234 | |
# While a parse is running (_State_ == 1), return the value of
# ElementIndex for the parser.  Outside a parse, a false value is
# returned.
sub element_index {
    my $self = shift;
    return ElementIndex($self->{Parser}) if $self->{_State_} == 1;
}
242 | |
243 | ################ |
244 | # Namespace methods |
245 | |
# Look up a name's namespace: the numeric value of $name is used as an
# index into Namespace_List and that entry is returned.
sub namespace {
    my ($self, $name) = @_;

    # Taking int() of a plain, non-numeric name would warn under -w.
    local($^W) = 0;

    my $index = int($name);
    return $self->{Namespace_List}->[$index];
}
251 | |
# Two names are equal when both their numeric values and their string
# values are equal.
sub eq_name {
    my ($self, $left, $right) = @_;

    # Taking int() of a plain, non-numeric name would warn under -w.
    local($^W) = 0;

    return (int($left) == int($right) && $left eq $right);
}
258 | |
# Return a name bound to $namespace, built by GenerateNSName from the
# object's Namespace_Table and Namespace_List.  With a false $namespace
# the name is returned unchanged.
sub generate_ns_name {
    my ($self, $name, $namespace) = @_;

    return $name unless $namespace;

    return GenerateNSName($name, $namespace,
                          $self->{Namespace_Table},
                          $self->{Namespace_List});
}
267 | |
# Return the contents of the New_Prefixes list, or an empty list when
# namespace processing is off.
sub new_ns_prefixes {
    my $self = shift;

    return () unless $self->{Namespaces};
    return @{ $self->{New_Prefixes} };
}
275 | |
# Return the last URI on the Prefix_Table stack for $prefix.  Returns
# undef when namespace processing is off or the prefix has no entries.
sub expand_ns_prefix {
    my ($self, $prefix) = @_;

    return undef unless $self->{Namespaces};

    my $bindings = $self->{Prefix_Table}->{$prefix};
    return undef unless defined($bindings) and @$bindings;

    return $bindings->[-1];
}
286 | |
# Return the prefixes that have an entry in Prefix_Table, in no particular
# order.  '#default' is left out when the top of its stack is undefined.
# Returns an empty list when namespace processing is off.
sub current_ns_prefixes {
    my $self = shift;

    return () unless $self->{Namespaces};

    # Work on a copy so the object's own table is never modified.
    my %bound = %{ $self->{Prefix_Table} };

    delete $bound{'#default'}
        if exists $bound{'#default'}
            and not defined($bound{'#default'}->[-1]);

    return keys %bound;
}
302 | |
303 | |
304 | ################################################################ |
305 | # Namespace declaration handlers |
306 | # |
307 | |
# Record a new namespace declaration: push $uri onto the Prefix_Table
# stack for $prefix (creating the stack when needed) and add the prefix to
# New_Prefixes.  An undefined prefix is stored as '#default'.
sub NamespaceStart {
    my ($self, $prefix, $uri) = @_;

    $prefix = '#default' unless defined $prefix;

    # push creates the stack for a prefix that has none yet.
    push @{ $self->{Prefix_Table}->{$prefix} }, $uri;

    # The New_Prefixes list gets emptied at end of startElement function
    # in Expat.xs

    push @{ $self->{New_Prefixes} }, $prefix;
}
326 | |
# End the innermost declaration of $prefix: pop the top of its
# Prefix_Table stack, or remove the entry altogether when at most one
# binding is left.  An undefined prefix stands for '#default'.
#
# An end for a prefix that has no entry is ignored instead of dying on an
# undefined array reference.
sub NamespaceEnd {
    my ($self, $prefix) = @_;

    $prefix = '#default' unless defined $prefix;

    my $stack = $self->{Prefix_Table}->{$prefix};

    # $stack is undefined when no declaration for this prefix is open;
    # the delete below is then a harmless no-op.
    if (defined($stack) and @$stack > 1) {
        pop(@$stack);
    }
    else {
        delete $self->{Prefix_Table}->{$prefix};
    }
}
340 | |
341 | ################ |
342 | |
# While a parse is running (_State_ == 1), return the value of
# GetSpecifiedAttributeCount for the parser.  Outside a parse, a false
# value is returned.
sub specified_attr {
    my ($self) = @_;
    return GetSpecifiedAttributeCount($self->{Parser})
        if $self->{_State_} == 1;
}
350 | |
# While a parse is running (_State_ == 1), call UnsetAllHandlers on the
# parser.  Outside a parse, nothing is called.
sub finish {
    my $self = shift;
    UnsetAllHandlers($self->{Parser}) if $self->{_State_} == 1;
}
358 | |
# Return the text obtained from PositionContext (asked for $lines lines)
# with a pointer line inserted.  The pointer line is a run of "=" signs
# ending in "^", sized from GetCurrentColumnNumber.
#
# Returns '' when PositionContext yields no text, and a false value when
# no parse is running (_State_ != 1).
sub position_in_context {
    my ($self, $lines) = @_;

    if ($self->{_State_} == 1) {
        my $parser = $self->{Parser};

        my ($text, $offset) = PositionContext($parser, $lines);
        return '' unless defined($text);

        my $pointer
            = ('=' x (GetCurrentColumnNumber($parser) - 1)) . "^\n";

        # Decide where the pointer goes before a newline may be appended.
        my $insert_inside = $offset < length($text);

        $text .= "\n" unless $text =~ /\n$/;

        return $insert_inside
            ? substr($text, 0, $offset) . $pointer . substr($text, $offset)
            : $text . $pointer;
    }
}
384 | |
# Return a copy of $text that is safe to embed in XML.
#
# "&" and "<" are always replaced by their predefined entity references.
# Every further argument names one additional character to escape: ">",
# '"' and "'" become their predefined entity references, any other
# character becomes a hexadecimal character reference.  All occurrences
# are replaced.
#
# Croaks when an extra argument is not exactly one character long.
sub xml_escape {
    my $self = shift;
    my $text = shift;

    # The references are assembled from this table instead of being spelled
    # out literally, so they survive tools that decode entity references
    # in source text.
    my %entity_name = (
        '&' => 'amp',
        '<' => 'lt',
        '>' => 'gt',
        '"' => 'quot',
        "'" => 'apos',
    );

    # One pass, so the "&" of an inserted reference is never escaped again.
    $text =~ s/([&<])/&$entity_name{$1};/g;

    foreach my $char (@_) {
        croak "xml_escape: '$char' isn't a single character"
            if length($char) != 1;

        # Already handled above; doing it again would double-escape.
        next if $char eq '&' or $char eq '<';

        my $replacement
            = exists $entity_name{$char}
            ? "&$entity_name{$char};"
            : '&#' . sprintf('x%X', ord($char)) . ';';

        # \Q...\E makes regex metacharacters match literally.
        $text =~ s/\Q$char\E/$replacement/g;
    }

    return $text;
}
417 | |
# Pass the given index to SkipUntil for the parser, unless the parse has
# already finished (_State_ greater than 1).
sub skip_until {
    my ($self, $index) = @_;
    SkipUntil($self->{Parser}, $index) if $self->{_State_} <= 1;
}
424 | |
# Hand the underlying parser to ParserRelease.
sub release {
    my ($self) = @_;
    return ParserRelease($self->{Parser});
}
429 | |
# Destructor: hand the underlying parser to ParserFree.
sub DESTROY {
    my ($self) = @_;
    return ParserFree($self->{Parser});
}
434 | |
# Parse a document given as a string or as a filehandle.
#
# The argument is treated as a stream when it is an IO::Handle object or
# when an IO handle can be obtained from it as a glob; everything else is
# handed to ParseString.  For streams, a defined Stream_Delimiter option
# is installed as the input record separator for the duration of the
# ParseStream call.
#
# Croaks when a parse was already started on this object.  Returns the
# true result of the parse, or croaks with the object's ErrorMessage when
# the result is false.
sub parse {
    my ($self, $arg) = @_;

    croak "Parse already in progress (Expat)" if $self->{_State_};
    $self->{_State_} = 1;

    my $parser = $self->{Parser};
    my $stream;

    if (defined $arg) {
        if (ref($arg) and UNIVERSAL::isa($arg, 'IO::Handle')) {
            $stream = $arg;
        }
        elsif (tied($arg)) {
            no strict 'refs';
            my $tie_class = ref($arg);
            $stream = $arg if defined &{"${tie_class}::TIEHANDLE"};
        }
        else {
            require IO::Handle;

            # Find out whether $arg names or is a glob with an IO slot;
            # failures just mean "not a handle" and are discarded.
            eval {
                no strict 'refs';
                $stream = *{$arg}{IO} if defined *{$arg};
            };
            undef $@;
        }
    }

    my $ok = 0;

    if (!defined($stream)) {
        $ok = ParseString($parser, $arg);
    }
    else {
        my $delim     = $self->{Stream_Delimiter};
        my $use_delim = defined($delim);
        my $saved_rs;

        $saved_rs = ref($stream)->input_record_separator("\n$delim\n")
            if $use_delim;

        $ok = ParseStream($parser, $stream, $delim);

        ref($stream)->input_record_separator($saved_rs)
            if $use_delim;
    }

    $self->{_State_} = 2;
    $ok or croak $self->{ErrorMessage};
}
480 | |
# Alias for parse: forwards all arguments and returns its result.
sub parsestring {
    my ($self, @args) = @_;
    return $self->parse(@args);
}
485 | |
# Open the named file, switch the handle to binary mode and parse it with
# the parse method.  Returns what parse returned.  Croaks when the parser
# was already used or the file cannot be opened.
sub parsefile {
    my ($self, $path) = @_;

    croak "Parser has already been used" if $self->{_State_};

    local(*FILE);
    open(FILE, $path) or croak "Couldn't open $path:\n$!";
    binmode(FILE);

    my $result = $self->parse(*FILE);
    close(FILE);

    return $result;
}
496 | |
497 | ################################################################ |
package XML::Parser::ContentModel;

# Objects of this class stringify to their content model text and compare
# with "eq" on that text.
use overload '""' => \&asString, 'eq' => \&thiseq;

# Values of the Type field.
sub EMPTY  () { 1 }
sub ANY    () { 2 }
sub MIXED  () { 3 }
sub NAME   () { 4 }
sub CHOICE () { 5 }
sub SEQ    () { 6 }

# Type predicates: each is true when the model's Type field holds the
# corresponding value.

sub isempty {
    my ($model) = @_;
    return $model->{Type} == EMPTY;
}

sub isany {
    my ($model) = @_;
    return $model->{Type} == ANY;
}

sub ismixed {
    my ($model) = @_;
    return $model->{Type} == MIXED;
}

sub isname {
    my ($model) = @_;
    return $model->{Type} == NAME;
}

# Return the Tag field.
sub name {
    my ($model) = @_;
    return $model->{Tag};
}

sub ischoice {
    my ($model) = @_;
    return $model->{Type} == CHOICE;
}

sub isseq {
    my ($model) = @_;
    return $model->{Type} == SEQ;
}

# Return the Quant field.
sub quant {
    my ($model) = @_;
    return $model->{Quant};
}

# Return the list of child models, or undef when there is no Children
# field.
sub children {
    my $kids = $_[0]->{Children};

    return undef unless defined $kids;
    return @$kids;
}

# Render the model as text: "EMPTY", "ANY", the tag name, a
# "(#PCDATA|...)" group, or a parenthesised list joined with "|" (choice)
# or "," (anything else).  A true Quant field is appended, except for
# EMPTY and ANY.
sub asString {
    my ($model) = @_;
    my $type = $model->{Type};

    return "EMPTY" if $type == EMPTY;
    return "ANY"   if $type == ANY;

    my $text;

    if ($type == NAME) {
        $text = $model->{Tag};
    }
    elsif ($type == MIXED) {
        $text = '(#PCDATA';
        foreach my $child (@{$model->{Children}}) {
            $text .= '|' . $child;
        }
        $text .= ')';
    }
    else {
        my $separator = $type == CHOICE ? '|' : ',';
        my @parts     = map { $_->asString } @{$model->{Children}};
        $text = '(' . join($separator, @parts) . ')';
    }

    $text .= $model->{Quant} if $model->{Quant};
    return $text;
}

# Implementation of the overloaded "eq": compare the model's text with
# the other operand.
sub thiseq {
    my ($model, $other) = @_;

    return $model->asString eq $other;
}
583 | |
584 | ################################################################ |
package XML::Parser::ExpatNB;

# Subclass of XML::Parser::Expat that is fed in pieces through parse_more
# and finished with parse_done.  The one-shot parse methods of the parent
# class are disabled.

use vars qw(@ISA);
use Carp;

@ISA = qw(XML::Parser::Expat);

# Not available in this class; always croaks.
sub parse {
    my ($self) = @_;
    croak "parse method not supported in " . ref($self);
}

# Not available in this class; always croaks.
sub parsestring {
    my ($self) = @_;
    croak "parsestring method not supported in " . ref($self);
}

# Not available in this class; always croaks.
sub parsefile {
    my ($self) = @_;
    croak "parsefile method not supported in " . ref($self);
}

# Feed the next piece of the document to ParsePartial.  Croaks with the
# object's ErrorMessage when ParsePartial returns a false value.
sub parse_more {
    my ($self, $data) = @_;

    $self->{_State_} = 1;
    my $ok = XML::Parser::Expat::ParsePartial($self->{Parser}, $data);

    $ok or croak $self->{ErrorMessage};
}

# Finish the parse with ParseDone and release the parser.
#
# On failure the parser is released and the object's ErrorMessage is
# croaked.  On success the FinalHandler, when defined, is called with the
# object and its result is returned; it is called in list context only
# when parse_done itself was.  Without a FinalHandler the scalar result is
# ParseDone's value and the list result is empty.
sub parse_done {
    my ($self) = @_;

    my $status = XML::Parser::Expat::ParseDone($self->{Parser});
    if (!$status) {
        # Fetch the message before releasing the parser.
        my $msg = $self->{ErrorMessage};
        $self->release;
        croak $msg;
    }

    $self->{_State_} = 2;

    my $scalar_result = $status;
    my @list_result   = ();

    my $final = $self->{FinalHandler};
    if (defined $final) {
        if (wantarray) {
            @list_result = $final->($self);
        }
        else {
            $scalar_result = $final->($self);
        }
    }

    $self->release;

    return unless defined wantarray;
    return wantarray ? @list_result : $scalar_result;
}
648 | |
649 | ################################################################ |
650 | |
package XML::Parser::Encinfo;

# Destructor: hand the object to XML::Parser::Expat::FreeEncoding.
sub DESTROY {
    my ($encinfo) = @_;
    XML::Parser::Expat::FreeEncoding($encinfo);
}
657 | |
658 | 1; |
659 | |
660 | __END__ |
661 | |
662 | =head1 NAME |
663 | |
664 | XML::Parser::Expat - Lowlevel access to James Clark's expat XML parser |
665 | |
666 | =head1 SYNOPSIS |
667 | |
668 | use XML::Parser::Expat; |
669 | |
670 | $parser = new XML::Parser::Expat; |
671 | $parser->setHandlers('Start' => \&sh, |
672 | 'End' => \&eh, |
673 | 'Char' => \&ch); |
674 | open(FOO, 'info.xml') or die "Couldn't open"; |
675 | $parser->parse(*FOO); |
676 | close(FOO); |
677 | # $parser->parse('<foo id="me"> here <em>we</em> go </foo>'); |
678 | |
679 | sub sh |
680 | { |
681 | my ($p, $el, %atts) = @_; |
682 | $p->setHandlers('Char' => \&spec) |
683 | if ($el eq 'special'); |
684 | ... |
685 | } |
686 | |
687 | sub eh |
688 | { |
689 | my ($p, $el) = @_; |
690 | $p->setHandlers('Char' => \&ch) # Special elements won't contain |
691 | if ($el eq 'special'); # other special elements |
692 | ... |
693 | } |
694 | |
695 | =head1 DESCRIPTION |
696 | |
697 | This module provides an interface to James Clark's XML parser, expat. As in |
698 | expat, a single instance of the parser can only parse one document. Calls |
699 | to parsestring after the first for a given instance will die. |
700 | |
701 | Expat (and XML::Parser::Expat) are event based. As the parser recognizes |
702 | parts of the document (say the start or end of an XML element), then any |
703 | handlers registered for that type of an event are called with suitable |
704 | parameters. |
705 | |
706 | =head1 METHODS |
707 | |
708 | =over 4 |
709 | |
710 | =item new |
711 | |
712 | This is a class method, the constructor for XML::Parser::Expat. Options are |
713 | passed as keyword value pairs. The recognized options are: |
714 | |
715 | =over 4 |
716 | |
717 | =item * ProtocolEncoding |
718 | |
719 | The protocol encoding name. The default is none. The expat built-in |
720 | encodings are: C<UTF-8>, C<ISO-8859-1>, C<UTF-16>, and C<US-ASCII>. |
721 | Other encodings may be used if they have encoding maps in one of the |
722 | directories in the @Encoding_Path list. Setting the protocol encoding |
723 | overrides any encoding in the XML declaration. |
724 | |
725 | =item * Namespaces |
726 | |
727 | When this option is given with a true value, then the parser does namespace |
728 | processing. By default, namespace processing is turned off. When it is |
729 | turned on, the parser consumes I<xmlns> attributes and strips off prefixes |
730 | from element and attributes names where those prefixes have a defined |
731 | namespace. A name's namespace can be found using the L<"namespace"> method |
732 | and two names can be checked for absolute equality with the L<"eq_name"> |
733 | method. |
734 | |
735 | =item * NoExpand |
736 | |
737 | Normally, the parser will try to expand references to entities defined in |
738 | the internal subset. If this option is set to a true value, and a default |
739 | handler is also set, then the default handler will be called when an |
740 | entity reference is seen in text. This has no effect if a default handler |
741 | has not been registered, and it has no effect on the expansion of entity |
742 | references inside attribute values. |
743 | |
744 | =item * Stream_Delimiter |
745 | |
746 | This option takes a string value. When this string is found alone on a line |
747 | while parsing from a stream, then the parse is ended as if it saw an end of |
748 | file. The intended use is with a stream of xml documents in a MIME multipart |
749 | format. The string should not contain a trailing newline. |
750 | |
751 | =item * ErrorContext |
752 | |
753 | When this option is defined, errors are reported in context. The value |
754 | of ErrorContext should be the number of lines to show on either side of |
755 | the line in which the error occurred. |
756 | |
757 | =item * ParseParamEnt |
758 | |
759 | Unless standalone is set to "yes" in the XML declaration, setting this to |
760 | a true value allows the external DTD to be read, and parameter entities |
761 | to be parsed and expanded. |
762 | |
763 | =item * Base |
764 | |
765 | The base to use for relative pathnames or URLs. This can also be done by |
766 | using the base method. |
767 | |
768 | =back |
769 | |
770 | =item setHandlers(TYPE, HANDLER [, TYPE, HANDLER [...]]) |
771 | |
772 | This method registers handlers for the various events. If no handlers are |
773 | registered, then a call to parsestring or parsefile will only determine if |
774 | the corresponding XML document is well formed (by returning without error.) |
775 | This may be called from within a handler, after the parse has started. |
776 | |
777 | Setting a handler to something that evaluates to false unsets that |
778 | handler. |
779 | |
780 | This method returns a list of type, handler pairs corresponding to the |
781 | input. The handlers returned are the ones that were in effect before the |
782 | call to setHandlers. |
783 | |
784 | The recognized events and the parameters passed to the corresponding |
785 | handlers are: |
786 | |
787 | =over 4 |
788 | |
789 | =item * Start (Parser, Element [, Attr, Val [,...]]) |
790 | |
791 | This event is generated when an XML start tag is recognized. Parser is |
792 | an XML::Parser::Expat instance. Element is the name of the XML element that |
793 | is opened with the start tag. The Attr & Val pairs are generated for each |
794 | attribute in the start tag. |
795 | |
796 | =item * End (Parser, Element) |
797 | |
798 | This event is generated when an XML end tag is recognized. Note that |
799 | an XML empty tag (<foo/>) generates both a start and an end event. |
800 | |
801 | There is always a lower level start and end handler installed that wrap |
802 | the corresponding callbacks. This is to handle the context mechanism. |
803 | A consequence of this is that the default handler (see below) will not |
804 | see a start tag or end tag unless the default_current method is called. |
805 | |
806 | =item * Char (Parser, String) |
807 | |
808 | This event is generated when non-markup is recognized. The non-markup |
809 | sequence of characters is in String. A single non-markup sequence of |
810 | characters may generate multiple calls to this handler. Whatever the |
811 | encoding of the string in the original document, this is given to the |
812 | handler in UTF-8. |
813 | |
814 | =item * Proc (Parser, Target, Data) |
815 | |
816 | This event is generated when a processing instruction is recognized. |
817 | |
818 | =item * Comment (Parser, String) |
819 | |
820 | This event is generated when a comment is recognized. |
821 | |
822 | =item * CdataStart (Parser) |
823 | |
824 | This is called at the start of a CDATA section. |
825 | |
826 | =item * CdataEnd (Parser) |
827 | |
828 | This is called at the end of a CDATA section. |
829 | |
830 | =item * Default (Parser, String) |
831 | |
832 | This is called for any characters that don't have a registered handler. |
833 | This includes both characters that are part of markup for which no |
834 | events are generated (markup declarations) and characters that |
835 | could generate events, but for which no handler has been registered. |
836 | |
837 | Whatever the encoding in the original document, the string is returned to |
838 | the handler in UTF-8. |
839 | |
840 | =item * Unparsed (Parser, Entity, Base, Sysid, Pubid, Notation) |
841 | |
842 | This is called for a declaration of an unparsed entity. Entity is the name |
843 | of the entity. Base is the base to be used for resolving a relative URI. |
844 | Sysid is the system id. Pubid is the public id. Notation is the notation |
845 | name. Base and Pubid may be undefined. |
846 | |
847 | =item * Notation (Parser, Notation, Base, Sysid, Pubid) |
848 | |
849 | This is called for a declaration of notation. Notation is the notation name. |
850 | Base is the base to be used for resolving a relative URI. Sysid is the system |
851 | id. Pubid is the public id. Base, Sysid, and Pubid may all be undefined. |
852 | |
853 | =item * ExternEnt (Parser, Base, Sysid, Pubid) |
854 | |
855 | This is called when an external entity is referenced. Base is the base to be |
856 | used for resolving a relative URI. Sysid is the system id. Pubid is the public |
857 | id. Base, and Pubid may be undefined. |
858 | |
859 | This handler should either return a string, which represents the contents of |
860 | the external entity, or return an open filehandle that can be read to obtain |
861 | the contents of the external entity, or return undef, which indicates the |
862 | external entity couldn't be found and will generate a parse error. |
863 | |
864 | If an open filehandle is returned, it must be returned as either a glob |
865 | (*FOO) or as a reference to a glob (e.g. an instance of IO::Handle). |
866 | |
867 | =item * ExternEntFin (Parser) |
868 | |
869 | This is called after an external entity has been parsed. It allows |
870 | applications to perform cleanup on actions performed in the above |
871 | ExternEnt handler. |
872 | |
873 | =item * Entity (Parser, Name, Val, Sysid, Pubid, Ndata, IsParam) |
874 | |
875 | This is called when an entity is declared. For internal entities, the Val |
876 | parameter will contain the value and the remaining three parameters will |
877 | be undefined. For external entities, the Val parameter |
878 | will be undefined, the Sysid parameter will have the system id, the Pubid |
879 | parameter will have the public id if it was provided (it will be undefined |
880 | otherwise), the Ndata parameter will contain the notation for unparsed |
881 | entities. If this is a parameter entity declaration, then the IsParam |
882 | parameter is true. |
883 | |
884 | Note that this handler and the Unparsed handler above overlap. If both are |
885 | set, then this handler will not be called for unparsed entities. |
886 | |
887 | =item * Element (Parser, Name, Model) |
888 | |
889 | The element handler is called when an element declaration is found. Name is |
890 | the element name, and Model is the content model as an |
891 | XML::Parser::ContentModel object. See L<"XML::Parser::ContentModel Methods"> |
892 | for methods available for this class. |
893 | |
894 | =item * Attlist (Parser, Elname, Attname, Type, Default, Fixed) |
895 | |
896 | This handler is called for each attribute in an ATTLIST declaration. |
897 | So an ATTLIST declaration that has multiple attributes |
898 | will generate multiple calls to this handler. The Elname parameter is the |
899 | name of the element with which the attribute is being associated. The Attname |
900 | parameter is the name of the attribute. Type is the attribute type, given as |
901 | a string. Default is the default value, which will either be "#REQUIRED", |
902 | "#IMPLIED" or a quoted string (i.e. the returned string will begin and end |
903 | with a quote character). If Fixed is true, then this is a fixed attribute. |
904 | |
905 | =item * Doctype (Parser, Name, Sysid, Pubid, Internal) |
906 | |
907 | This handler is called for DOCTYPE declarations. Name is the document type |
908 | name. Sysid is the system id of the document type, if it was provided, |
909 | otherwise it's undefined. Pubid is the public id of the document type, |
910 | which will be undefined if no public id was given. Internal will be |
911 | true or false, indicating whether or not the doctype declaration contains |
912 | an internal subset. |
913 | |
914 | =item * DoctypeFin (Parser) |
915 | |
916 | This handler is called after parsing of the DOCTYPE declaration has finished, |
917 | including any internal or external DTD declarations. |
918 | |
919 | =item * XMLDecl (Parser, Version, Encoding, Standalone) |
920 | |
921 | This handler is called for XML declarations. Version is a string containg |
922 | the version. Encoding is either undefined or contains an encoding string. |
923 | Standalone is either undefined, or true or false. Undefined indicates |
924 | that no standalone parameter was given in the XML declaration. True or |
925 | false indicates "yes" or "no" respectively. |
926 | |
927 | =back |
928 | |
929 | =item namespace(name) |
930 | |
931 | Return the URI of the namespace that the name belongs to. If the name doesn't |
932 | belong to any namespace, an undef is returned. This is only valid on names |
933 | received through the Start or End handlers from a single document, or through |
934 | a call to the generate_ns_name method. In other words, don't use names |
935 | generated from one instance of XML::Parser::Expat with other instances. |
936 | |
937 | =item eq_name(name1, name2) |
938 | |
939 | Return true if name1 and name2 are identical (i.e. same name and from |
940 | the same namespace.) This is only meaningful if both names were obtained |
941 | through the Start or End handlers from a single document, or through |
942 | a call to the generate_ns_name method. |
943 | |
944 | =item generate_ns_name(name, namespace) |
945 | |
946 | Return a name, associated with a given namespace, good for using with the |
947 | above 2 methods. The namespace argument should be the namespace URI, not |
948 | a prefix. |
949 | |
950 | =item new_ns_prefixes |
951 | |
952 | When called from a start tag handler, returns namespace prefixes declared |
953 | with this start tag. If called elsewere (or if there were no namespace |
954 | prefixes declared), it returns an empty list. Setting of the default |
955 | namespace is indicated with '#default' as a prefix. |
956 | |
957 | =item expand_ns_prefix(prefix) |
958 | |
959 | Return the uri to which the given prefix is currently bound. Returns |
960 | undef if the prefix isn't currently bound. Use '#default' to find the |
961 | current binding of the default namespace (if any). |
962 | |
963 | =item current_ns_prefixes |
964 | |
965 | Return a list of currently bound namespace prefixes. The order of the |
966 | the prefixes in the list has no meaning. If the default namespace is |
967 | currently bound, '#default' appears in the list. |
968 | |
969 | =item recognized_string |
970 | |
971 | Returns the string from the document that was recognized in order to call |
972 | the current handler. For instance, when called from a start handler, it |
973 | will give us the the start-tag string. The string is encoded in UTF-8. |
974 | This method doesn't return a meaningful string inside declaration handlers. |
975 | |
976 | =item original_string |
977 | |
978 | Returns the verbatim string from the document that was recognized in |
979 | order to call the current handler. The string is in the original document |
980 | encoding. This method doesn't return a meaningful string inside declaration |
981 | handlers. |
982 | |
983 | =item default_current |
984 | |
985 | When called from a handler, causes the sequence of characters that generated |
986 | the corresponding event to be sent to the default handler (if one is |
987 | registered). Use of this method is deprecated in favor the recognized_string |
988 | method, which you can use without installing a default handler. This |
989 | method doesn't deliver a meaningful string to the default handler when |
990 | called from inside declaration handlers. |
991 | |
992 | =item xpcroak(message) |
993 | |
994 | Concatenate onto the given message the current line number within the |
995 | XML document plus the message implied by ErrorContext. Then croak with |
996 | the formed message. |
997 | |
998 | =item xpcarp(message) |
999 | |
1000 | Concatenate onto the given message the current line number within the |
1001 | XML document plus the message implied by ErrorContext. Then carp with |
1002 | the formed message. |
1003 | |
1004 | =item current_line |
1005 | |
1006 | Returns the line number of the current position of the parse. |
1007 | |
1008 | =item current_column |
1009 | |
1010 | Returns the column number of the current position of the parse. |
1011 | |
1012 | =item current_byte |
1013 | |
1014 | Returns the current byte position of the parse. |
1015 | |
1016 | =item base([NEWBASE]) |
1017 | |
1018 | Returns the current value of the base for resolving relative URIs. If |
1019 | NEWBASE is supplied, changes the base to that value. |
1020 | |
1021 | =item context |
1022 | |
1023 | Returns a list of element names that represent open elements, with the |
1024 | last one being the innermost. Inside start and end tag handlers, the |
1025 | last one will be the tag of the parent element. |
1026 | |
1027 | =item current_element |
1028 | |
1029 | Returns the name of the innermost currently opened element. Inside |
1030 | start or end handlers, returns the parent of the element associated |
1031 | with those tags. |
1032 | |
1033 | =item in_element(NAME) |
1034 | |
1035 | Returns true if NAME is equal to the name of the innermost currently opened |
1036 | element. If namespace processing is being used and you want to check |
1037 | against a name that may be in a namespace, then use the generate_ns_name |
1038 | method to create the NAME argument. |
1039 | |
1040 | =item within_element(NAME) |
1041 | |
1042 | Returns the number of times the given name appears in the context list. |
1043 | If namespace processing is being used and you want to check |
1044 | against a name that may be in a namespace, then use the generate_ns_name |
1045 | method to create the NAME argument. |
1046 | |
1047 | =item depth |
1048 | |
1049 | Returns the size of the context list. |
1050 | |
1051 | =item element_index |
1052 | |
1053 | Returns an integer that is the depth-first visit order of the current |
1054 | element. This will be zero outside of the root element. For example, |
1055 | this will return 1 when called from the start handler for the root element |
1056 | start tag. |
1057 | |
1058 | =item skip_until(INDEX) |
1059 | |
1060 | INDEX is an integer that represents an element index. When this method |
1061 | is called, all handlers are suspended until the start tag for an element |
1062 | that has an index number equal to INDEX is seen. If a start handler has |
1063 | been set, then this is the first tag that the start handler will see |
1064 | after skip_until has been called. |
1065 | |
1066 | |
1067 | =item position_in_context(LINES) |
1068 | |
1069 | Returns a string that shows the current parse position. LINES should be |
1070 | an integer >= 0 that represents the number of lines on either side of the |
1071 | current parse line to place into the returned string. |
1072 | |
1073 | =item xml_escape(TEXT [, CHAR [, CHAR ...]]) |
1074 | |
1075 | Returns TEXT with markup characters turned into character entities. Any |
1076 | additional characters provided as arguments are also turned into character |
1077 | references where found in TEXT. |
1078 | |
1079 | =item parse (SOURCE) |
1080 | |
1081 | The SOURCE parameter should either be a string containing the whole XML |
1082 | document, or it should be an open IO::Handle. Only a single document |
1083 | may be parsed for a given instance of XML::Parser::Expat, so this will croak |
1084 | if it's been called previously for this instance. |
1085 | |
1086 | =item parsestring(XML_DOC_STRING) |
1087 | |
1088 | Parses the given string as an XML document. Only a single document may be |
1089 | parsed for a given instance of XML::Parser::Expat, so this will die if either |
1090 | parsestring or parsefile has been called for this instance previously. |
1091 | |
1092 | This method is deprecated in favor of the parse method. |
1093 | |
1094 | =item parsefile(FILENAME) |
1095 | |
1096 | Parses the XML document in the given file. Will die if parsestring or |
1097 | parsefile has been called previously for this instance. |
1098 | |
1099 | =item is_defaulted(ATTNAME) |
1100 | |
1101 | NO LONGER WORKS. To find out if an attribute is defaulted please use |
1102 | the specified_attr method. |
1103 | |
1104 | =item specified_attr |
1105 | |
1106 | When the start handler receives lists of attributes and values, the |
1107 | non-defaulted (i.e. explicitly specified) attributes occur in the list |
1108 | first. This method returns the number of specified items in the list. |
1109 | So if this number is equal to the length of the list, there were no |
1110 | defaulted values. Otherwise the number points to the index of the |
1111 | first defaulted attribute name. |
1112 | |
1113 | =item finish |
1114 | |
1115 | Unsets all handlers (including internal ones that set context), but expat |
1116 | continues parsing to the end of the document or until it finds an error. |
1117 | It should finish up a lot faster than with the handlers set. |
1118 | |
1119 | =item release |
1120 | |
1121 | There are data structures used by XML::Parser::Expat that have circular |
1122 | references. This means that these structures will never be garbage |
1123 | collected unless these references are explicitly broken. Calling this |
1124 | method breaks those references (and makes the instance unusable.) |
1125 | |
1126 | Normally, higher level calls handle this for you, but if you are using |
1127 | XML::Parser::Expat directly, then it's your responsibility to call it. |
1128 | |
1129 | =back |
1130 | |
1131 | =head2 XML::Parser::ContentModel Methods |
1132 | |
1133 | The element declaration handlers are passed objects of this class as the |
1134 | content model of the element declaration. They also represent content |
1135 | particles, components of a content model. |
1136 | |
1137 | When referred to as a string, these objects are automagically converted to a |
1138 | string representation of the model (or content particle). |
1139 | |
1140 | =over 4 |
1141 | |
1142 | =item isempty |
1143 | |
1144 | This method returns true if the object is "EMPTY", false otherwise. |
1145 | |
1146 | =item isany |
1147 | |
1148 | This method returns true if the object is "ANY", false otherwise. |
1149 | |
1150 | =item ismixed |
1151 | |
1152 | This method returns true if the object is "(#PCDATA)" or "(#PCDATA|...)*", |
1153 | false otherwise. |
1154 | |
1155 | =item isname |
1156 | |
1157 | This method returns true if the object is an element name. |
1158 | |
1159 | =item ischoice |
1160 | |
1161 | This method returns true if the object is a choice of content particles. |
1162 | |
1163 | |
1164 | =item isseq |
1165 | |
1166 | This method returns true if the object is a sequence of content particles. |
1167 | |
1168 | =item quant |
1169 | |
1170 | This method returns undef or a string representing the quantifier |
1171 | ('?', '*', '+') associated with the model or particle. |
1172 | |
1173 | =item children |
1174 | |
1175 | This method returns undef or (for mixed, choice, and sequence types) |
1176 | an array of component content particles. There will always be at least |
1177 | one component for choices and sequences, but for a mixed content model |
1178 | of pure PCDATA, "(#PCDATA)", undef is returned. |
1179 | |
1180 | =back |
1181 | |
1182 | =head2 XML::Parser::ExpatNB Methods |
1183 | |
1184 | The class XML::Parser::ExpatNB is a subclass of XML::Parser::Expat used |
1185 | for non-blocking access to the expat library. It does not support the parse, |
1186 | parsestring, or parsefile methods, but it does have these additional methods: |
1187 | |
1188 | =over 4 |
1189 | |
1190 | =item parse_more(DATA) |
1191 | |
1192 | Feed expat more text to munch on. |
1193 | |
1194 | =item parse_done |
1195 | |
1196 | Tell expat that it's gotten the whole document. |
1197 | |
1198 | =back |
1199 | |
1200 | =head1 FUNCTIONS |
1201 | |
1202 | =over 4 |
1203 | |
1204 | =item XML::Parser::Expat::load_encoding(ENCODING) |
1205 | |
1206 | Load an external encoding. ENCODING is either the name of an encoding or |
1207 | the name of a file. The basename is converted to lowercase and a '.enc' |
1208 | extension is appended unless there's one already there. Then, unless |
1209 | it's an absolute pathname (i.e. begins with '/'), the first file by that |
1210 | name discovered in the @Encoding_Path path list is used. |
1211 | |
1212 | The encoding in the file is loaded and kept in the %Encoding_Table |
1213 | table. Earlier encodings of the same name are replaced. |
1214 | |
1215 | This function is automatically called by expat when it encounters an encoding |
1216 | it doesn't know about. Expat shouldn't call this twice for the same |
1217 | encoding name. The only reason users should use this function is to |
1218 | explicitly load an encoding not contained in the @Encoding_Path list. |
1219 | |
1220 | =back |
1221 | |
1222 | =head1 AUTHORS |
1223 | |
1224 | Larry Wall <F<larry@wall.org>> wrote version 1.0. |
1225 | |
1226 | Clark Cooper <F<coopercc@netheaven.com>> picked up support, changed the API |
1227 | for this version (2.x), provided documentation, and added some standard |
1228 | package features. |
1229 | |
1230 | =cut |