3 package Pod::Simple::PullParser;
6 BEGIN {@ISA = ('Pod::Simple')}
11 use Pod::Simple::PullParserStartToken;
12 use Pod::Simple::PullParserEndToken;
13 use Pod::Simple::PullParserTextToken;
15 BEGIN { *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG }
17 __PACKAGE__->_accessorize(
18 'source_fh', # the filehandle we're reading from
19 'source_scalar_ref', # the scalarref we're reading from
20 'source_arrayref', # the arrayref we're reading from
23 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
25 # And here is how we implement a pull-parser on top of a push-parser...
28 my($self, $source) = @_;
29 $self = $self->new unless ref $self;
31 $source = *STDIN{IO} unless defined $source;
32 $self->set_source($source);
33 $self->output_fh(*STDOUT{IO});
35 $self->run; # define run() in a subclass if you want to use filter()!
39 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
41 sub parse_string_document {
43 $this->set_source(\ $_[0]);
48 my($this, $filename) = @_;
49 $this->set_source($filename);
53 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
54 # In case anyone tries to use them:
58 if( __PACKAGE__ eq (ref($_[0]) || $_[0]) ) { # I'm not being subclassed! (parens required: "eq" binds tighter than "||", so without them this test was always true)
59 Carp::croak "You can call run() only on subclasses of "
63 "You can't call run() because ",
64 ref($_[0]) || $_[0], " didn't define a run() method";
70 Carp::croak "Use set_source with ", __PACKAGE__,
71 " and subclasses, not parse_lines";
76 Carp::croak "Use set_source with ", __PACKAGE__,
77 " and subclasses, not parse_line";
80 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
84 my $self = $class->SUPER::new(@_);
85 die "Couldn't construct for $class" unless $self;
87 $self->{'token_buffer'} ||= [];
88 $self->{'start_token_class'} ||= 'Pod::Simple::PullParserStartToken';
89 $self->{'text_token_class'} ||= 'Pod::Simple::PullParserTextToken';
90 $self->{'end_token_class'} ||= 'Pod::Simple::PullParserEndToken';
92 DEBUG > 1 and print "New pullparser object: $self\n";
97 # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
101 DEBUG > 1 and print "\nget_token starting up on $self.\n";
102 DEBUG > 2 and print " Items in token-buffer (",
103 scalar( @{ $self->{'token_buffer'} } ) ,
105 " " . $_->dump . "\n", @{ $self->{'token_buffer'} }
107 @{ $self->{'token_buffer'} } ? '' : ' (no tokens)',
111 until( @{ $self->{'token_buffer'} } ) {
112 DEBUG > 3 and print "I need to get something into my empty token buffer...\n";
113 if($self->{'source_dead'}) {
114 DEBUG and print "$self 's source is dead.\n";
115 push @{ $self->{'token_buffer'} }, undef;
116 } elsif(exists $self->{'source_fh'}) {
118 my $fh = $self->{'source_fh'}
119 || Carp::croak('You have to call set_source before you can call get_token');
121 DEBUG and print "$self 's source is filehandle $fh.\n";
122 # Read those many lines at a time
123 for(my $i = Pod::Simple::MANY_LINES; $i--;) {
124 DEBUG > 3 and print " Fetching a line from source filehandle $fh...\n";
125 local $/ = $Pod::Simple::NL;
126 push @lines, scalar(<$fh>); # readline
127 DEBUG > 3 and print " Line is: ",
128 defined($lines[-1]) ? $lines[-1] : "<undef>\n";
129 unless( defined $lines[-1] ) {
130 DEBUG and print "That's it for that source fh! Killing.\n";
131 delete $self->{'source_fh'}; # so it can be GC'd
134 # but pass thru the undef, which will set source_dead to true
136 # TODO: look to see if $lines[-1] is =encoding, and if so,
137 # do horribly magic things
142 print "* I've gotten ", scalar(@lines), " lines:\n";
143 foreach my $l (@lines) {
145 print " line {$l}\n";
147 print " line undef\n";
150 print "* end of ", scalar(@lines), " lines\n";
153 $self->SUPER::parse_lines(@lines);
155 } elsif(exists $self->{'source_arrayref'}) {
156 DEBUG and print "$self 's source is arrayref $self->{'source_arrayref'}, with ",
157 scalar(@{$self->{'source_arrayref'}}), " items left in it.\n";
159 DEBUG > 3 and print " Fetching ", Pod::Simple::MANY_LINES, " lines.\n";
160 $self->SUPER::parse_lines(
161 splice @{ $self->{'source_arrayref'} },
163 Pod::Simple::MANY_LINES
165 unless( @{ $self->{'source_arrayref'} } ) {
166 DEBUG and print "That's it for that source arrayref! Killing.\n";
167 $self->SUPER::parse_lines(undef);
168 delete $self->{'source_arrayref'}; # so it can be GC'd
170 # to make sure that an undef is always sent to signal end-of-stream
172 } elsif(exists $self->{'source_scalar_ref'}) {
174 DEBUG and print "$self 's source is scalarref $self->{'source_scalar_ref'}, with ",
175 length(${ $self->{'source_scalar_ref'} }) -
176 (pos(${ $self->{'source_scalar_ref'} }) || 0),
177 " characters left to parse.\n";
179 DEBUG > 3 and print " Fetching a line from source-string...\n";
180 if( ${ $self->{'source_scalar_ref'} } =~
181 m/([^\n\r]*)((?:\r?\n)?)/g
184 $self->SUPER::parse_lines($1)
185 if length($1) or length($2)
186 or pos( ${ $self->{'source_scalar_ref'} })
187 != length( ${ $self->{'source_scalar_ref'} });
188 # I.e., unless it's a zero-length "empty line" at the very
189 # end of "foo\nbar\n" (i.e., between the \n and the EOS).
190 } else { # that's the end. Byebye
191 $self->SUPER::parse_lines(undef);
192 delete $self->{'source_scalar_ref'};
193 DEBUG and print "That's it for that source scalarref! Killing.\n";
201 DEBUG and print "get_token about to return ",
202 Pod::Simple::pretty( @{$self->{'token_buffer'}}
203 ? $self->{'token_buffer'}[-1] : undef
205 return shift @{$self->{'token_buffer'}}; # that's an undef if empty
211 DEBUG and print "Ungetting ", scalar(@_), " tokens: ",
212 @_ ? "@_\n" : "().\n";
214 Carp::croak "Can't unget that, because it's not a token -- it's undef!"
216 Carp::croak "Can't unget $t, because it's not a token -- it's a string!"
218 Carp::croak "Can't unget $t, because it's not a token object!"
219 unless UNIVERSAL::can($t, 'type');
222 unshift @{$self->{'token_buffer'}}, @_;
223 DEBUG > 1 and print "Token buffer now has ",
224 scalar(@{$self->{'token_buffer'}}), " items in it.\n";
228 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
230 # $self->{'source_filename'} = $source;
234 return $self->{'source_fh'} unless @_;
237 Carp::croak("Can't use empty-string as a source for set_source");
238 } elsif(ref(\( $_[0] )) eq 'GLOB') {
239 $self->{'source_filename'} = '' . ($handle = $_[0]);
240 DEBUG and print "$self 's source is glob $_[0]\n";
242 } elsif(ref( $_[0] ) eq 'SCALAR') {
243 $self->{'source_scalar_ref'} = $_[0];
244 DEBUG and print "$self 's source is scalar ref $_[0]\n";
246 } elsif(ref( $_[0] ) eq 'ARRAY') {
247 $self->{'source_arrayref'} = $_[0];
248 DEBUG and print "$self 's source is array ref $_[0]\n";
251 $self->{'source_filename'} = '' . ($handle = $_[0]);
252 DEBUG and print "$self 's source is fh-obj $_[0]\n";
253 } elsif(!length $_[0]) {
254 Carp::croak("Can't use empty-string as a source for set_source");
255 } else { # It's a filename!
256 DEBUG and print "$self 's source is filename $_[0]\n";
259 open(PODSOURCE, "<$_[0]") || Carp::croak "Can't open $_[0]: $!";
260 $handle = *PODSOURCE{IO};
262 $self->{'source_filename'} = $_[0];
263 DEBUG and print " Its name is $_[0].\n";
265 # TODO: file-discipline things here!
268 $self->{'source_fh'} = $handle;
269 DEBUG and print " Its handle is $handle\n";
273 # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
275 sub get_title_short { my $self = shift; return $self->get_short_title(@_) } # alias: hands every argument through to get_short_title
277 sub get_short_title {
278 my $title = shift->get_title(@_);
279 $title = $1 if $title =~ m/^(\S{1,60})\s+--?\s+./s;
280 # turn "Foo::Bar -- bars for your foo" into "Foo::Bar"
284 sub get_title { shift->_get_titled_section(
285 'NAME', max_token => 50, desperate => 1, @_)
287 sub get_version { shift->_get_titled_section(
290 accept_verbatim => 1,
291 max_content_length => 3_000,
295 sub get_description { shift->_get_titled_section(
298 max_content_length => 3_000,
302 sub get_authors { my $self = shift; return $self->get_author(@_) } # a harmless alias: hands every argument through to get_author
306 # Max_token is so high because these are
307 # typically at the end of the document:
308 $this->_get_titled_section('AUTHOR' , max_token => 10_000, @_) ||
309 $this->_get_titled_section('AUTHORS', max_token => 10_000, @_);
312 #--------------------------------------------------------------------------
314 sub _get_titled_section {
315 # Based on a get_title originally contributed by Graham Barr
316 my($self, $titlename, %options) = (@_);
318 my $max_token = delete $options{'max_token'};
319 my $desperate_for_title = delete $options{'desperate'};
320 my $accept_verbatim = delete $options{'accept_verbatim'};
321 my $max_content_length = delete $options{'max_content_length'};
322 my $nocase = delete $options{'nocase'};
323 $max_content_length = 120 unless defined $max_content_length;
325 Carp::croak( "Unknown " . ((1 == keys %options) ? "option: " : "options: ")
326 . join " ", map "[$_]", sort keys %options
330 my %content_containers;
331 $content_containers{'Para'} = 1;
332 if($accept_verbatim) {
333 $content_containers{'Verbatim'} = 1;
334 $content_containers{'VerbatimFormatted'} = 1;
343 Carp::croak "What kind of titlename is \"$titlename\"?!" unless
344 defined $titlename and $titlename =~ m/^[A-Z ]{1,60}$/s; #sanity
345 my $titlename_re = quotemeta($titlename);
347 my $head1_text_content;
348 my $para_text_content;
351 ++$token_count <= ($max_token || 1_000_000)
352 and defined(my $token = $self->get_token)
354 push @to_unget, $token;
356 if ($state == 0) { # seeking =head1
357 if( $token->is_start and $token->tagname eq 'head1' ) {
358 DEBUG and print " Found head1. Seeking content...\n";
360 $head1_text_content = '';
364 elsif($state == 1) { # accumulating text until end of head1
365 if( $token->is_text ) {
366 DEBUG and print " Adding \"", $token->text, "\" to head1-content.\n";
367 $head1_text_content .= $token->text;
368 } elsif( $token->is_end and $token->tagname eq 'head1' ) {
369 DEBUG and print " Found end of head1. Considering content...\n";
370 $head1_text_content = uc $head1_text_content if $nocase;
371 if($head1_text_content eq $titlename
372 or $head1_text_content =~ m/\($titlename_re\)/s
373 # We accept "=head1 Nomen Modularis (NAME)" for sake of i18n
375 DEBUG and print " Yup, it was $titlename. Seeking next para-content...\n";
379 # if we're so desperate we'll take the first
380 # =head1's content as a title
381 and $head1_text_content =~ m/\S/
382 and $head1_text_content !~ m/^[ A-Z]+$/s
383 and $head1_text_content !~
385 NAME | TITLE | VERSION | AUTHORS? | DESCRIPTION | SYNOPSIS
386 | COPYRIGHT | LICENSE | NOTES? | FUNCTIONS? | METHODS?
387 | CAVEATS? | BUGS? | SEE\ ALSO | SWITCHES | ENVIRONMENT
389 # avoid accepting things like =head1 Thingy Thongy (DESCRIPTION)
390 and ($max_content_length
391 ? (length($head1_text_content) <= $max_content_length) # sanity
394 DEBUG and print " It looks titular: \"$head1_text_content\".\n",
396 $title = $head1_text_content;
400 DEBUG and print " Didn't look titular ($head1_text_content).\n",
401 "\n Dropping back to seeking-head1-content mode...\n";
407 # seeking start of para (which must immediately follow)
408 if($token->is_start and $content_containers{ $token->tagname }) {
409 DEBUG and print " Found start of Para. Accumulating content...\n";
410 $para_text_content = '';
414 " Didn't see an immediately subsequent start-Para. Reseeking H1\n";
420 # accumulating text until end of Para
421 if( $token->is_text ) {
422 DEBUG and print " Adding \"", $token->text, "\" to para-content.\n";
423 $para_text_content .= $token->text;
426 } elsif( $token->is_end and $content_containers{ $token->tagname } ) {
427 DEBUG and print " Found end of Para. Considering content: ",
428 $para_text_content, "\n";
430 if( $para_text_content =~ m/\S/
431 and ($max_content_length
432 ? (length($para_text_content) <= $max_content_length)
435 # Some minimal sanity constraints, I think.
436 DEBUG and print " It looks contentworthy, I guess. Using it.\n";
437 $title = $para_text_content;
440 DEBUG and print " Doesn't look at all contentworthy!\n Giving up.\n";
448 die "IMPOSSIBLE STATE $state!\n"; # should never happen
454 $self->unget_token(@to_unget);
457 if(defined $title) { print " Returing title <$title>\n" }
458 else { print "Returning title <>\n" }
461 return '' unless defined $title;
466 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
468 # Methods that actually do work at parse-time:
470 sub _handle_element_start {
471 my $self = shift; # leaving ($element_name, $attr_hash_r)
472 DEBUG > 2 and print "++ $_[0] (", map("<$_> ", %{$_[1]}), ")\n";
474 push @{ $self->{'token_buffer'} },
475 $self->{'start_token_class'}->new(@_);
480 my $self = shift; # leaving ($text)
481 DEBUG > 2 and print "== $_[0]\n";
482 push @{ $self->{'token_buffer'} },
483 $self->{'text_token_class'}->new(@_);
487 sub _handle_element_end {
488 my $self = shift; # leaving ($element_name);
489 DEBUG > 2 and print "-- $_[0]\n";
490 push @{ $self->{'token_buffer'} },
491 $self->{'end_token_class'}->new(@_);
495 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
504 Pod::Simple::PullParser -- a pull-parser interface to parsing Pod
508 my $parser = SomePodProcessor->new;
509 $parser->set_source( "whatever.pod" );
514 my $parser = SomePodProcessor->new;
515 $parser->set_source( $some_filehandle_object );
520 my $parser = SomePodProcessor->new;
521 $parser->set_source( \$document_source );
526 my $parser = SomePodProcessor->new;
527 $parser->set_source( \@document_lines );
533 package SomePodProcessor;
535 use base qw(Pod::Simple::PullParser);
540 while(my $token = $self->get_token) {
541 ...process each token...
547 This class is for using Pod::Simple to build a Pod processor -- but
548 one that uses an interface based on a stream of token objects,
549 instead of based on events.
551 This is a subclass of L<Pod::Simple> and inherits all its methods.
553 A subclass of Pod::Simple::PullParser should define a C<run> method
554 that calls C<< $token = $parser->get_token >> to pull tokens.
556 See the source for Pod::Simple::RTF for an example of a formatter
557 that uses Pod::Simple::PullParser.
563 =item my $token = $parser->get_token
565 This returns the next token object (which will be of a subclass of
566 L<Pod::Simple::PullParserToken>), or undef if the parser-stream has hit
567 the end of the document.
569 =item $parser->unget_token( $token )
571 =item $parser->unget_token( $token1, $token2, ... )
573 This restores the token object(s) to the front of the parser stream.
577 The source has to be set before you can parse anything. The lowest-level
578 way is to call C<set_source>:
582 =item $parser->set_source( $filename )
584 =item $parser->set_source( $filehandle_object )
586 =item $parser->set_source( \$document_source )
588 =item $parser->set_source( \@document_lines )
592 Or you can call these methods, which Pod::Simple::PullParser has defined
593 to work just like Pod::Simple's same-named methods:
597 =item $parser->parse_file(...)
599 =item $parser->parse_string_document(...)
601 =item $parser->filter(...)
603 =item $parser->parse_from_file(...)
607 For those to work, the Pod-processing subclass of
608 Pod::Simple::PullParser has to have defined a $parser->run method --
609 so it is advised that all Pod::Simple::PullParser subclasses do so.
610 See the Synopsis above, or the source for Pod::Simple::RTF.
612 Authors of formatter subclasses might find these methods useful to
613 call on a parser object that you haven't started pulling tokens
618 =item my $title_string = $parser->get_title
620 This tries to get the title string out of $parser, by getting some tokens,
621 and scanning them for the title, and then ungetting them so that you can
622 process the token-stream from the beginning.
624 For example, suppose you have a document that starts out:
628 Hoo::Boy::Wowza -- Stuff B<wow> yeah!
630 $parser->get_title on that document will return "Hoo::Boy::Wowza --
631 Stuff wow yeah!". If the document starts with:
635 Hoo::Boy::W00t -- Stuff B<w00t> yeah!
637 Then you'll need to pass the C<nocase> option in order to recognize "Name":
639 $parser->get_title(nocase => 1);
641 In cases where get_title can't find the title, it will return empty-string
644 =item my $title_string = $parser->get_short_title
646 This is just like get_title, except that it returns just the modulename, if
647 the title seems to be of the form "SomeModuleName -- description".
649 For example, suppose you have a document that starts out:
653 Hoo::Boy::Wowza -- Stuff B<wow> yeah!
655 then $parser->get_short_title on that document will return
658 But if the document starts out:
662 Hooboy, stuff B<wow> yeah!
664 then $parser->get_short_title on that document will return "Hooboy,
665 stuff wow yeah!". If the document starts with:
669 Hoo::Boy::W00t -- Stuff B<w00t> yeah!
671 Then you'll need to pass the C<nocase> option in order to recognize "Name":
673 $parser->get_short_title(nocase => 1);
675 If the title can't be found, then get_short_title returns empty-string
678 =item $author_name = $parser->get_author
680 This works like get_title except that it returns the contents of the
681 "=head1 AUTHOR\n\nParagraph...\n" section, assuming that that section
682 isn't terribly long. To recognize a "=head1 Author\n\nParagraph\n"
683 section, pass the C<nocase> option:
685 $parser->get_author(nocase => 1);
687 (This method tolerates "AUTHORS" instead of "AUTHOR" too.)
689 =item $description_name = $parser->get_description
691 This works like get_title except that it returns the contents of the
692 "=head1 DESCRIPTION\n\nParagraph...\n" section, assuming that that section
693 isn't terribly long. To recognize a "=head1 Description\n\nParagraph\n"
694 section, pass the C<nocase> option:
696 $parser->get_description(nocase => 1);
698 =item $version_block = $parser->get_version
700 This works like get_title except that it returns the contents of
701 the "=head1 VERSION\n\n[BIG BLOCK]\n" block. Note that this does NOT
702 return the module's C<$VERSION>!! To recognize a
703 "=head1 Version\n\n[BIG BLOCK]\n" section, pass the C<nocase> option:
705 $parser->get_version(nocase => 1);
711 You don't actually I<have> to define a C<run> method. If you're
712 writing a Pod-formatter class, you should define a C<run> just so
713 that users can call C<parse_file> etc, but you don't I<have> to.
715 And if you're not writing a formatter class, but are instead just
716 writing a program that does something simple with a Pod::Simple::PullParser
717 object (and not an object of a subclass), then there's no reason to
718 bother subclassing to add a C<run> method.
724 L<Pod::Simple::PullParserToken> -- and its subclasses
725 L<Pod::Simple::PullParserStartToken>,
726 L<Pod::Simple::PullParserTextToken>, and
727 L<Pod::Simple::PullParserEndToken>.
729 L<HTML::TokeParser>, which inspired this.
731 =head1 COPYRIGHT AND DISCLAIMERS
733 Copyright (c) 2002 Sean M. Burke. All rights reserved.
735 This library is free software; you can redistribute it and/or modify it
736 under the same terms as Perl itself.
738 This program is distributed in the hope that it will be useful, but
739 without any warranty; without even the implied warranty of
740 merchantability or fitness for a particular purpose.
744 Sean M. Burke C<sburke@cpan.org>
752 sub _old_get_title { # some witchery in here
758 push @to_unget, $self->get_token;
759 unless(defined $to_unget[-1]) { # whoops, short doc!
764 DEBUG and print "-Got token ", $to_unget[-1]->dump, "\n";
766 (DEBUG and print "Too much in the buffer.\n"),
767 last if @to_unget > 25; # sanity
770 if( #$to_unget[-1]->type eq 'end'
771 #and $to_unget[-1]->tagname eq 'Para'
775 ($_->type eq 'start') ? ("<" . $_->tagname .">")
776 : ($_->type eq 'end' ) ? ("</". $_->tagname .">")
777 : ($_->type eq 'text' ) ? ($_->text =~ m<^([A-Z]+)$>s ? $1 : 'X')
780 )) =~ m{<head1>NAME</head1><Para>(X|</?[BCIFLS]>)+</Para>$}s
782 # Whee, it fits the pattern
783 DEBUG and print "Seems to match =head1 NAME pattern.\n";
785 foreach my $t (reverse @to_unget) {
786 last if $t->type eq 'start' and $t->tagname eq 'Para';
787 $title = $t->text . $title if $t->type eq 'text';
789 undef $title if $title =~ m<^\s*$>; # make sure it's contentful!
792 } elsif ($pattern =~ m{<head(\d)>(.+)</head\d>$}
793 and !( $1 eq '1' and $2 eq 'NAME' )
795 # Well, it fits a fallback pattern
796 DEBUG and print "Seems to match NAMEless pattern.\n";
798 foreach my $t (reverse @to_unget) {
799 last if $t->type eq 'start' and $t->tagname =~ m/^head\d$/s;
800 $title = $t->text . $title if $t->type eq 'text';
802 undef $title if $title =~ m<^\s*$>; # make sure it's contentful!
806 DEBUG and $pattern and print "Leading pattern: $pattern\n";
811 $self->unget_token(@to_unget);
814 if(defined $title) { print " Returing title <$title>\n" }
815 else { print "Returning title <>\n" }
818 return '' unless defined $title;