1 # Pod::PlainText -- Convert POD data to formatted ASCII text.
2 # $Id: Text.pm,v 2.1 1999/09/20 11:53:33 eagle Exp $
4 # Copyright 1999-2000 by Russ Allbery <rra@stanford.edu>
6 # This program is free software; you can redistribute it and/or modify it
7 # under the same terms as Perl itself.
9 # This module is intended to be a replacement for Pod::Text, and attempts to
10 # match its output except for some specific circumstances where other
11 # decisions seemed to produce better output. It uses Pod::Parser and is
12 # designed to be very easy to subclass.
14 ############################################################################
15 # Modules and declarations
16 ############################################################################
18 package Pod::PlainText;
22 use Carp qw(carp croak);
26 use vars qw(@ISA %ESCAPES $VERSION);
28 # We inherit from Pod::Select instead of Pod::Parser so that we can be used
30 @ISA = qw(Pod::Select);
35 ############################################################################
36 # Table of supported E<> escapes
37 ############################################################################
39 # This table is taken near verbatim from Pod::PlainText in Pod::Parser,
40 # which got it near verbatim from the original Pod::Text. It is therefore
41 # credited to Tom Christiansen, and I'm glad I didn't have to write it. :)
43 'amp' => '&', # ampersand
44 'lt' => '<', # left chevron, less-than
45 'gt' => '>', # right chevron, greater-than
46 'quot' => '"', # double quote
48 "Aacute" => "\xC1", # capital A, acute accent
49 "aacute" => "\xE1", # small a, acute accent
50 "Acirc" => "\xC2", # capital A, circumflex accent
51 "acirc" => "\xE2", # small a, circumflex accent
52 "AElig" => "\xC6", # capital AE diphthong (ligature)
53 "aelig" => "\xE6", # small ae diphthong (ligature)
54 "Agrave" => "\xC0", # capital A, grave accent
55 "agrave" => "\xE0", # small a, grave accent
56 "Aring" => "\xC5", # capital A, ring
57 "aring" => "\xE5", # small a, ring
58 "Atilde" => "\xC3", # capital A, tilde
59 "atilde" => "\xE3", # small a, tilde
60 "Auml" => "\xC4", # capital A, dieresis or umlaut mark
61 "auml" => "\xE4", # small a, dieresis or umlaut mark
62 "Ccedil" => "\xC7", # capital C, cedilla
63 "ccedil" => "\xE7", # small c, cedilla
64 "Eacute" => "\xC9", # capital E, acute accent
65 "eacute" => "\xE9", # small e, acute accent
66 "Ecirc" => "\xCA", # capital E, circumflex accent
67 "ecirc" => "\xEA", # small e, circumflex accent
68 "Egrave" => "\xC8", # capital E, grave accent
69 "egrave" => "\xE8", # small e, grave accent
70 "ETH" => "\xD0", # capital Eth, Icelandic
71 "eth" => "\xF0", # small eth, Icelandic
72 "Euml" => "\xCB", # capital E, dieresis or umlaut mark
73 "euml" => "\xEB", # small e, dieresis or umlaut mark
74 "Iacute" => "\xCD", # capital I, acute accent
75 "iacute" => "\xED", # small i, acute accent
76 "Icirc" => "\xCE", # capital I, circumflex accent
77 "icirc" => "\xEE", # small i, circumflex accent
# I-grave is 0xCC / 0xEC in ISO 8859-1.  0xCD / 0xED are I-acute, which the
# Iacute / iacute entries of this table already cover.
"Igrave" => "\xCC", # capital I, grave accent
"igrave" => "\xEC", # small i, grave accent
80 "Iuml" => "\xCF", # capital I, dieresis or umlaut mark
81 "iuml" => "\xEF", # small i, dieresis or umlaut mark
82 "Ntilde" => "\xD1", # capital N, tilde
83 "ntilde" => "\xF1", # small n, tilde
84 "Oacute" => "\xD3", # capital O, acute accent
85 "oacute" => "\xF3", # small o, acute accent
86 "Ocirc" => "\xD4", # capital O, circumflex accent
87 "ocirc" => "\xF4", # small o, circumflex accent
88 "Ograve" => "\xD2", # capital O, grave accent
89 "ograve" => "\xF2", # small o, grave accent
90 "Oslash" => "\xD8", # capital O, slash
91 "oslash" => "\xF8", # small o, slash
92 "Otilde" => "\xD5", # capital O, tilde
93 "otilde" => "\xF5", # small o, tilde
94 "Ouml" => "\xD6", # capital O, dieresis or umlaut mark
95 "ouml" => "\xF6", # small o, dieresis or umlaut mark
96 "szlig" => "\xDF", # small sharp s, German (sz ligature)
97 "THORN" => "\xDE", # capital THORN, Icelandic
98 "thorn" => "\xFE", # small thorn, Icelandic
99 "Uacute" => "\xDA", # capital U, acute accent
100 "uacute" => "\xFA", # small u, acute accent
101 "Ucirc" => "\xDB", # capital U, circumflex accent
102 "ucirc" => "\xFB", # small u, circumflex accent
103 "Ugrave" => "\xD9", # capital U, grave accent
104 "ugrave" => "\xF9", # small u, grave accent
105 "Uuml" => "\xDC", # capital U, dieresis or umlaut mark
106 "uuml" => "\xFC", # small u, dieresis or umlaut mark
107 "Yacute" => "\xDD", # capital Y, acute accent
108 "yacute" => "\xFD", # small y, acute accent
109 "yuml" => "\xFF", # small y, dieresis or umlaut mark
111 "lchevron" => "\xAB", # left chevron (double less than)
112 "rchevron" => "\xBB", # right chevron (double greater than)
116 ############################################################################
118 ############################################################################
120 # Initialize the object. Must be sure to call our parent initializer.
124 $$self{alt} = 0 unless defined $$self{alt};
125 $$self{indent} = 4 unless defined $$self{indent};
126 $$self{loose} = 0 unless defined $$self{loose};
127 $$self{sentence} = 0 unless defined $$self{sentence};
128 $$self{width} = 76 unless defined $$self{width};
130 $$self{INDENTS} = []; # Stack of indentations.
131 $$self{MARGIN} = $$self{indent}; # Current left margin in spaces.
133 $self->SUPER::initialize;
137 ############################################################################
139 ############################################################################
141 # Called for each command paragraph. Gets the command, the associated
142 # paragraph, the line number, and a Pod::Paragraph object. Just dispatches
143 # the command to a method named the same as the command. =cut is handled
144 # internally by Pod::Parser.
148 return if $command eq 'pod';
149 return if ($$self{EXCLUDE} && $command ne 'end');
150 $self->item ("\n") if defined $$self{ITEM};
151 $command = 'cmd_' . $command;
152 $self->$command (@_);
155 # Called for a verbatim paragraph. Gets the paragraph, the line number, and
156 # a Pod::Paragraph object. Just output it verbatim, but with tabs converted
160 return if $$self{EXCLUDE};
161 $self->item if defined $$self{ITEM};
164 s/^(\s*\S+)/(' ' x $$self{MARGIN}) . $1/gme;
168 # Called for a regular text block. Gets the paragraph, the line number, and
169 # a Pod::Paragraph object. Perform interpolation and output the results.
172 return if $$self{EXCLUDE};
173 $self->output ($_[0]), return if $$self{VERBATIM};
177 # Perform a little magic to collapse multiple L<> references. This is
178 # here mostly for backwards-compatibility. We'll just rewrite the whole
179 # thing into actual text at this part, bypassing the whole internal
180 # sequence parsing thing.
183 L< # A link of the form L</something>.
186 [:\w]+ # The item has to be a simple word...
187 (\(\))? # ...or simple function.
191 ,?\s+(and\s+)? # Allow lots of them, conjuncted.
204 my @items = split /(?:,?\s+(?:and\s+)?)/;
207 for ($i = 0; $i < @items; $i++) {
208 $string .= $items[$i];
209 $string .= ", " if @items > 2 && $i != $#items;
210 $string .= " and " if ($i == $#items - 1);
212 $string .= " entries elsewhere in this document";
216 # Now actually interpolate and output the paragraph.
217 $_ = $self->interpolate ($_, $line);
219 if (defined $$self{ITEM}) {
220 $self->item ($_ . "\n");
222 $self->output ($self->reformat ($_ . "\n"));
226 # Called for an interior sequence. Gets the command, argument, and a
227 # Pod::InteriorSequence object and is expected to return the resulting text.
228 # Calls code, bold, italic, file, and link to handle those types of
229 # sequences, and handles S<>, E<>, X<>, and Z<> directly.
230 sub interior_sequence {
234 return '' if ($command eq 'X' || $command eq 'Z');
236 # Expand escapes into the actual character now, carping if invalid.
237 if ($command eq 'E') {
238 return $ESCAPES{$_} if defined $ESCAPES{$_};
239 carp "Unknown escape: E<$_>";
243 # For all the other sequences, empty content produces no output.
246 # For S<>, compress all internal whitespace and then map spaces to \01.
247 # When we output the text, we'll map this back.
248 if ($command eq 'S') {
254 # Anything else needs to get dispatched to another method.
255 if ($command eq 'B') { return $self->seq_b ($_) }
256 elsif ($command eq 'C') { return $self->seq_c ($_) }
257 elsif ($command eq 'F') { return $self->seq_f ($_) }
258 elsif ($command eq 'I') { return $self->seq_i ($_) }
259 elsif ($command eq 'L') { return $self->seq_l ($_) }
260 else { carp "Unknown sequence $command<$_>" }
263 # Called for each paragraph that's actually part of the POD. We take
264 # advantage of this opportunity to untabify the input.
265 sub preprocess_paragraph {
268 1 while s/^(.*?)(\t+)/$1 . ' ' x (length ($2) * 8 - length ($1) % 8)/me;
273 ############################################################################
275 ############################################################################
277 # All command paragraphs take the paragraph and the line number.
279 # First level heading.
284 $_ = $self->interpolate ($_, shift);
286 $self->output ("\n==== $_ ====\n\n");
288 $_ .= "\n" if $$self{loose};
289 $self->output ($_ . "\n");
293 # Second level heading.
298 $_ = $self->interpolate ($_, shift);
300 $self->output ("\n== $_ ==\n\n");
302 $self->output (' ' x ($$self{indent} / 2) . $_ . "\n\n");
306 # third level heading - not strictly perlpodspec compliant
311 $_ = $self->interpolate ($_, shift);
313 $self->output ("\n= $_ =\n");
315 $self->output (' ' x ($$self{indent}) . $_ . "\n");
319 # fourth level heading - not strictly perlpodspec compliant
321 *cmd_head4 = \&cmd_head3;
327 unless (/^[-+]?\d+\s+$/) { $_ = $$self{indent} }
328 push (@{ $$self{INDENTS} }, $$self{MARGIN});
329 $$self{MARGIN} += ($_ + 0);
335 $$self{MARGIN} = pop @{ $$self{INDENTS} };
336 unless (defined $$self{MARGIN}) {
337 carp "Unmatched =back";
338 $$self{MARGIN} = $$self{indent};
342 # An individual list item.
345 if (defined $$self{ITEM}) { $self->item }
348 $$self{ITEM} = $self->interpolate ($_);
351 # Begin a block for a particular translator. Setting VERBATIM triggers
352 # special handling in textblock().
356 my ($kind) = /^(\S+)/ or return;
357 if ($kind eq 'text') {
358 $$self{VERBATIM} = 1;
364 # End a block for a particular translator. We assume that all =begin/=end
365 # pairs are properly closed.
369 $$self{VERBATIM} = 0;
372 # One paragraph for a particular translator. Ignore it unless it's intended
373 # for text, in which case we treat it as a verbatim text block.
378 return unless s/^text\b[ \t]*\n?//;
379 $self->verbatim ($_, $line);
383 ############################################################################
385 ############################################################################
387 # The simple formatting ones. These are here mostly so that subclasses can
388 # override them and do more complicated things.
# Format the content of a B<> sequence.  Takes the object and the already
# interpolated text.  With the alt option set the text comes back wrapped
# in ``...''; otherwise it is returned untouched.
sub seq_b {
    my ($self, $text) = @_;
    return $text unless $$self{alt};
    return "``" . $text . "''";
}
# Format the content of a C<> sequence.  Takes the object and the already
# interpolated text.  The text is always quoted: ``...'' when the alt
# option is set, `...' otherwise.
sub seq_c {
    my ($self, $text) = @_;
    my ($open, $close) = $$self{alt} ? ("``", "''") : ("`", "'");
    return $open . $text . $close;
}
# Format the content of an F<> sequence.  Takes the object and the already
# interpolated text.  With the alt option set the text is wrapped in double
# quotes; otherwise it is returned untouched.
sub seq_f {
    my ($self, $text) = @_;
    if ($$self{alt}) {
        return '"' . $text . '"';
    }
    return $text;
}
# Format the content of an I<> sequence.  Takes the object (unused here) and
# the already interpolated text, and returns the text surrounded by
# asterisks regardless of the alt option.
sub seq_i {
    my $text = $_[1];
    return "*${text}*";
}
394 # The complicated one. Handle links. Since this is plain text, we can't
395 # actually make any real links, so this is all to figure out what text we
401 # Smash whitespace in case we were split across multiple lines.
404 # If we were given any explicit text, just output it.
405 if (/^([^|]+)\|/) { return $1 }
407 # Okay, leading and trailing whitespace isn't important; get rid of it.
411 # Default to using the whole content of the link entry as a section
412 # name. Note that L<manpage/> forces a manpage interpretation, as does
413 # something looking like L<manpage(section)>. The latter is an
414 # enhancement over the original Pod::Text.
415 my ($manpage, $section) = ('', $_);
416 if (/^(?:https?|ftp|news):/) {
419 } elsif (/^"\s*(.*?)\s*"$/) {
420 $section = '"' . $1 . '"';
421 } elsif (m/^[-:.\w]+(?:\(\S+\))?$/) {
422 ($manpage, $section) = ($_, '');
424 ($manpage, $section) = split (/\s*\/\s*/, $_, 2);
428 # Now build the actual output text.
429 if (!length $section) {
430 $text = "the $manpage manpage" if length $manpage;
431 } elsif ($section =~ /^[:\w]+(?:\(\))?/) {
432 $text .= 'the ' . $section . ' entry';
433 $text .= (length $manpage) ? " in the $manpage manpage"
434 : " elsewhere in this document";
436 $section =~ s/^\"\s*//;
437 $section =~ s/\s*\"$//;
438 $text .= 'the section on "' . $section . '"';
439 $text .= " in the $manpage manpage" if length $manpage;
445 ############################################################################
447 ############################################################################
449 # This method is called whenever an =item command is complete (in other
450 # words, we've seen its associated paragraph or know for certain that it
451 # doesn't have one). It gets the paragraph associated with the item as an
452 # argument. If that argument is empty, just output the item tag; if it
453 # contains a newline, output the item tag followed by the newline.
454 # Otherwise, see if there's enough room for us to output the item tag in the
455 # margin of the text or if we have to put it on a separate line.
459 my $tag = $$self{ITEM};
460 unless (defined $tag) {
461 carp "item called without tag";
465 my $indent = $$self{INDENTS}[-1];
466 unless (defined $indent) { $indent = $$self{indent} }
467 my $space = ' ' x $indent;
468 $space =~ s/^ /:/ if $$self{alt};
469 if (!$_ || /^\s+$/ || ($$self{MARGIN} - $indent < length ($tag) + 1)) {
470 my $margin = $$self{MARGIN};
471 $$self{MARGIN} = $indent;
472 my $output = $self->reformat ($tag);
473 $output =~ s/\n*$/\n/;
474 $self->output ($output);
475 $$self{MARGIN} = $margin;
476 $self->output ($self->reformat ($_)) if /\S/;
478 $_ = $self->reformat ($_);
479 s/^ /:/ if ($$self{alt} && $indent > 0);
480 my $tagspace = ' ' x length $tag;
481 s/^($space)$tagspace/$1$tag/ or warn "Bizarre space in item";
487 ############################################################################
489 ############################################################################
491 # Wrap a line, indenting by the current left margin. We can't use
492 # Text::Wrap because it plays games with tabs. We can't use formline, even
493 # though we'd really like to, because it screws up non-printing characters.
494 # So we have to do the wrapping ourselves.
499 my $spaces = ' ' x $$self{MARGIN};
500 my $width = $$self{width} - $$self{MARGIN};
501 while (length > $width) {
502 if (s/^([^\n]{0,$width})\s+// || s/^([^\n]{$width})//) {
503 $output .= $spaces . $1 . "\n";
508 $output .= $spaces . $_;
509 $output =~ s/\s+$/\n\n/;
513 # Reformat a paragraph of text for the current margin. Takes the text to
514 # reformat and returns the formatted text.
519 # If we're trying to preserve two spaces after sentences, do some
520 # munging to support that. Otherwise, smash all repeated whitespace.
521 if ($$self{sentence}) {
532 # Output text to the output device.
# Output text to the output device.  Takes the object and the text to print.
# Every \01 byte (the placeholder that S<> handling substitutes for spaces)
# is mapped back to a real space before printing to the handle returned by
# output_handle().  Returns the result of print.
#
# The translation is applied to a private copy of the text.  @_ aliases the
# caller's variables, so running tr/// on $_[1] directly would rewrite the
# caller's string and would die with "Modification of a read-only value
# attempted" when the argument is a constant.
sub output {
    my ($self, $text) = @_;
    $text =~ tr/\01/ /;
    my $handle = $self->output_handle;
    return print $handle $text;
}
536 ############################################################################
537 # Backwards compatibility
538 ############################################################################
540 # The old Pod::Text module did everything in a pod2text() function. This
541 # tries to provide the same interface for legacy applications.
545 # This is really ugly; I hate doing option parsing in the middle of a
546 # module. But the old Pod::Text module supported passing flags to its
547 # entry function, so handle -a and -<number>.
548 while ($_[0] =~ /^-/) {
550 if ($flag eq '-a') { push (@args, alt => 1) }
551 elsif ($flag =~ /^-(\d+)$/) { push (@args, width => $1) }
558 # Now that we know what arguments we're using, create the parser.
559 my $parser = Pod::PlainText->new (@args);
561 # If two arguments were given, the second argument is going to be a file
562 # handle. That means we want to call parse_from_filehandle(), which
563 # means we need to turn the first argument into a file handle. Magic
564 # open will handle the <&STDIN case automagically.
567 unless (open (IN, $_[0])) {
568 croak ("Can't open $_[0] for reading: $!\n");
572 return $parser->parse_from_filehandle (@_);
574 return $parser->parse_from_file (@_);
579 ############################################################################
580 # Module return value and documentation
581 ############################################################################
588 Pod::PlainText - Convert POD data to formatted ASCII text
593 my $parser = Pod::PlainText->new (sentence => 0, width => 78);
595 # Read POD from STDIN and write to STDOUT.
596 $parser->parse_from_filehandle;
598 # Read POD from file.pod and write to file.txt.
599 $parser->parse_from_file ('file.pod', 'file.txt');
603 Pod::PlainText is a module that can convert documentation in the POD format (the
604 preferred language for documenting Perl) into formatted ASCII. It uses no
605 special formatting controls or codes whatsoever, and its output is therefore
606 suitable for nearly any device.
608 As a derived class from Pod::Parser, Pod::PlainText supports the same methods and
609 interfaces. See L<Pod::Parser> for all the details; briefly, one creates a
610 new parser with C<Pod::PlainText-E<gt>new()> and then calls either
611 parse_from_filehandle() or parse_from_file().
613 new() can take options, in the form of key/value pairs, that control the
614 behavior of the parser. The currently recognized options are:
620 If set to a true value, selects an alternate output format that, among other
621 things, uses a different heading style and marks C<=item> entries with a
622 colon in the left margin. Defaults to false.
626 The number of spaces to indent regular text, and the default indentation for
627 C<=over> blocks. Defaults to 4.
631 If set to a true value, a blank line is printed after a C<=head1> heading.
632 If set to false (the default), no blank line is printed after C<=head1>,
633 although one is still printed after C<=head2>. This is the default because
634 it's the expected formatting for manual pages; if you're formatting
635 arbitrary text documents, setting this to true may result in more pleasing
640 If set to a true value, Pod::PlainText will assume that each sentence ends in two
641 spaces, and will try to preserve that spacing. If set to false, all
642 consecutive whitespace in non-verbatim paragraphs is compressed into a
643 single space. Defaults to false.
647 The column at which to wrap text on the right-hand side. Defaults to 76.
651 The standard Pod::Parser method parse_from_filehandle() takes up to two
652 arguments, the first being the file handle to read POD from and the second
653 being the file handle to write the formatted output to. The first defaults
654 to STDIN if not given, and the second defaults to STDOUT. The method
655 parse_from_file() is almost identical, except that its two arguments are the
656 input and output disk files instead. See L<Pod::Parser> for the specific
663 =item Bizarre space in item
665 (W) Something has gone wrong in internal C<=item> processing. This message
666 indicates a bug in Pod::PlainText; you should never see it.
668 =item Can't open %s for reading: %s
670 (F) Pod::PlainText was invoked via the compatibility mode pod2text() interface
671 and the input file it was given could not be opened.
673 =item Unknown escape: %s
675 (W) The POD source contained an C<EE<lt>E<gt>> escape that Pod::PlainText didn't
678 =item Unknown sequence %s
680 (W) The POD source contained a non-standard internal sequence (something of
681 the form C<XE<lt>E<gt>>) that Pod::PlainText didn't know about.
683 =item Unmatched =back
685 (W) Pod::PlainText encountered a C<=back> command that didn't correspond to an
692 Embedded Ctrl-As (octal 001) in the input will be mapped to spaces on
693 output, due to an internal implementation detail.
697 This is a replacement for an earlier Pod::Text module written by Tom
698 Christiansen. It has a revamped interface, since it now uses Pod::Parser,
699 but an interface roughly compatible with the old Pod::Text::pod2text()
700 function is still available. Please change to the new calling convention,
703 The original Pod::Text contained code to do formatting via termcap
704 sequences, although it wasn't turned on by default and it was problematic to
705 get it to work at all. This rewrite doesn't even try to do that, but a
706 subclass of it does. Look for L<Pod::Text::Termcap|Pod::Text::Termcap>.
710 L<Pod::Parser|Pod::Parser>, L<Pod::Text::Termcap|Pod::Text::Termcap>,
715 Please report bugs using L<http://rt.cpan.org>.
717 Russ Allbery E<lt>rra@stanford.eduE<gt>, based I<very> heavily on the
718 original Pod::Text by Tom Christiansen E<lt>tchrist@mox.perl.comE<gt> and
719 its conversion to Pod::Parser by Brad Appleton
720 E<lt>bradapp@enteract.comE<gt>.