1 # Pod::Text -- Convert POD data to formatted ASCII text.
2 # $Id: Text.pm,v 2.3 1999/10/07 09:41:57 eagle Exp $
4 # Copyright 1999 by Russ Allbery <rra@stanford.edu>
6 # This program is free software; you can redistribute it and/or modify it
7 # under the same terms as Perl itself.
9 # This module is intended to be a replacement for Pod::Text, and attempts to
10 # match its output except for some specific circumstances where other
11 # decisions seemed to produce better output. It uses Pod::Parser and is
12 # designed to be very easy to subclass.
14 ############################################################################
15 # Modules and declarations
16 ############################################################################
22 use Carp qw(carp croak);
27 use vars qw(@ISA @EXPORT %ESCAPES $VERSION);
29 # We inherit from Pod::Select instead of Pod::Parser so that we can be used
31 @ISA = qw(Pod::Select Exporter);
33 # We have to export pod2text for backward compatibility.
34 @EXPORT = qw(pod2text);
36 ($VERSION = (split (' ', q$Revision: 2.3 $ ))[1]) =~ s/\.(\d)$/.0$1/;
39 ############################################################################
40 # Table of supported E<> escapes
41 ############################################################################
43 # This table is taken near verbatim from Pod::PlainText in Pod::Parser,
44 # which got it near verbatim from the original Pod::Text. It is therefore
45 # credited to Tom Christiansen, and I'm glad I didn't have to write it. :)
46 # "iexcl" to "divide" added by Tim Jenness
48 'amp' => '&', # ampersand
49 'lt' => '<', # left chevron, less-than
50 'gt' => '>', # right chevron, greater-than
51 'quot' => '"', # double quote
53 "Aacute" => "\xC1", # capital A, acute accent
54 "aacute" => "\xE1", # small a, acute accent
55 "Acirc" => "\xC2", # capital A, circumflex accent
56 "acirc" => "\xE2", # small a, circumflex accent
57 "AElig" => "\xC6", # capital AE diphthong (ligature)
58 "aelig" => "\xE6", # small ae diphthong (ligature)
59 "Agrave" => "\xC0", # capital A, grave accent
60 "agrave" => "\xE0", # small a, grave accent
61 "Aring" => "\xC5", # capital A, ring
62 "aring" => "\xE5", # small a, ring
63 "Atilde" => "\xC3", # capital A, tilde
64 "atilde" => "\xE3", # small a, tilde
65 "Auml" => "\xC4", # capital A, dieresis or umlaut mark
66 "auml" => "\xE4", # small a, dieresis or umlaut mark
67 "Ccedil" => "\xC7", # capital C, cedilla
68 "ccedil" => "\xE7", # small c, cedilla
69 "Eacute" => "\xC9", # capital E, acute accent
70 "eacute" => "\xE9", # small e, acute accent
71 "Ecirc" => "\xCA", # capital E, circumflex accent
72 "ecirc" => "\xEA", # small e, circumflex accent
73 "Egrave" => "\xC8", # capital E, grave accent
74 "egrave" => "\xE8", # small e, grave accent
75 "ETH" => "\xD0", # capital Eth, Icelandic
76 "eth" => "\xF0", # small eth, Icelandic
77 "Euml" => "\xCB", # capital E, dieresis or umlaut mark
78 "euml" => "\xEB", # small e, dieresis or umlaut mark
79 "Iacute" => "\xCD", # capital I, acute accent
80 "iacute" => "\xED", # small i, acute accent
81 "Icirc" => "\xCE", # capital I, circumflex accent
82 "icirc" => "\xEE", # small i, circumflex accent
83 "Igrave" => "\xCC", # capital I, grave accent
84 "igrave" => "\xEC", # small i, grave accent
85 "Iuml" => "\xCF", # capital I, dieresis or umlaut mark
86 "iuml" => "\xEF", # small i, dieresis or umlaut mark
87 "Ntilde" => "\xD1", # capital N, tilde
88 "ntilde" => "\xF1", # small n, tilde
89 "Oacute" => "\xD3", # capital O, acute accent
90 "oacute" => "\xF3", # small o, acute accent
91 "Ocirc" => "\xD4", # capital O, circumflex accent
92 "ocirc" => "\xF4", # small o, circumflex accent
93 "Ograve" => "\xD2", # capital O, grave accent
94 "ograve" => "\xF2", # small o, grave accent
95 "Oslash" => "\xD8", # capital O, slash
96 "oslash" => "\xF8", # small o, slash
97 "Otilde" => "\xD5", # capital O, tilde
98 "otilde" => "\xF5", # small o, tilde
99 "Ouml" => "\xD6", # capital O, dieresis or umlaut mark
100 "ouml" => "\xF6", # small o, dieresis or umlaut mark
101 "szlig" => "\xDF", # small sharp s, German (sz ligature)
102 "THORN" => "\xDE", # capital THORN, Icelandic
103 "thorn" => "\xFE", # small thorn, Icelandic
104 "Uacute" => "\xDA", # capital U, acute accent
105 "uacute" => "\xFA", # small u, acute accent
106 "Ucirc" => "\xDB", # capital U, circumflex accent
107 "ucirc" => "\xFB", # small u, circumflex accent
108 "Ugrave" => "\xD9", # capital U, grave accent
109 "ugrave" => "\xF9", # small u, grave accent
110 "Uuml" => "\xDC", # capital U, dieresis or umlaut mark
111 "uuml" => "\xFC", # small u, dieresis or umlaut mark
112 "Yacute" => "\xDD", # capital Y, acute accent
113 "yacute" => "\xFD", # small y, acute accent
114 "yuml" => "\xFF", # small y, dieresis or umlaut mark
116 "lchevron" => "\xAB", # left chevron (double less than) laquo
117 "rchevron" => "\xBB", # right chevron (double greater than) raquo
119 "iexcl" => "\xA1", # inverted exclamation mark
120 "cent" => "\xA2", # cent sign
121 "pound" => "\xA3", # (UK) pound sign
122 "curren" => "\xA4", # currency sign
123 "yen" => "\xA5", # yen sign
124 "brvbar" => "\xA6", # broken vertical bar
125 "sect" => "\xA7", # section sign
126 "uml" => "\xA8", # dieresis
127 "copy" => "\xA9", # Copyright symbol
128 "ordf" => "\xAA", # feminine ordinal indicator
129 "laquo" => "\xAB", # left pointing double angle quotation mark
130 "not" => "\xAC", # not sign
131 "shy" => "\xAD", # soft hyphen
132 "reg" => "\xAE", # registered trademark
133 "macr" => "\xAF", # macron, overline
134 "deg" => "\xB0", # degree sign
135 "plusmn" => "\xB1", # plus-minus sign
136 "sup2" => "\xB2", # superscript 2
137 "sup3" => "\xB3", # superscript 3
138 "acute" => "\xB4", # acute accent
139 "micro" => "\xB5", # micro sign
140 "para" => "\xB6", # pilcrow sign = paragraph sign
141 "middot" => "\xB7", # middle dot = Georgian comma
142 "cedil" => "\xB8", # cedilla
143 "sup1" => "\xB9", # superscript 1
144 "ordm" => "\xBA", # masculine ordinal indicator
145 "raquo" => "\xBB", # right pointing double angle quotation mark
146 "frac14" => "\xBC", # vulgar fraction one quarter
147 "frac12" => "\xBD", # vulgar fraction one half
148 "frac34" => "\xBE", # vulgar fraction three quarters
149 "iquest" => "\xBF", # inverted question mark
150 "times" => "\xD7", # multiplication sign
151 "divide" => "\xF7", # division sign
155 ############################################################################
157 ############################################################################
159 # Initialize the object. Must be sure to call our parent initializer.
163 $$self{alt} = 0 unless defined $$self{alt};
164 $$self{indent} = 4 unless defined $$self{indent};
165 $$self{loose} = 0 unless defined $$self{loose};
166 $$self{sentence} = 0 unless defined $$self{sentence};
167 $$self{width} = 76 unless defined $$self{width};
169 $$self{INDENTS} = []; # Stack of indentations.
170 $$self{MARGIN} = $$self{indent}; # Current left margin in spaces.
172 $self->SUPER::initialize;
176 ############################################################################
178 ############################################################################
180 # Called for each command paragraph. Gets the command, the associated
181 # paragraph, the line number, and a Pod::Paragraph object. Just dispatches
182 # the command to a method named the same as the command. =cut is handled
183 # internally by Pod::Parser.
187 return if $command eq 'pod';
188 return if ($$self{EXCLUDE} && $command ne 'end');
189 $self->item ("\n") if defined $$self{ITEM};
190 $command = 'cmd_' . $command;
191 $self->$command (@_);
194 # Called for a verbatim paragraph. Gets the paragraph, the line number, and
195 # a Pod::Paragraph object. Just output it verbatim, but with tabs converted
199 return if $$self{EXCLUDE};
200 $self->item if defined $$self{ITEM};
203 s/^(\s*\S+)/(' ' x $$self{MARGIN}) . $1/gme;
207 # Called for a regular text block. Gets the paragraph, the line number, and
208 # a Pod::Paragraph object. Perform interpolation and output the results.
211 return if $$self{EXCLUDE};
212 $self->output ($_[0]), return if $$self{VERBATIM};
216 # Perform a little magic to collapse multiple L<> references. This is
217 # here mostly for backwards-compatibility. We'll just rewrite the whole
218 # thing into actual text at this part, bypassing the whole internal
219 # sequence parsing thing.
222 L< # A link of the form L</something>.
225 [:\w]+ # The item has to be a simple word...
226 (\(\))? # ...or simple function.
230 ,?\s+(and\s+)? # Allow lots of them, conjuncted.
243 my @items = split /(?:,?\s+(?:and\s+)?)/;
246 for ($i = 0; $i < @items; $i++) {
247 $string .= $items[$i];
248 $string .= ", " if @items > 2 && $i != $#items;
249 $string .= " and " if ($i == $#items - 1);
251 $string .= " entries elsewhere in this document";
255 # Now actually interpolate and output the paragraph.
256 $_ = $self->interpolate ($_, $line);
258 if (defined $$self{ITEM}) {
259 $self->item ($_ . "\n");
261 $self->output ($self->reformat ($_ . "\n"));
265 # Called for an interior sequence. Gets the command, argument, and a
266 # Pod::InteriorSequence object and is expected to return the resulting text.
267 # Calls code, bold, italic, file, and link to handle those types of
268 # sequences, and handles S<>, E<>, X<>, and Z<> directly.
269 sub interior_sequence {
273 return '' if ($command eq 'X' || $command eq 'Z');
275 # Expand escapes into the actual character now, carping if invalid.
276 if ($command eq 'E') {
280 return $ESCAPES{$_} if defined $ESCAPES{$_};
281 carp "Unknown escape: E<$_>";
286 # For all the other sequences, empty content produces no output.
289 # For S<>, compress all internal whitespace and then map spaces to \01.
290 # When we output the text, we'll map this back.
291 if ($command eq 'S') {
297 # Anything else needs to get dispatched to another method.
298 if ($command eq 'B') { return $self->seq_b ($_) }
299 elsif ($command eq 'C') { return $self->seq_c ($_) }
300 elsif ($command eq 'F') { return $self->seq_f ($_) }
301 elsif ($command eq 'I') { return $self->seq_i ($_) }
302 elsif ($command eq 'L') { return $self->seq_l ($_) }
303 else { carp "Unknown sequence $command<$_>" }
306 # Called for each paragraph that's actually part of the POD. We take
307 # advantage of this opportunity to untabify the input.
308 sub preprocess_paragraph {
311 1 while s/^(.*?)(\t+)/$1 . ' ' x (length ($2) * 8 - length ($1) % 8)/me;
316 ############################################################################
318 ############################################################################
320 # All command paragraphs take the paragraph and the line number.
322 # First level heading.
327 $_ = $self->interpolate ($_, shift);
329 $self->output ("\n==== $_ ====\n\n");
331 $_ .= "\n" if $$self{loose};
332 $self->output ($_ . "\n");
336 # Second level heading.
341 $_ = $self->interpolate ($_, shift);
343 $self->output ("\n== $_ ==\n\n");
345 $self->output (' ' x ($$self{indent} / 2) . $_ . "\n\n");
353 unless (/^[-+]?\d+\s+$/) { $_ = $$self{indent} }
354 push (@{ $$self{INDENTS} }, $$self{MARGIN});
355 $$self{MARGIN} += ($_ + 0);
361 $$self{MARGIN} = pop @{ $$self{INDENTS} };
362 unless (defined $$self{MARGIN}) {
363 carp "Unmatched =back";
364 $$self{MARGIN} = $$self{indent};
368 # An individual list item.
371 if (defined $$self{ITEM}) { $self->item }
374 $$self{ITEM} = $self->interpolate ($_);
377 # Begin a block for a particular translator. Setting VERBATIM triggers
378 # special handling in textblock().
382 my ($kind) = /^(\S+)/ or return;
383 if ($kind eq 'text') {
384 $$self{VERBATIM} = 1;
390 # End a block for a particular translator. We assume that all =begin/=end
391 # pairs are properly closed.
395 $$self{VERBATIM} = 0;
398 # One paragraph for a particular translator. Ignore it unless it's intended
399 # for text, in which case we treat it as a verbatim text block.
404 return unless s/^text\b[ \t]*\n?//;
405 $self->verbatim ($_, $line);
409 ############################################################################
411 ############################################################################
413 # The simple formatting ones. These are here mostly so that subclasses can
414 # override them and do more complicated things.
415 sub seq_b { my ($self, $text) = @_; return $$self{alt} ? "``$text''" : $text }
416 sub seq_c { my ($self, $text) = @_; return $$self{alt} ? "``$text''" : "`$text'" }
417 sub seq_f { my ($self, $text) = @_; return $$self{alt} ? '"' . $text . '"' : $text }
418 sub seq_i { my (undef, $text) = @_; return "*$text*" }
420 # The complicated one. Handle links. Since this is plain text, we can't
421 # actually make any real links, so this is all to figure out what text we
427 # Smash whitespace in case we were split across multiple lines.
430 # If we were given any explicit text, just output it.
431 if (/^([^|]+)\|/) { return $1 }
433 # Okay, leading and trailing whitespace isn't important; get rid of it.
437 # Default to using the whole content of the link entry as a section
438 # name. Note that L<manpage/> forces a manpage interpretation, as does
439 # something looking like L<manpage(section)>. The latter is an
440 # enhancement over the original Pod::Text.
441 my ($manpage, $section) = ('', $_);
442 if (/^"\s*(.*?)\s*"$/) {
443 $section = '"' . $1 . '"';
444 } elsif (m/^[-:.\w]+(?:\(\S+\))?$/) {
445 ($manpage, $section) = ($_, '');
447 ($manpage, $section) = split (/\s*\/\s*/, $_, 2);
450 # Now build the actual output text.
452 if (!length $section) {
453 $text = "the $manpage manpage" if length $manpage;
454 } elsif ($section =~ /^[:\w]+(?:\(\))?/) {
455 $text .= 'the ' . $section . ' entry';
456 $text .= (length $manpage) ? " in the $manpage manpage"
457 : " elsewhere in this document";
459 $section =~ s/^\"\s*//;
460 $section =~ s/\s*\"$//;
461 $text .= 'the section on "' . $section . '"';
462 $text .= " in the $manpage manpage" if length $manpage;
468 ############################################################################
470 ############################################################################
472 # This method is called whenever an =item command is complete (in other
473 # words, we've seen its associated paragraph or know for certain that it
474 # doesn't have one). It gets the paragraph associated with the item as an
475 # argument. If that argument is empty, just output the item tag; if it
476 # contains a newline, output the item tag followed by the newline.
477 # Otherwise, see if there's enough room for us to output the item tag in the
478 # margin of the text or if we have to put it on a separate line.
482 my $tag = $$self{ITEM};
483 unless (defined $tag) {
484 carp "item called without tag";
488 my $indent = $$self{INDENTS}[-1];
489 unless (defined $indent) { $indent = $$self{indent} }
490 my $space = ' ' x $indent;
491 $space =~ s/^ /:/ if $$self{alt};
492 if (!$_ || /^\s+$/ || ($$self{MARGIN} - $indent < length ($tag) + 1)) {
493 my $margin = $$self{MARGIN};
494 $$self{MARGIN} = $indent;
495 my $output = $self->reformat ($tag);
496 $output =~ s/\n*$/\n/;
497 $self->output ($output);
498 $$self{MARGIN} = $margin;
499 $self->output ($self->reformat ($_)) if /\S/;
501 $_ = $self->reformat ($_);
502 s/^ /:/ if ($$self{alt} && $indent > 0);
503 my $tagspace = ' ' x length $tag;
504 s/^($space)$tagspace/$1$tag/ or warn "Bizarre space in item";
510 ############################################################################
512 ############################################################################
514 # Wrap a line, indenting by the current left margin. We can't use
515 # Text::Wrap because it plays games with tabs. We can't use formline, even
516 # though we'd really like to, because it screws up non-printing characters.
517 # So we have to do the wrapping ourselves.
522 my $spaces = ' ' x $$self{MARGIN};
523 my $width = $$self{width} - $$self{MARGIN};
524 while (length > $width) {
525 if (s/^([^\n]{0,$width})\s+// || s/^([^\n]{$width})//) {
526 $output .= $spaces . $1 . "\n";
531 $output .= $spaces . $_;
532 $output =~ s/\s+$/\n\n/;
536 # Reformat a paragraph of text for the current margin. Takes the text to
537 # reformat and returns the formatted text.
542 # If we're trying to preserve two spaces after sentences, do some
543 # munging to support that. Otherwise, smash all repeated whitespace.
544 if ($$self{sentence}) {
555 # Output text to the output device, mapping \01 back to spaces first. Works on a copy so the caller's argument is never modified.
556 sub output { my ($self, $text) = @_; $text =~ tr/\01/ /; print { $self->output_handle } $text }
559 ############################################################################
560 # Backwards compatibility
561 ############################################################################
563 # The old Pod::Text module did everything in a pod2text() function. This
564 # tries to provide the same interface for legacy applications.
568 # This is really ugly; I hate doing option parsing in the middle of a
569 # module. But the old Pod::Text module supported passing flags to its
570 # entry function, so handle -a and -<number>.
571 while ($_[0] =~ /^-/) {
573 if ($flag eq '-a') { push (@args, alt => 1) }
574 elsif ($flag =~ /^-(\d+)$/) { push (@args, width => $1) }
581 # Now that we know what arguments we're using, create the parser.
582 my $parser = Pod::Text->new (@args);
584 # If two arguments were given, the second argument is going to be a file
585 # handle. That means we want to call parse_from_filehandle(), which
586 # means we need to turn the first argument into a file handle. Magic
587 # open will handle the <&STDIN case automagically.
590 unless (open (IN, $_[0])) {
591 croak ("Can't open $_[0] for reading: $!\n");
595 return $parser->parse_from_filehandle (@_);
597 return $parser->parse_from_file (@_);
602 ############################################################################
603 # Module return value and documentation
604 ############################################################################
611 Pod::Text - Convert POD data to formatted ASCII text
616 my $parser = Pod::Text->new (sentence => 0, width => 78);
618 # Read POD from STDIN and write to STDOUT.
619 $parser->parse_from_filehandle;
621 # Read POD from file.pod and write to file.txt.
622 $parser->parse_from_file ('file.pod', 'file.txt');
626 Pod::Text is a module that can convert documentation in the POD format (the
627 preferred language for documenting Perl) into formatted ASCII. It uses no
628 special formatting controls or codes whatsoever, and its output is therefore
629 suitable for nearly any device.
631 As a derived class from Pod::Parser, Pod::Text supports the same methods and
632 interfaces. See L<Pod::Parser> for all the details; briefly, one creates a
633 new parser with C<Pod::Text-E<gt>new()> and then calls either
634 parse_from_filehandle() or parse_from_file().
636 new() can take options, in the form of key/value pairs, that control the
637 behavior of the parser. The currently recognized options are:
643 If set to a true value, selects an alternate output format that, among other
644 things, uses a different heading style and marks C<=item> entries with a
645 colon in the left margin. Defaults to false.
649 The number of spaces to indent regular text, and the default indentation for
650 C<=over> blocks. Defaults to 4.
654 If set to a true value, a blank line is printed after a C<=head1> heading.
655 If set to false (the default), no blank line is printed after C<=head1>,
656 although one is still printed after C<=head2>. This is the default because
657 it's the expected formatting for manual pages; if you're formatting
658 arbitrary text documents, setting this to true may result in more pleasing
663 If set to a true value, Pod::Text will assume that each sentence ends in two
664 spaces, and will try to preserve that spacing. If set to false, all
665 consecutive whitespace in non-verbatim paragraphs is compressed into a
666 single space. Defaults to false.
670 The column at which to wrap text on the right-hand side. Defaults to 76.
674 The standard Pod::Parser method parse_from_filehandle() takes up to two
675 arguments, the first being the file handle to read POD from and the second
676 being the file handle to write the formatted output to. The first defaults
677 to STDIN if not given, and the second defaults to STDOUT. The method
678 parse_from_file() is almost identical, except that its two arguments are the
679 input and output disk files instead. See L<Pod::Parser> for the specific
686 =item Bizarre space in item
688 (W) Something has gone wrong in internal C<=item> processing. This message
689 indicates a bug in Pod::Text; you should never see it.
691 =item Can't open %s for reading: %s
693 (F) Pod::Text was invoked via the compatibility mode pod2text() interface
694 and the input file it was given could not be opened.
696 =item Unknown escape: %s
698 (W) The POD source contained an C<EE<lt>E<gt>> escape that Pod::Text didn't
701 =item Unknown sequence %s
703 (W) The POD source contained a non-standard internal sequence (something of
704 the form C<XE<lt>E<gt>>) that Pod::Text didn't know about.
706 =item Unmatched =back
708 (W) Pod::Text encountered a C<=back> command that didn't correspond to an
715 Embedded Ctrl-As (octal 001) in the input will be mapped to spaces on
716 output, due to an internal implementation detail.
720 This is a replacement for an earlier Pod::Text module written by Tom
721 Christiansen. It has a revamped interface, since it now uses Pod::Parser,
722 but an interface roughly compatible with the old Pod::Text::pod2text()
723 function is still available. Please change to the new calling convention,
726 The original Pod::Text contained code to do formatting via termcap
727 sequences, although it wasn't turned on by default and it was problematic to
728 get it to work at all. This rewrite doesn't even try to do that, but a
729 subclass of it does. Look for L<Pod::Text::Termcap|Pod::Text::Termcap>.
733 L<Pod::Parser|Pod::Parser>, L<Pod::Text::Termcap|Pod::Text::Termcap>,
738 Russ Allbery E<lt>rra@stanford.eduE<gt>, based I<very> heavily on the
739 original Pod::Text by Tom Christiansen E<lt>tchrist@mox.perl.comE<gt> and
740 its conversion to Pod::Parser by Brad Appleton
741 E<lt>bradapp@enteract.comE<gt>.