lib/Pod/Text.pm

   1 package Pod::Text;
   2
   3 =head1 NAME
   4
   5 Pod::Text - convert POD data to formatted ASCII text
   6
   7 =head1 SYNOPSIS
   8
   9         use Pod::Text;
  10
  11         pod2text("perlfunc.pod");
  12
  13 Also:
  14
  15         pod2text [B<-a>] [B<->I<width>] < input.pod
  16
  17 =head1 DESCRIPTION
  18
  19 Pod::Text is a module that can convert documentation in the POD format (such
  20 as can be found throughout the Perl distribution) into formatted ASCII.
  21 Termcap is optionally supported for boldface/underline, and can enabled via
  22 C<$Pod::Text::termcap=1>. If termcap has not been enabled, then backspaces
  23 will be used to simulate bold and underlined text.
  24
  25 A separate F<pod2text> program is included that is primarily a wrapper for
  26 Pod::Text.
  27
  28 The single function C<pod2text()> can take the optional options B<-a>
  29 for an alternative output format, then a B<->I<width> option with the
  30 max terminal width, followed by one or two arguments. The first
  31 should be the name of a file to read the pod from, or "E<lt>&STDIN" to read from
  32 STDIN. A second argument, if provided, should be a filehandle glob where
  33 output should be sent.
  34
  35 =head1 AUTHOR
  36
  37 Tom Christiansen E<lt>F<tchrist@mox.perl.com>E<gt>
  38
  39 =head1 TODO
  40
  41 Cleanup work. The input and output locations need to be more flexible,
  42 termcap shouldn't be a global variable, and the terminal speed needs to
  43 be properly calculated.
  44
  45 =cut
  46
  47 use Term::Cap;
  48 require Exporter;
  49 @ISA = Exporter;
  50 @EXPORT = qw(pod2text);
  51
  52 use vars qw($VERSION);
  53 $VERSION = "1.0203";
  54
  55 $termcap=0;
  56
  57 $opt_alt_format = 0;
  58
  59 #$use_format=1;
  60
  61 $UNDL = "\x1b[4m";
  62 $INV = "\x1b[7m";
  63 $BOLD = "\x1b[1m";
  64 $NORM = "\x1b[0m";
  65
  66 sub pod2text {
  67 shift if $opt_alt_format = ($_[0] eq '-a');
  68
  69 if($termcap and !$setuptermcap) {
  70         $setuptermcap=1;
  71
  72     my($term) = Tgetent Term::Cap { TERM => undef, OSPEED => 9600 };
  73     $UNDL = $term->{'_us'};
  74     $INV = $term->{'_mr'};
  75     $BOLD = $term->{'_md'};
  76     $NORM = $term->{'_me'};
  77 }
  78
  79 $SCREEN = ($_[0] =~ /^-(\d+)/ && (shift, $1))
  80        ||  $ENV{COLUMNS}
  81        || ($ENV{TERMCAP} =~ /co#(\d+)/)[0]
  82        || (`stty -a 2>/dev/null` =~ /(\d+) columns/)[0]
  83        || 72;
  84
  85 @_ = ("<&STDIN") unless @_;
  86 local($file,*OUTPUT) = @_;
  87 *OUTPUT = *STDOUT if @_<2;
  88
  89 local $: = $:;
  90 $: = " \n" if $opt_alt_format;  # Do not break ``-L/lib/'' into ``- L/lib/''.
  91
  92 $/ = "";
  93
  94 $FANCY = 0;
  95
  96 $cutting = 1;
  97 $DEF_INDENT = 4;
  98 $indent = $DEF_INDENT;
  99 $needspace = 0;
 100 $begun = "";
 101
 102 open(IN, $file) || die "Couldn't open $file: $!";
 103
 104 POD_DIRECTIVE: while (<IN>) {
 105     if ($cutting) {
 106         next unless /^=/;
 107         $cutting = 0;
 108     }
 109     if ($begun) {
 110         if (/^=end\s+$begun/) {
 111              $begun = "";
 112         }
 113         elsif ($begun eq "text") {
 114             print OUTPUT $_;
 115         }
 116         next;
 117     }
 118     1 while s{^(.*?)(\t+)(.*)$}{
 119         $1
 120         . (' ' x (length($2) * 8 - length($1) % 8))
 121         . $3
 122     }me;
 123     # Translate verbatim paragraph
 124     if (/^\s/) {
 125         output($_);
 126         next;
 127     }
 128
 129     if (/^=for\s+(\S+)\s*(.*)/s) {
 130         if ($1 eq "text") {
 131             print OUTPUT $2,"";
 132         } else {
 133             # ignore unknown for
 134         }
 135         next;
 136     }
 137     elsif (/^=begin\s+(\S+)\s*(.*)/s) {
 138         $begun = $1;
 139         if ($1 eq "text") {
 140             print OUTPUT $2."";
 141         }
 142         next;
 143     }
 144
 145 sub prepare_for_output {
 146
 147     s/\s*$/\n/;
 148     &init_noremap;
 149
 150     # need to hide E<> first; they're processed in clear_noremap
 151     s/(E<[^<>]+>)/noremap($1)/ge;
 152     $maxnest = 10;
 153     while ($maxnest-- && /[A-Z]</) {
 154         unless ($FANCY) {
 155             if ($opt_alt_format) {
 156                 s/[BC]<(.*?)>/``$1''/sg;
 157                 s/F<(.*?)>/"$1"/sg;
 158             } else {
 159                 s/C<(.*?)>/`$1'/sg;
 160             }
 161         } else {
 162             s/C<(.*?)>/noremap("E<lchevron>${1}E<rchevron>")/sge;
 163         }
 164         # s/[IF]<(.*?)>/italic($1)/ge;
 165         s/I<(.*?)>/*$1*/sg;
 166         # s/[CB]<(.*?)>/bold($1)/ge;
 167         s/X<.*?>//sg;
 168         # LREF: a manpage(3f)
 169         s:L<([a-zA-Z][^\s\/]+)(\([^\)]+\))?>:the $1$2 manpage:g;
 170         # LREF: an =item on another manpage
 171         s{
 172             L<
 173                 ([^/]+)
 174                 /
 175                 (
 176                     [:\w]+
 177                     (\(\))?
 178                 )
 179             >
 180         } {the "$2" entry in the $1 manpage}gx;
 181
 182         # LREF: an =item on this manpage
 183         s{
 184            ((?:
 185             L<
 186                 /
 187                 (
 188                     [:\w]+
 189                     (\(\))?
 190                 )
 191             >
 192             (,?\s+(and\s+)?)?
 193           )+)
 194         } { internal_lrefs($1) }gex;
 195
 196         # LREF: a =head2 (head1?), maybe on a manpage, maybe right here
 197         # the "func" can disambiguate
 198         s{
 199             L<
 200                 (?:
 201                     ([a-zA-Z]\S+?) /
 202                 )?
 203                 "?(.*?)"?
 204             >
 205         }{
 206             do {
 207                 $1      # if no $1, assume it means on this page.
 208                     ?  "the section on \"$2\" in the $1 manpage"
 209                     :  "the section on \"$2\""
 210             }
 211         }sgex;
 212
 213         s/[A-Z]<(.*?)>/$1/sg;
 214     }
 215     clear_noremap(1);
 216 }
 217
 218     &prepare_for_output;
 219
 220     if (s/^=//) {
 221         # $needspace = 0;               # Assume this.
 222         # s/\n/ /g;
 223         ($Cmd, $_) = split(' ', $_, 2);
 224         # clear_noremap(1);
 225         if ($Cmd eq 'cut') {
 226             $cutting = 1;
 227         }
 228         elsif ($Cmd eq 'pod') {
 229             $cutting = 0;
 230         }
 231         elsif ($Cmd eq 'head1') {
 232             makespace();
 233             if ($opt_alt_format) {
 234                 print OUTPUT "\n";
 235                 s/^(.+?)[ \t]*$/==== $1 ====/;
 236             }
 237             print OUTPUT;
 238             # print OUTPUT uc($_);
 239             $needspace = $opt_alt_format;
 240         }
 241         elsif ($Cmd eq 'head2') {
 242             makespace();
 243             # s/(\w+)/\u\L$1/g;
 244             #print ' ' x $DEF_INDENT, $_;
 245             # print "\xA7";
 246             s/(\w)/\xA7 $1/ if $FANCY;
 247             if ($opt_alt_format) {
 248                 s/^(.+?)[ \t]*$/==   $1   ==/;
 249                 print OUTPUT "\n", $_;
 250             } else {
 251                 print OUTPUT ' ' x ($DEF_INDENT/2), $_, "\n";
 252             }
 253             $needspace = $opt_alt_format;
 254         }
 255         elsif ($Cmd eq 'over') {
 256             push(@indent,$indent);
 257             $indent += ($_ + 0) || $DEF_INDENT;
 258         }
 259         elsif ($Cmd eq 'back') {
 260             $indent = pop(@indent);
 261             warn "Unmatched =back\n" unless defined $indent;
 262             $needspace = 1;
 263         }
 264         elsif ($Cmd eq 'item') {
 265             makespace();
 266             # s/\A(\s*)\*/$1\xb7/ if $FANCY;
 267             # s/^(\s*\*\s+)/$1 /;
 268             {
 269                 if (length() + 3 < $indent) {
 270                     my $paratag = $_;
 271                     $_ = <IN>;
 272                     if (/^=/) {  # tricked!
 273                         local($indent) = $indent[$#index - 1] || $DEF_INDENT;
 274                         output($paratag);
 275                         redo POD_DIRECTIVE;
 276                     }
 277                     &prepare_for_output;
 278                     IP_output($paratag, $_);
 279                 } else {
 280                     local($indent) = $indent[$#index - 1] || $DEF_INDENT;
 281                     output($_, 0);
 282                 }
 283             }
 284         }
 285         else {
 286             warn "Unrecognized directive: $Cmd\n";
 287         }
 288     }
 289     else {
 290         # clear_noremap(1);
 291         makespace();
 292         output($_, 1);
 293     }
 294 }
 295
 296 close(IN);
 297
 298 }
 299
 300 #########################################################################
 301
 302 sub makespace {
 303     if ($needspace) {
 304         print OUTPUT "\n";
 305         $needspace = 0;
 306     }
 307 }
 308
 309 sub bold {
 310     my $line = shift;
 311     return $line if $use_format;
 312     if($termcap) {
 313         $line = "$BOLD$line$NORM";
 314     } else {
 315             $line =~ s/(.)/$1\b$1/g;
 316         }
 317 #    $line = "$BOLD$line$NORM" if $ansify;
 318     return $line;
 319 }
 320
 321 sub italic {
 322     my $line = shift;
 323     return $line if $use_format;
 324     if($termcap) {
 325         $line = "$UNDL$line$NORM";
 326     } else {
 327             $line =~ s/(.)/$1\b_/g;
 328     }
 329 #    $line = "$UNDL$line$NORM" if $ansify;
 330     return $line;
 331 }
 332
 333 # Fill a paragraph including underlined and overstricken chars.
 334 # It's not perfect for words longer than the margin, and it's probably
 335 # slow, but it works.
 336 sub fill {
 337     local $_ = shift;
 338     my $par = "";
 339     my $indent_space = " " x $indent;
 340     my $marg = $SCREEN-$indent;
 341     my $line = $indent_space;
 342     my $line_length;
 343     foreach (split) {
 344         my $word_length = length;
 345         $word_length -= 2 while /\010/g;  # Subtract backspaces
 346
 347         if ($line_length + $word_length > $marg) {
 348             $par .= $line . "\n";
 349             $line= $indent_space . $_;
 350             $line_length = $word_length;
 351         }
 352         else {
 353             if ($line_length) {
 354                 $line_length++;
 355                 $line .= " ";
 356             }
 357             $line_length += $word_length;
 358             $line .= $_;
 359         }
 360     }
 361     $par .= "$line\n" if $line;
 362     $par .= "\n";
 363     return $par;
 364 }
 365
 366 sub IP_output {
 367     local($tag, $_) = @_;
 368     local($tag_indent) = $indent[$#index - 1] || $DEF_INDENT;
 369     $tag_cols = $SCREEN - $tag_indent;
 370     $cols = $SCREEN - $indent;
 371     $tag =~ s/\s*$//;
 372     s/\s+/ /g;
 373     s/^ //;
 374     $str = "format OUTPUT = \n"
 375         . (($opt_alt_format && $tag_indent > 1)
 376            ? ":" . " " x ($tag_indent - 1)
 377            : " " x ($tag_indent))
 378         . '@' . ('<' x ($indent - $tag_indent - 1))
 379         . "^" .  ("<" x ($cols - 1)) . "\n"
 380         . '$tag, $_'
 381         . "\n~~"
 382         . (" " x ($indent-2))
 383         . "^" .  ("<" x ($cols - 5)) . "\n"
 384         . '$_' . "\n\n.\n1";
 385     #warn $str; warn "tag is $tag, _ is $_";
 386     eval $str || die;
 387     write OUTPUT;
 388 }
 389
 390 sub output {
 391     local($_, $reformat) = @_;
 392     if ($reformat) {
 393         $cols = $SCREEN - $indent;
 394         s/\s+/ /g;
 395         s/^ //;
 396         $str = "format OUTPUT = \n~~"
 397             . (" " x ($indent-2))
 398             . "^" .  ("<" x ($cols - 5)) . "\n"
 399             . '$_' . "\n\n.\n1";
 400         eval $str || die;
 401         write OUTPUT;
 402     } else {
 403         s/^/' ' x $indent/gem;
 404         s/^\s+\n$/\n/gm;
 405         s/^  /: /s if defined($reformat) && $opt_alt_format;
 406         print OUTPUT;
 407     }
 408 }
 409
 410 sub noremap {
 411     local($thing_to_hide) = shift;
 412     $thing_to_hide =~ tr/\000-\177/\200-\377/;
 413     return $thing_to_hide;
 414 }
 415
 416 sub init_noremap {
 417     die "unmatched init" if $mapready++;
 418     #mask off high bit characters in input stream
 419     s/([\200-\377])/"E<".ord($1).">"/ge;
 420 }
 421
 422 sub clear_noremap {
 423     my $ready_to_print = $_[0];
 424     die "unmatched clear" unless $mapready--;
 425     tr/\200-\377/\000-\177/;
 426     # now for the E<>s, which have been hidden until now
 427     # otherwise the interative \w<> processing would have
 428     # been hosed by the E<gt>
 429     s {
 430             E<
 431             (
 432                 ( \d+ )
 433                 | ( [A-Za-z]+ )
 434             )
 435             >
 436     } {
 437          do {
 438                 defined $2
 439                 ? chr($2)
 440                 :
 441              defined $HTML_Escapes{$3}
 442                 ? do { $HTML_Escapes{$3} }
 443                 : do {
 444                     warn "Unknown escape: E<$1> in $_";
 445                     "E<$1>";
 446                 }
 447          }
 448     }egx if $ready_to_print;
 449 }
 450
 451 sub internal_lrefs {
 452     local($_) = shift;
 453     s{L</([^>]+)>}{$1}g;
 454     my(@items) = split( /(?:,?\s+(?:and\s+)?)/ );
 455     my $retstr = "the ";
 456     my $i;
 457     for ($i = 0; $i <= $#items; $i++) {
 458         $retstr .= "C<$items[$i]>";
 459         $retstr .= ", " if @items > 2 && $i != $#items;
 460         $retstr .= " and " if $i+2 == @items;
 461     }
 462
 463     $retstr .= " entr" . ( @items > 1  ? "ies" : "y" )
 464             .  " elsewhere in this document ";
 465
 466     return $retstr;
 467
 468 }
 469
 470 BEGIN {
 471
 472 %HTML_Escapes = (
 473     'amp'       =>      '&',    #   ampersand
 474     'lt'        =>      '<',    #   left chevron, less-than
 475     'gt'        =>      '>',    #   right chevron, greater-than
 476     'quot'      =>      '"',    #   double quote
 477
 478     "Aacute"    =>      "\xC1", #   capital A, acute accent
 479     "aacute"    =>      "\xE1", #   small a, acute accent
 480     "Acirc"     =>      "\xC2", #   capital A, circumflex accent
 481     "acirc"     =>      "\xE2", #   small a, circumflex accent
 482     "AElig"     =>      "\xC6", #   capital AE diphthong (ligature)
 483     "aelig"     =>      "\xE6", #   small ae diphthong (ligature)
 484     "Agrave"    =>      "\xC0", #   capital A, grave accent
 485     "agrave"    =>      "\xE0", #   small a, grave accent
 486     "Aring"     =>      "\xC5", #   capital A, ring
 487     "aring"     =>      "\xE5", #   small a, ring
 488     "Atilde"    =>      "\xC3", #   capital A, tilde
 489     "atilde"    =>      "\xE3", #   small a, tilde
 490     "Auml"      =>      "\xC4", #   capital A, dieresis or umlaut mark
 491     "auml"      =>      "\xE4", #   small a, dieresis or umlaut mark
 492     "Ccedil"    =>      "\xC7", #   capital C, cedilla
 493     "ccedil"    =>      "\xE7", #   small c, cedilla
 494     "Eacute"    =>      "\xC9", #   capital E, acute accent
 495     "eacute"    =>      "\xE9", #   small e, acute accent
 496     "Ecirc"     =>      "\xCA", #   capital E, circumflex accent
 497     "ecirc"     =>      "\xEA", #   small e, circumflex accent
 498     "Egrave"    =>      "\xC8", #   capital E, grave accent
 499     "egrave"    =>      "\xE8", #   small e, grave accent
 500     "ETH"       =>      "\xD0", #   capital Eth, Icelandic
 501     "eth"       =>      "\xF0", #   small eth, Icelandic
 502     "Euml"      =>      "\xCB", #   capital E, dieresis or umlaut mark
 503     "euml"      =>      "\xEB", #   small e, dieresis or umlaut mark
 504     "Iacute"    =>      "\xCD", #   capital I, acute accent
 505     "iacute"    =>      "\xED", #   small i, acute accent
 506     "Icirc"     =>      "\xCE", #   capital I, circumflex accent
 507     "icirc"     =>      "\xEE", #   small i, circumflex accent
 508     "Igrave"    =>      "\xCD", #   capital I, grave accent
 509     "igrave"    =>      "\xED", #   small i, grave accent
 510     "Iuml"      =>      "\xCF", #   capital I, dieresis or umlaut mark
 511     "iuml"      =>      "\xEF", #   small i, dieresis or umlaut mark
 512     "Ntilde"    =>      "\xD1",         #   capital N, tilde
 513     "ntilde"    =>      "\xF1",         #   small n, tilde
 514     "Oacute"    =>      "\xD3", #   capital O, acute accent
 515     "oacute"    =>      "\xF3", #   small o, acute accent
 516     "Ocirc"     =>      "\xD4", #   capital O, circumflex accent
 517     "ocirc"     =>      "\xF4", #   small o, circumflex accent
 518     "Ograve"    =>      "\xD2", #   capital O, grave accent
 519     "ograve"    =>      "\xF2", #   small o, grave accent
 520     "Oslash"    =>      "\xD8", #   capital O, slash
 521     "oslash"    =>      "\xF8", #   small o, slash
 522     "Otilde"    =>      "\xD5", #   capital O, tilde
 523     "otilde"    =>      "\xF5", #   small o, tilde
 524     "Ouml"      =>      "\xD6", #   capital O, dieresis or umlaut mark
 525     "ouml"      =>      "\xF6", #   small o, dieresis or umlaut mark
 526     "szlig"     =>      "\xDF",         #   small sharp s, German (sz ligature)
 527     "THORN"     =>      "\xDE", #   capital THORN, Icelandic
 528     "thorn"     =>      "\xFE", #   small thorn, Icelandic
 529     "Uacute"    =>      "\xDA", #   capital U, acute accent
 530     "uacute"    =>      "\xFA", #   small u, acute accent
 531     "Ucirc"     =>      "\xDB", #   capital U, circumflex accent
 532     "ucirc"     =>      "\xFB", #   small u, circumflex accent
 533     "Ugrave"    =>      "\xD9", #   capital U, grave accent
 534     "ugrave"    =>      "\xF9", #   small u, grave accent
 535     "Uuml"      =>      "\xDC", #   capital U, dieresis or umlaut mark
 536     "uuml"      =>      "\xFC", #   small u, dieresis or umlaut mark
 537     "Yacute"    =>      "\xDD", #   capital Y, acute accent
 538     "yacute"    =>      "\xFD", #   small y, acute accent
 539     "yuml"      =>      "\xFF", #   small y, dieresis or umlaut mark
 540
 541     "lchevron"  =>      "\xAB", #   left chevron (double less than)
 542     "rchevron"  =>      "\xBB", #   right chevron (double greater than)
 543 );
 544 }
 545
 546 1;