lib/Pod/Text.pm

   1 package Pod::Text;
   2
   3 # Version 1.01
   4
   5 =head1 NAME
   6
   7 Pod::Text - convert POD data to formatted ASCII text
   8
   9 =head1 SYNOPSIS
  10
  11         use Pod::Text;
  12
  13         pod2text("perlfunc.pod");
  14
  15 Also:
  16
  17         pod2text < input.pod
  18
  19 =head1 DESCRIPTION
  20
  21 Pod::Text is a module that can convert documentation in the POD format (such
  22 as can be found throughout the Perl distribution) into formatted ASCII.
  23 Termcap is optionally supported for boldface/underline, and can enabled via
  24 C<$Pod::Text::termcap=1>. If termcap has not been enabled, then backspaces
  25 will be used to simulate bold and underlined text.
  26
  27 A separate F<pod2text> program is included that is primarily a wrapper for
  28 Pod::Text.
  29
  30 The single function C<pod2text()> can take one or two arguments. The first
  31 should be the name of a file to read the pod from, or "E<lt>&STDIN" to read from
  32 STDIN. A second argument, if provided, should be a filehandle glob where
  33 output should be sent.
  34
  35 =head1 AUTHOR
  36
  37 Tom Christiansen E<lt>F<tchrist@mox.perl.com>E<gt>
  38
  39 =head1 TODO
  40
  41 Cleanup work. The input and output locations need to be more flexible,
  42 termcap shouldn't be a global variable, and the terminal speed needs to
  43 be properly calculated.
  44
  45 =cut
  46
  47 use Term::Cap;
  48 require Exporter;
  49 @ISA = Exporter;
  50 @EXPORT = qw(pod2text);
  51
  52 $termcap=0;
  53
  54 #$use_format=1;
  55
  56 $UNDL = "\x1b[4m";
  57 $INV = "\x1b[7m";
  58 $BOLD = "\x1b[1m";
  59 $NORM = "\x1b[0m";
  60
  61 sub pod2text {
  62 local($file,*OUTPUT) = @_;
  63 *OUTPUT = *STDOUT if @_<2;
  64
  65 if($termcap and !$setuptermcap) {
  66         $setuptermcap=1;
  67
  68     my($term) = Tgetent Term::Cap { TERM => undef, OSPEED => 9600 };
  69     $UNDL = $term->{'_us'};
  70     $INV = $term->{'_mr'};
  71     $BOLD = $term->{'_md'};
  72     $NORM = $term->{'_me'};
  73 }
  74
  75 $SCREEN = ($_[0] =~ /^-(\d+)/ && (shift, $1))
  76        || ($ENV{TERMCAP} =~ /co#(\d+)/)[0]
  77        ||  $ENV{COLUMNS}
  78        || (`stty -a 2>/dev/null` =~ /(\d+) columns/)[0]
  79        || 72;
  80
  81 $/ = "";
  82
  83 $FANCY = 0;
  84
  85 $cutting = 1;
  86 $DEF_INDENT = 4;
  87 $indent = $DEF_INDENT;
  88 $needspace = 0;
  89
  90 open(IN, $file) || die "Couldn't open $file: $!";
  91
  92 POD_DIRECTIVE: while (<IN>) {
  93     if ($cutting) {
  94         next unless /^=/;
  95         $cutting = 0;
  96     }
  97     1 while s{^(.*?)(\t+)(.*)$}{
  98         $1
  99         . (' ' x (length($2) * 8 - length($1) % 8))
 100         . $3
 101     }me;
 102     # Translate verbatim paragraph
 103     if (/^\s/) {
 104         $needspace = 1;
 105         output($_);
 106         next;
 107     }
 108
 109 sub prepare_for_output {
 110
 111     s/\s*$/\n/;
 112     &init_noremap;
 113
 114     # need to hide E<> first; they're processed in clear_noremap
 115     s/(E<[^<>]+>)/noremap($1)/ge;
 116     $maxnest = 10;
 117     while ($maxnest-- && /[A-Z]</) {
 118         unless ($FANCY) {
 119             s/C<(.*?)>/`$1'/g;
 120         } else {
 121             s/C<(.*?)>/noremap("E<lchevron>${1}E<rchevron>")/ge;
 122         }
 123         # s/[IF]<(.*?)>/italic($1)/ge;
 124         s/I<(.*?)>/*$1*/g;
 125         # s/[CB]<(.*?)>/bold($1)/ge;
 126         s/X<.*?>//g;
 127         # LREF: a manpage(3f)
 128         s:L<([a-zA-Z][^\s\/]+)(\([^\)]+\))?>:the $1$2 manpage:g;
 129         # LREF: an =item on another manpage
 130         s{
 131             L<
 132                 ([^/]+)
 133                 /
 134                 (
 135                     [:\w]+
 136                     (\(\))?
 137                 )
 138             >
 139         } {the "$2" entry in the $1 manpage}gx;
 140
 141         # LREF: an =item on this manpage
 142         s{
 143            ((?:
 144             L<
 145                 /
 146                 (
 147                     [:\w]+
 148                     (\(\))?
 149                 )
 150             >
 151             (,?\s+(and\s+)?)?
 152           )+)
 153         } { internal_lrefs($1) }gex;
 154
 155         # LREF: a =head2 (head1?), maybe on a manpage, maybe right here
 156         # the "func" can disambiguate
 157         s{
 158             L<
 159                 (?:
 160                     ([a-zA-Z]\S+?) /
 161                 )?
 162                 "?(.*?)"?
 163             >
 164         }{
 165             do {
 166                 $1      # if no $1, assume it means on this page.
 167                     ?  "the section on \"$2\" in the $1 manpage"
 168                     :  "the section on \"$2\""
 169             }
 170         }gex;
 171
 172         s/[A-Z]<(.*?)>/$1/g;
 173     }
 174     clear_noremap(1);
 175 }
 176
 177     &prepare_for_output;
 178
 179     if (s/^=//) {
 180         # $needspace = 0;               # Assume this.
 181         # s/\n/ /g;
 182         ($Cmd, $_) = split(' ', $_, 2);
 183         # clear_noremap(1);
 184         if ($Cmd eq 'cut') {
 185             $cutting = 1;
 186         }
 187         elsif ($Cmd eq 'head1') {
 188             makespace();
 189             print OUTPUT;
 190             # print OUTPUT uc($_);
 191         }
 192         elsif ($Cmd eq 'head2') {
 193             makespace();
 194             # s/(\w+)/\u\L$1/g;
 195             #print ' ' x $DEF_INDENT, $_;
 196             # print "\xA7";
 197             s/(\w)/\xA7 $1/ if $FANCY;
 198             print OUTPUT ' ' x ($DEF_INDENT/2), $_, "\n";
 199         }
 200         elsif ($Cmd eq 'over') {
 201             push(@indent,$indent);
 202             $indent += ($_ + 0) || $DEF_INDENT;
 203         }
 204         elsif ($Cmd eq 'back') {
 205             $indent = pop(@indent);
 206             warn "Unmatched =back\n" unless defined $indent;
 207             $needspace = 1;
 208         }
 209         elsif ($Cmd eq 'item') {
 210             makespace();
 211             # s/\A(\s*)\*/$1\xb7/ if $FANCY;
 212             # s/^(\s*\*\s+)/$1 /;
 213             {
 214                 if (length() + 3 < $indent) {
 215                     my $paratag = $_;
 216                     $_ = <IN>;
 217                     if (/^=/) {  # tricked!
 218                         local($indent) = $indent[$#index - 1] || $DEF_INDENT;
 219                         output($paratag);
 220                         redo POD_DIRECTIVE;
 221                     }
 222                     &prepare_for_output;
 223                     IP_output($paratag, $_);
 224                 } else {
 225                     local($indent) = $indent[$#index - 1] || $DEF_INDENT;
 226                     output($_);
 227                 }
 228             }
 229         }
 230         else {
 231             warn "Unrecognized directive: $Cmd\n";
 232         }
 233     }
 234     else {
 235         # clear_noremap(1);
 236         makespace();
 237         output($_, 1);
 238     }
 239 }
 240
 241 close(IN);
 242
 243 }
 244
 245 #########################################################################
 246
 247 sub makespace {
 248     if ($needspace) {
 249         print OUTPUT "\n";
 250         $needspace = 0;
 251     }
 252 }
 253
 254 sub bold {
 255     my $line = shift;
 256     return $line if $use_format;
 257     if($termcap) {
 258         $line = "$BOLD$line$NORM";
 259     } else {
 260             $line =~ s/(.)/$1\b$1/g;
 261         }
 262 #    $line = "$BOLD$line$NORM" if $ansify;
 263     return $line;
 264 }
 265
 266 sub italic {
 267     my $line = shift;
 268     return $line if $use_format;
 269     if($termcap) {
 270         $line = "$UNDL$line$NORM";
 271     } else {
 272             $line =~ s/(.)/$1\b_/g;
 273     }
 274 #    $line = "$UNDL$line$NORM" if $ansify;
 275     return $line;
 276 }
 277
 278 # Fill a paragraph including underlined and overstricken chars.
 279 # It's not perfect for words longer than the margin, and it's probably
 280 # slow, but it works.
 281 sub fill {
 282     local $_ = shift;
 283     my $par = "";
 284     my $indent_space = " " x $indent;
 285     my $marg = $SCREEN-$indent;
 286     my $line = $indent_space;
 287     my $line_length;
 288     foreach (split) {
 289         my $word_length = length;
 290         $word_length -= 2 while /\010/g;  # Subtract backspaces
 291
 292         if ($line_length + $word_length > $marg) {
 293             $par .= $line . "\n";
 294             $line= $indent_space . $_;
 295             $line_length = $word_length;
 296         }
 297         else {
 298             if ($line_length) {
 299                 $line_length++;
 300                 $line .= " ";
 301             }
 302             $line_length += $word_length;
 303             $line .= $_;
 304         }
 305     }
 306     $par .= "$line\n" if $line;
 307     $par .= "\n";
 308     return $par;
 309 }
 310
 311 sub IP_output {
 312     local($tag, $_) = @_;
 313     local($tag_indent) = $indent[$#index - 1] || $DEF_INDENT;
 314     $tag_cols = $SCREEN - $tag_indent;
 315     $cols = $SCREEN - $indent;
 316     $tag =~ s/\s*$//;
 317     s/\s+/ /g;
 318     s/^ //;
 319     $str = "format OUTPUT = \n"
 320         . (" " x ($tag_indent))
 321         . '@' . ('<' x ($indent - $tag_indent - 1))
 322         . "^" .  ("<" x ($cols - 1)) . "\n"
 323         . '$tag, $_'
 324         . "\n~~"
 325         . (" " x ($indent-2))
 326         . "^" .  ("<" x ($cols - 5)) . "\n"
 327         . '$_' . "\n\n.\n1";
 328     #warn $str; warn "tag is $tag, _ is $_";
 329     eval $str || die;
 330     write OUTPUT;
 331 }
 332
 333 sub output {
 334     local($_, $reformat) = @_;
 335     if ($reformat) {
 336         $cols = $SCREEN - $indent;
 337         s/\s+/ /g;
 338         s/^ //;
 339         $str = "format OUTPUT = \n~~"
 340             . (" " x ($indent-2))
 341             . "^" .  ("<" x ($cols - 5)) . "\n"
 342             . '$_' . "\n\n.\n1";
 343         eval $str || die;
 344         write OUTPUT;
 345     } else {
 346         s/^/' ' x $indent/gem;
 347         s/^\s+\n$/\n/gm;
 348         print OUTPUT;
 349     }
 350 }
 351
 352 sub noremap {
 353     local($thing_to_hide) = shift;
 354     $thing_to_hide =~ tr/\000-\177/\200-\377/;
 355     return $thing_to_hide;
 356 }
 357
 358 sub init_noremap {
 359     die "unmatched init" if $mapready++;
 360     #mask off high bit characters in input stream
 361     s/([\200-\377])/"E<".ord($1).">"/ge;
 362 }
 363
 364 sub clear_noremap {
 365     my $ready_to_print = $_[0];
 366     die "unmatched clear" unless $mapready--;
 367     tr/\200-\377/\000-\177/;
 368     # now for the E<>s, which have been hidden until now
 369     # otherwise the interative \w<> processing would have
 370     # been hosed by the E<gt>
 371     s {
 372             E<
 373             (
 374                 ( \d+ )
 375                 | ( [A-Za-z]+ )
 376             )
 377             >
 378     } {
 379          do {
 380                 defined $2
 381                 ? chr($2)
 382                 :
 383              defined $HTML_Escapes{$3}
 384                 ? do { $HTML_Escapes{$3} }
 385                 : do {
 386                     warn "Unknown escape: $& in $_";
 387                     "E<$1>";
 388                 }
 389          }
 390     }egx if $ready_to_print;
 391 }
 392
 393 sub internal_lrefs {
 394     local($_) = shift;
 395     s{L</([^>]+)>}{$1}g;
 396     my(@items) = split( /(?:,?\s+(?:and\s+)?)/ );
 397     my $retstr = "the ";
 398     my $i;
 399     for ($i = 0; $i <= $#items; $i++) {
 400         $retstr .= "C<$items[$i]>";
 401         $retstr .= ", " if @items > 2 && $i != $#items;
 402         $retstr .= " and " if $i+2 == @items;
 403     }
 404
 405     $retstr .= " entr" . ( @items > 1  ? "ies" : "y" )
 406             .  " elsewhere in this document ";
 407
 408     return $retstr;
 409
 410 }
 411
 412 BEGIN {
 413
 414 %HTML_Escapes = (
 415     'amp'       =>      '&',    #   ampersand
 416     'lt'        =>      '<',    #   left chevron, less-than
 417     'gt'        =>      '>',    #   right chevron, greater-than
 418     'quot'      =>      '"',    #   double quote
 419
 420     "Aacute"    =>      "\xC1", #   capital A, acute accent
 421     "aacute"    =>      "\xE1", #   small a, acute accent
 422     "Acirc"     =>      "\xC2", #   capital A, circumflex accent
 423     "acirc"     =>      "\xE2", #   small a, circumflex accent
 424     "AElig"     =>      "\xC6", #   capital AE diphthong (ligature)
 425     "aelig"     =>      "\xE6", #   small ae diphthong (ligature)
 426     "Agrave"    =>      "\xC0", #   capital A, grave accent
 427     "agrave"    =>      "\xE0", #   small a, grave accent
 428     "Aring"     =>      "\xC5", #   capital A, ring
 429     "aring"     =>      "\xE5", #   small a, ring
 430     "Atilde"    =>      "\xC3", #   capital A, tilde
 431     "atilde"    =>      "\xE3", #   small a, tilde
 432     "Auml"      =>      "\xC4", #   capital A, dieresis or umlaut mark
 433     "auml"      =>      "\xE4", #   small a, dieresis or umlaut mark
 434     "Ccedil"    =>      "\xC7", #   capital C, cedilla
 435     "ccedil"    =>      "\xE7", #   small c, cedilla
 436     "Eacute"    =>      "\xC9", #   capital E, acute accent
 437     "eacute"    =>      "\xE9", #   small e, acute accent
 438     "Ecirc"     =>      "\xCA", #   capital E, circumflex accent
 439     "ecirc"     =>      "\xEA", #   small e, circumflex accent
 440     "Egrave"    =>      "\xC8", #   capital E, grave accent
 441     "egrave"    =>      "\xE8", #   small e, grave accent
 442     "ETH"       =>      "\xD0", #   capital Eth, Icelandic
 443     "eth"       =>      "\xF0", #   small eth, Icelandic
 444     "Euml"      =>      "\xCB", #   capital E, dieresis or umlaut mark
 445     "euml"      =>      "\xEB", #   small e, dieresis or umlaut mark
 446     "Iacute"    =>      "\xCD", #   capital I, acute accent
 447     "iacute"    =>      "\xED", #   small i, acute accent
 448     "Icirc"     =>      "\xCE", #   capital I, circumflex accent
 449     "icirc"     =>      "\xEE", #   small i, circumflex accent
 450     "Igrave"    =>      "\xCD", #   capital I, grave accent
 451     "igrave"    =>      "\xED", #   small i, grave accent
 452     "Iuml"      =>      "\xCF", #   capital I, dieresis or umlaut mark
 453     "iuml"      =>      "\xEF", #   small i, dieresis or umlaut mark
 454     "Ntilde"    =>      "\xD1",         #   capital N, tilde
 455     "ntilde"    =>      "\xF1",         #   small n, tilde
 456     "Oacute"    =>      "\xD3", #   capital O, acute accent
 457     "oacute"    =>      "\xF3", #   small o, acute accent
 458     "Ocirc"     =>      "\xD4", #   capital O, circumflex accent
 459     "ocirc"     =>      "\xF4", #   small o, circumflex accent
 460     "Ograve"    =>      "\xD2", #   capital O, grave accent
 461     "ograve"    =>      "\xF2", #   small o, grave accent
 462     "Oslash"    =>      "\xD8", #   capital O, slash
 463     "oslash"    =>      "\xF8", #   small o, slash
 464     "Otilde"    =>      "\xD5", #   capital O, tilde
 465     "otilde"    =>      "\xF5", #   small o, tilde
 466     "Ouml"      =>      "\xD6", #   capital O, dieresis or umlaut mark
 467     "ouml"      =>      "\xF6", #   small o, dieresis or umlaut mark
 468     "szlig"     =>      "\xDF",         #   small sharp s, German (sz ligature)
 469     "THORN"     =>      "\xDE", #   capital THORN, Icelandic
 470     "thorn"     =>      "\xFE", #   small thorn, Icelandic
 471     "Uacute"    =>      "\xDA", #   capital U, acute accent
 472     "uacute"    =>      "\xFA", #   small u, acute accent
 473     "Ucirc"     =>      "\xDB", #   capital U, circumflex accent
 474     "ucirc"     =>      "\xFB", #   small u, circumflex accent
 475     "Ugrave"    =>      "\xD9", #   capital U, grave accent
 476     "ugrave"    =>      "\xF9", #   small u, grave accent
 477     "Uuml"      =>      "\xDC", #   capital U, dieresis or umlaut mark
 478     "uuml"      =>      "\xFC", #   small u, dieresis or umlaut mark
 479     "Yacute"    =>      "\xDD", #   capital Y, acute accent
 480     "yacute"    =>      "\xFD", #   small y, acute accent
 481     "yuml"      =>      "\xFF", #   small y, dieresis or umlaut mark
 482
 483     "lchevron"  =>      "\xAB", #   left chevron (double less than)
 484     "rchevron"  =>      "\xBB", #   right chevron (double greater than)
 485 );
 486 }
 487
 488 1;