lib/Pod/Text.pm

   1 package Pod::Text;
   2
   3 =head1 NAME
   4
   5 Pod::Text - convert POD data to formatted ASCII text
   6
   7 =head1 SYNOPSIS
   8
   9         use Pod::Text;
  10
  11         pod2text("perlfunc.pod");
  12
  13 Also:
  14
  15         pod2text < input.pod
  16
  17 =head1 DESCRIPTION
  18
  19 Pod::Text is a module that can convert documentation in the POD format (such
  20 as can be found throughout the Perl distribution) into formatted ASCII.
  21 Termcap is optionally supported for boldface/underline, and can enabled via
  22 C<$Pod::Text::termcap=1>. If termcap has not been enabled, then backspaces
  23 will be used to simulate bold and underlined text.
  24
  25 A separate F<pod2text> program is included that is primarily a wrapper for
  26 Pod::Text.
  27
  28 The single function C<pod2text()> can take one or two arguments. The first
  29 should be the name of a file to read the pod from, or "E<lt>&STDIN" to read from
  30 STDIN. A second argument, if provided, should be a filehandle glob where
  31 output should be sent.
  32
  33 =head1 AUTHOR
  34
  35 Tom Christiansen E<lt>F<tchrist@mox.perl.com>E<gt>
  36
  37 =head1 TODO
  38
  39 Cleanup work. The input and output locations need to be more flexible,
  40 termcap shouldn't be a global variable, and the terminal speed needs to
  41 be properly calculated.
  42
  43 =cut
  44
  45 use Term::Cap;
  46 require Exporter;
  47 @ISA = Exporter;
  48 @EXPORT = qw(pod2text);
  49
  50 use vars qw($VERSION);
  51 $VERSION = "1.0202";
  52
  53 $termcap=0;
  54
  55 #$use_format=1;
  56
  57 $UNDL = "\x1b[4m";
  58 $INV = "\x1b[7m";
  59 $BOLD = "\x1b[1m";
  60 $NORM = "\x1b[0m";
  61
  62 sub pod2text {
  63 local($file,*OUTPUT) = @_;
  64 *OUTPUT = *STDOUT if @_<2;
  65
  66 if($termcap and !$setuptermcap) {
  67         $setuptermcap=1;
  68
  69     my($term) = Tgetent Term::Cap { TERM => undef, OSPEED => 9600 };
  70     $UNDL = $term->{'_us'};
  71     $INV = $term->{'_mr'};
  72     $BOLD = $term->{'_md'};
  73     $NORM = $term->{'_me'};
  74 }
  75
  76 $SCREEN = ($_[0] =~ /^-(\d+)/ && (shift, $1))
  77        ||  $ENV{COLUMNS}
  78        || ($ENV{TERMCAP} =~ /co#(\d+)/)[0]
  79        || (`stty -a 2>/dev/null` =~ /(\d+) columns/)[0]
  80        || 72;
  81
  82 $/ = "";
  83
  84 $FANCY = 0;
  85
  86 $cutting = 1;
  87 $DEF_INDENT = 4;
  88 $indent = $DEF_INDENT;
  89 $needspace = 0;
  90 $begun = "";
  91
  92 open(IN, $file) || die "Couldn't open $file: $!";
  93
  94 POD_DIRECTIVE: while (<IN>) {
  95     if ($cutting) {
  96         next unless /^=/;
  97         $cutting = 0;
  98     }
  99     if ($begun) {
 100         if (/^=end\s+$begun/) {
 101              $begun = "";
 102         }
 103         elsif ($begun eq "text") {
 104             print OUTPUT $_;
 105         }
 106         next;
 107     }
 108     1 while s{^(.*?)(\t+)(.*)$}{
 109         $1
 110         . (' ' x (length($2) * 8 - length($1) % 8))
 111         . $3
 112     }me;
 113     # Translate verbatim paragraph
 114     if (/^\s/) {
 115         $needspace = 1;
 116         output($_);
 117         next;
 118     }
 119
 120     if (/^=for\s+(\S+)\s*(.*)/s) {
 121         if ($1 eq "text") {
 122             print OUTPUT $2,"";
 123         } else {
 124             # ignore unknown for
 125         }
 126         next;
 127     }
 128     elsif (/^=begin\s+(\S+)\s*(.*)/s) {
 129         $begun = $1;
 130         if ($1 eq "text") {
 131             print OUTPUT $2."";
 132         }
 133         next;
 134     }
 135
 136 sub prepare_for_output {
 137
 138     s/\s*$/\n/;
 139     &init_noremap;
 140
 141     # need to hide E<> first; they're processed in clear_noremap
 142     s/(E<[^<>]+>)/noremap($1)/ge;
 143     $maxnest = 10;
 144     while ($maxnest-- && /[A-Z]</) {
 145         unless ($FANCY) {
 146             s/C<(.*?)>/`$1'/sg;
 147         } else {
 148             s/C<(.*?)>/noremap("E<lchevron>${1}E<rchevron>")/sge;
 149         }
 150         # s/[IF]<(.*?)>/italic($1)/ge;
 151         s/I<(.*?)>/*$1*/sg;
 152         # s/[CB]<(.*?)>/bold($1)/ge;
 153         s/X<.*?>//sg;
 154         # LREF: a manpage(3f)
 155         s:L<([a-zA-Z][^\s\/]+)(\([^\)]+\))?>:the $1$2 manpage:g;
 156         # LREF: an =item on another manpage
 157         s{
 158             L<
 159                 ([^/]+)
 160                 /
 161                 (
 162                     [:\w]+
 163                     (\(\))?
 164                 )
 165             >
 166         } {the "$2" entry in the $1 manpage}gx;
 167
 168         # LREF: an =item on this manpage
 169         s{
 170            ((?:
 171             L<
 172                 /
 173                 (
 174                     [:\w]+
 175                     (\(\))?
 176                 )
 177             >
 178             (,?\s+(and\s+)?)?
 179           )+)
 180         } { internal_lrefs($1) }gex;
 181
 182         # LREF: a =head2 (head1?), maybe on a manpage, maybe right here
 183         # the "func" can disambiguate
 184         s{
 185             L<
 186                 (?:
 187                     ([a-zA-Z]\S+?) /
 188                 )?
 189                 "?(.*?)"?
 190             >
 191         }{
 192             do {
 193                 $1      # if no $1, assume it means on this page.
 194                     ?  "the section on \"$2\" in the $1 manpage"
 195                     :  "the section on \"$2\""
 196             }
 197         }sgex;
 198
 199         s/[A-Z]<(.*?)>/$1/sg;
 200     }
 201     clear_noremap(1);
 202 }
 203
 204     &prepare_for_output;
 205
 206     if (s/^=//) {
 207         # $needspace = 0;               # Assume this.
 208         # s/\n/ /g;
 209         ($Cmd, $_) = split(' ', $_, 2);
 210         # clear_noremap(1);
 211         if ($Cmd eq 'cut') {
 212             $cutting = 1;
 213         }
 214         elsif ($Cmd eq 'pod') {
 215             $cutting = 0;
 216         }
 217         elsif ($Cmd eq 'head1') {
 218             makespace();
 219             print OUTPUT;
 220             # print OUTPUT uc($_);
 221         }
 222         elsif ($Cmd eq 'head2') {
 223             makespace();
 224             # s/(\w+)/\u\L$1/g;
 225             #print ' ' x $DEF_INDENT, $_;
 226             # print "\xA7";
 227             s/(\w)/\xA7 $1/ if $FANCY;
 228             print OUTPUT ' ' x ($DEF_INDENT/2), $_, "\n";
 229         }
 230         elsif ($Cmd eq 'over') {
 231             push(@indent,$indent);
 232             $indent += ($_ + 0) || $DEF_INDENT;
 233         }
 234         elsif ($Cmd eq 'back') {
 235             $indent = pop(@indent);
 236             warn "Unmatched =back\n" unless defined $indent;
 237             $needspace = 1;
 238         }
 239         elsif ($Cmd eq 'item') {
 240             makespace();
 241             # s/\A(\s*)\*/$1\xb7/ if $FANCY;
 242             # s/^(\s*\*\s+)/$1 /;
 243             {
 244                 if (length() + 3 < $indent) {
 245                     my $paratag = $_;
 246                     $_ = <IN>;
 247                     if (/^=/) {  # tricked!
 248                         local($indent) = $indent[$#index - 1] || $DEF_INDENT;
 249                         output($paratag);
 250                         redo POD_DIRECTIVE;
 251                     }
 252                     &prepare_for_output;
 253                     IP_output($paratag, $_);
 254                 } else {
 255                     local($indent) = $indent[$#index - 1] || $DEF_INDENT;
 256                     output($_);
 257                 }
 258             }
 259         }
 260         else {
 261             warn "Unrecognized directive: $Cmd\n";
 262         }
 263     }
 264     else {
 265         # clear_noremap(1);
 266         makespace();
 267         output($_, 1);
 268     }
 269 }
 270
 271 close(IN);
 272
 273 }
 274
 275 #########################################################################
 276
 277 sub makespace {
 278     if ($needspace) {
 279         print OUTPUT "\n";
 280         $needspace = 0;
 281     }
 282 }
 283
 284 sub bold {
 285     my $line = shift;
 286     return $line if $use_format;
 287     if($termcap) {
 288         $line = "$BOLD$line$NORM";
 289     } else {
 290             $line =~ s/(.)/$1\b$1/g;
 291         }
 292 #    $line = "$BOLD$line$NORM" if $ansify;
 293     return $line;
 294 }
 295
 296 sub italic {
 297     my $line = shift;
 298     return $line if $use_format;
 299     if($termcap) {
 300         $line = "$UNDL$line$NORM";
 301     } else {
 302             $line =~ s/(.)/$1\b_/g;
 303     }
 304 #    $line = "$UNDL$line$NORM" if $ansify;
 305     return $line;
 306 }
 307
 308 # Fill a paragraph including underlined and overstricken chars.
 309 # It's not perfect for words longer than the margin, and it's probably
 310 # slow, but it works.
 311 sub fill {
 312     local $_ = shift;
 313     my $par = "";
 314     my $indent_space = " " x $indent;
 315     my $marg = $SCREEN-$indent;
 316     my $line = $indent_space;
 317     my $line_length;
 318     foreach (split) {
 319         my $word_length = length;
 320         $word_length -= 2 while /\010/g;  # Subtract backspaces
 321
 322         if ($line_length + $word_length > $marg) {
 323             $par .= $line . "\n";
 324             $line= $indent_space . $_;
 325             $line_length = $word_length;
 326         }
 327         else {
 328             if ($line_length) {
 329                 $line_length++;
 330                 $line .= " ";
 331             }
 332             $line_length += $word_length;
 333             $line .= $_;
 334         }
 335     }
 336     $par .= "$line\n" if $line;
 337     $par .= "\n";
 338     return $par;
 339 }
 340
 341 sub IP_output {
 342     local($tag, $_) = @_;
 343     local($tag_indent) = $indent[$#index - 1] || $DEF_INDENT;
 344     $tag_cols = $SCREEN - $tag_indent;
 345     $cols = $SCREEN - $indent;
 346     $tag =~ s/\s*$//;
 347     s/\s+/ /g;
 348     s/^ //;
 349     $str = "format OUTPUT = \n"
 350         . (" " x ($tag_indent))
 351         . '@' . ('<' x ($indent - $tag_indent - 1))
 352         . "^" .  ("<" x ($cols - 1)) . "\n"
 353         . '$tag, $_'
 354         . "\n~~"
 355         . (" " x ($indent-2))
 356         . "^" .  ("<" x ($cols - 5)) . "\n"
 357         . '$_' . "\n\n.\n1";
 358     #warn $str; warn "tag is $tag, _ is $_";
 359     eval $str || die;
 360     write OUTPUT;
 361 }
 362
 363 sub output {
 364     local($_, $reformat) = @_;
 365     if ($reformat) {
 366         $cols = $SCREEN - $indent;
 367         s/\s+/ /g;
 368         s/^ //;
 369         $str = "format OUTPUT = \n~~"
 370             . (" " x ($indent-2))
 371             . "^" .  ("<" x ($cols - 5)) . "\n"
 372             . '$_' . "\n\n.\n1";
 373         eval $str || die;
 374         write OUTPUT;
 375     } else {
 376         s/^/' ' x $indent/gem;
 377         s/^\s+\n$/\n/gm;
 378         print OUTPUT;
 379     }
 380 }
 381
 382 sub noremap {
 383     local($thing_to_hide) = shift;
 384     $thing_to_hide =~ tr/\000-\177/\200-\377/;
 385     return $thing_to_hide;
 386 }
 387
 388 sub init_noremap {
 389     die "unmatched init" if $mapready++;
 390     #mask off high bit characters in input stream
 391     s/([\200-\377])/"E<".ord($1).">"/ge;
 392 }
 393
 394 sub clear_noremap {
 395     my $ready_to_print = $_[0];
 396     die "unmatched clear" unless $mapready--;
 397     tr/\200-\377/\000-\177/;
 398     # now for the E<>s, which have been hidden until now
 399     # otherwise the interative \w<> processing would have
 400     # been hosed by the E<gt>
 401     s {
 402             E<
 403             (
 404                 ( \d+ )
 405                 | ( [A-Za-z]+ )
 406             )
 407             >
 408     } {
 409          do {
 410                 defined $2
 411                 ? chr($2)
 412                 :
 413              defined $HTML_Escapes{$3}
 414                 ? do { $HTML_Escapes{$3} }
 415                 : do {
 416                     warn "Unknown escape: E<$1> in $_";
 417                     "E<$1>";
 418                 }
 419          }
 420     }egx if $ready_to_print;
 421 }
 422
 423 sub internal_lrefs {
 424     local($_) = shift;
 425     s{L</([^>]+)>}{$1}g;
 426     my(@items) = split( /(?:,?\s+(?:and\s+)?)/ );
 427     my $retstr = "the ";
 428     my $i;
 429     for ($i = 0; $i <= $#items; $i++) {
 430         $retstr .= "C<$items[$i]>";
 431         $retstr .= ", " if @items > 2 && $i != $#items;
 432         $retstr .= " and " if $i+2 == @items;
 433     }
 434
 435     $retstr .= " entr" . ( @items > 1  ? "ies" : "y" )
 436             .  " elsewhere in this document ";
 437
 438     return $retstr;
 439
 440 }
 441
 442 BEGIN {
 443
 444 %HTML_Escapes = (
 445     'amp'       =>      '&',    #   ampersand
 446     'lt'        =>      '<',    #   left chevron, less-than
 447     'gt'        =>      '>',    #   right chevron, greater-than
 448     'quot'      =>      '"',    #   double quote
 449
 450     "Aacute"    =>      "\xC1", #   capital A, acute accent
 451     "aacute"    =>      "\xE1", #   small a, acute accent
 452     "Acirc"     =>      "\xC2", #   capital A, circumflex accent
 453     "acirc"     =>      "\xE2", #   small a, circumflex accent
 454     "AElig"     =>      "\xC6", #   capital AE diphthong (ligature)
 455     "aelig"     =>      "\xE6", #   small ae diphthong (ligature)
 456     "Agrave"    =>      "\xC0", #   capital A, grave accent
 457     "agrave"    =>      "\xE0", #   small a, grave accent
 458     "Aring"     =>      "\xC5", #   capital A, ring
 459     "aring"     =>      "\xE5", #   small a, ring
 460     "Atilde"    =>      "\xC3", #   capital A, tilde
 461     "atilde"    =>      "\xE3", #   small a, tilde
 462     "Auml"      =>      "\xC4", #   capital A, dieresis or umlaut mark
 463     "auml"      =>      "\xE4", #   small a, dieresis or umlaut mark
 464     "Ccedil"    =>      "\xC7", #   capital C, cedilla
 465     "ccedil"    =>      "\xE7", #   small c, cedilla
 466     "Eacute"    =>      "\xC9", #   capital E, acute accent
 467     "eacute"    =>      "\xE9", #   small e, acute accent
 468     "Ecirc"     =>      "\xCA", #   capital E, circumflex accent
 469     "ecirc"     =>      "\xEA", #   small e, circumflex accent
 470     "Egrave"    =>      "\xC8", #   capital E, grave accent
 471     "egrave"    =>      "\xE8", #   small e, grave accent
 472     "ETH"       =>      "\xD0", #   capital Eth, Icelandic
 473     "eth"       =>      "\xF0", #   small eth, Icelandic
 474     "Euml"      =>      "\xCB", #   capital E, dieresis or umlaut mark
 475     "euml"      =>      "\xEB", #   small e, dieresis or umlaut mark
 476     "Iacute"    =>      "\xCD", #   capital I, acute accent
 477     "iacute"    =>      "\xED", #   small i, acute accent
 478     "Icirc"     =>      "\xCE", #   capital I, circumflex accent
 479     "icirc"     =>      "\xEE", #   small i, circumflex accent
 480     "Igrave"    =>      "\xCD", #   capital I, grave accent
 481     "igrave"    =>      "\xED", #   small i, grave accent
 482     "Iuml"      =>      "\xCF", #   capital I, dieresis or umlaut mark
 483     "iuml"      =>      "\xEF", #   small i, dieresis or umlaut mark
 484     "Ntilde"    =>      "\xD1",         #   capital N, tilde
 485     "ntilde"    =>      "\xF1",         #   small n, tilde
 486     "Oacute"    =>      "\xD3", #   capital O, acute accent
 487     "oacute"    =>      "\xF3", #   small o, acute accent
 488     "Ocirc"     =>      "\xD4", #   capital O, circumflex accent
 489     "ocirc"     =>      "\xF4", #   small o, circumflex accent
 490     "Ograve"    =>      "\xD2", #   capital O, grave accent
 491     "ograve"    =>      "\xF2", #   small o, grave accent
 492     "Oslash"    =>      "\xD8", #   capital O, slash
 493     "oslash"    =>      "\xF8", #   small o, slash
 494     "Otilde"    =>      "\xD5", #   capital O, tilde
 495     "otilde"    =>      "\xF5", #   small o, tilde
 496     "Ouml"      =>      "\xD6", #   capital O, dieresis or umlaut mark
 497     "ouml"      =>      "\xF6", #   small o, dieresis or umlaut mark
 498     "szlig"     =>      "\xDF",         #   small sharp s, German (sz ligature)
 499     "THORN"     =>      "\xDE", #   capital THORN, Icelandic
 500     "thorn"     =>      "\xFE", #   small thorn, Icelandic
 501     "Uacute"    =>      "\xDA", #   capital U, acute accent
 502     "uacute"    =>      "\xFA", #   small u, acute accent
 503     "Ucirc"     =>      "\xDB", #   capital U, circumflex accent
 504     "ucirc"     =>      "\xFB", #   small u, circumflex accent
 505     "Ugrave"    =>      "\xD9", #   capital U, grave accent
 506     "ugrave"    =>      "\xF9", #   small u, grave accent
 507     "Uuml"      =>      "\xDC", #   capital U, dieresis or umlaut mark
 508     "uuml"      =>      "\xFC", #   small u, dieresis or umlaut mark
 509     "Yacute"    =>      "\xDD", #   capital Y, acute accent
 510     "yacute"    =>      "\xFD", #   small y, acute accent
 511     "yuml"      =>      "\xFF", #   small y, dieresis or umlaut mark
 512
 513     "lchevron"  =>      "\xAB", #   left chevron (double less than)
 514     "rchevron"  =>      "\xBB", #   right chevron (double greater than)
 515 );
 516 }
 517
 518 1;