lib/Pod/Text.pm

   1 package Pod::Text;
   2
   3 # Version 1.02
   4
   5 =head1 NAME
   6
   7 Pod::Text - convert POD data to formatted ASCII text
   8
   9 =head1 SYNOPSIS
  10
  11         use Pod::Text;
  12
  13         pod2text("perlfunc.pod");
  14
  15 Also:
  16
  17         pod2text < input.pod
  18
  19 =head1 DESCRIPTION
  20
  21 Pod::Text is a module that can convert documentation in the POD format (such
  22 as can be found throughout the Perl distribution) into formatted ASCII.
  23 Termcap is optionally supported for boldface/underline, and can enabled via
  24 C<$Pod::Text::termcap=1>. If termcap has not been enabled, then backspaces
  25 will be used to simulate bold and underlined text.
  26
  27 A separate F<pod2text> program is included that is primarily a wrapper for
  28 Pod::Text.
  29
  30 The single function C<pod2text()> can take one or two arguments. The first
  31 should be the name of a file to read the pod from, or "E<lt>&STDIN" to read from
  32 STDIN. A second argument, if provided, should be a filehandle glob where
  33 output should be sent.
  34
  35 =head1 AUTHOR
  36
  37 Tom Christiansen E<lt>F<tchrist@mox.perl.com>E<gt>
  38
  39 =head1 TODO
  40
  41 Cleanup work. The input and output locations need to be more flexible,
  42 termcap shouldn't be a global variable, and the terminal speed needs to
  43 be properly calculated.
  44
  45 =cut
  46
  47 use Term::Cap;
  48 require Exporter;
  49 @ISA = Exporter;
  50 @EXPORT = qw(pod2text);
  51
  52 $termcap=0;
  53
  54 #$use_format=1;
  55
  56 $UNDL = "\x1b[4m";
  57 $INV = "\x1b[7m";
  58 $BOLD = "\x1b[1m";
  59 $NORM = "\x1b[0m";
  60
  61 sub pod2text {
  62 local($file,*OUTPUT) = @_;
  63 *OUTPUT = *STDOUT if @_<2;
  64
  65 if($termcap and !$setuptermcap) {
  66         $setuptermcap=1;
  67
  68     my($term) = Tgetent Term::Cap { TERM => undef, OSPEED => 9600 };
  69     $UNDL = $term->{'_us'};
  70     $INV = $term->{'_mr'};
  71     $BOLD = $term->{'_md'};
  72     $NORM = $term->{'_me'};
  73 }
  74
  75 $SCREEN = ($_[0] =~ /^-(\d+)/ && (shift, $1))
  76        ||  $ENV{COLUMNS}
  77        || ($ENV{TERMCAP} =~ /co#(\d+)/)[0]
  78        || (`stty -a 2>/dev/null` =~ /(\d+) columns/)[0]
  79        || 72;
  80
  81 $/ = "";
  82
  83 $FANCY = 0;
  84
  85 $cutting = 1;
  86 $DEF_INDENT = 4;
  87 $indent = $DEF_INDENT;
  88 $needspace = 0;
  89 $begun = "";
  90
  91 open(IN, $file) || die "Couldn't open $file: $!";
  92
  93 POD_DIRECTIVE: while (<IN>) {
  94     if ($cutting) {
  95         next unless /^=/;
  96         $cutting = 0;
  97     }
  98     if ($begun) {
  99         if (/^=end\s+$begun/) {
 100              $begun = "";
 101         }
 102         elsif ($begun eq "text") {
 103             print STDOUT $_;
 104         }
 105         next;
 106     }
 107     1 while s{^(.*?)(\t+)(.*)$}{
 108         $1
 109         . (' ' x (length($2) * 8 - length($1) % 8))
 110         . $3
 111     }me;
 112     # Translate verbatim paragraph
 113     if (/^\s/) {
 114         $needspace = 1;
 115         output($_);
 116         next;
 117     }
 118
 119     if (/^=for\s+(\S+)\s*(.*)/s) {
 120         if ($1 eq "text") {
 121             print STDOUT $2,"";
 122         } else {
 123             # ignore unknown for
 124         }
 125         next;
 126     }
 127     elsif (/^=begin\s+(\S+)\s*(.*)/s) {
 128         $begun = $1;
 129         if ($1 eq "text") {
 130             print STDOUT $2."";
 131         }
 132         next;
 133     }
 134
 135 sub prepare_for_output {
 136
 137     s/\s*$/\n/;
 138     &init_noremap;
 139
 140     # need to hide E<> first; they're processed in clear_noremap
 141     s/(E<[^<>]+>)/noremap($1)/ge;
 142     $maxnest = 10;
 143     while ($maxnest-- && /[A-Z]</) {
 144         unless ($FANCY) {
 145             s/C<(.*?)>/`$1'/sg;
 146         } else {
 147             s/C<(.*?)>/noremap("E<lchevron>${1}E<rchevron>")/sge;
 148         }
 149         # s/[IF]<(.*?)>/italic($1)/ge;
 150         s/I<(.*?)>/*$1*/sg;
 151         # s/[CB]<(.*?)>/bold($1)/ge;
 152         s/X<.*?>//sg;
 153         # LREF: a manpage(3f)
 154         s:L<([a-zA-Z][^\s\/]+)(\([^\)]+\))?>:the $1$2 manpage:g;
 155         # LREF: an =item on another manpage
 156         s{
 157             L<
 158                 ([^/]+)
 159                 /
 160                 (
 161                     [:\w]+
 162                     (\(\))?
 163                 )
 164             >
 165         } {the "$2" entry in the $1 manpage}gx;
 166
 167         # LREF: an =item on this manpage
 168         s{
 169            ((?:
 170             L<
 171                 /
 172                 (
 173                     [:\w]+
 174                     (\(\))?
 175                 )
 176             >
 177             (,?\s+(and\s+)?)?
 178           )+)
 179         } { internal_lrefs($1) }gex;
 180
 181         # LREF: a =head2 (head1?), maybe on a manpage, maybe right here
 182         # the "func" can disambiguate
 183         s{
 184             L<
 185                 (?:
 186                     ([a-zA-Z]\S+?) /
 187                 )?
 188                 "?(.*?)"?
 189             >
 190         }{
 191             do {
 192                 $1      # if no $1, assume it means on this page.
 193                     ?  "the section on \"$2\" in the $1 manpage"
 194                     :  "the section on \"$2\""
 195             }
 196         }sgex;
 197
 198         s/[A-Z]<(.*?)>/$1/sg;
 199     }
 200     clear_noremap(1);
 201 }
 202
 203     &prepare_for_output;
 204
 205     if (s/^=//) {
 206         # $needspace = 0;               # Assume this.
 207         # s/\n/ /g;
 208         ($Cmd, $_) = split(' ', $_, 2);
 209         # clear_noremap(1);
 210         if ($Cmd eq 'cut') {
 211             $cutting = 1;
 212         }
 213         elsif ($Cmd eq 'head1') {
 214             makespace();
 215             print OUTPUT;
 216             # print OUTPUT uc($_);
 217         }
 218         elsif ($Cmd eq 'head2') {
 219             makespace();
 220             # s/(\w+)/\u\L$1/g;
 221             #print ' ' x $DEF_INDENT, $_;
 222             # print "\xA7";
 223             s/(\w)/\xA7 $1/ if $FANCY;
 224             print OUTPUT ' ' x ($DEF_INDENT/2), $_, "\n";
 225         }
 226         elsif ($Cmd eq 'over') {
 227             push(@indent,$indent);
 228             $indent += ($_ + 0) || $DEF_INDENT;
 229         }
 230         elsif ($Cmd eq 'back') {
 231             $indent = pop(@indent);
 232             warn "Unmatched =back\n" unless defined $indent;
 233             $needspace = 1;
 234         }
 235         elsif ($Cmd eq 'item') {
 236             makespace();
 237             # s/\A(\s*)\*/$1\xb7/ if $FANCY;
 238             # s/^(\s*\*\s+)/$1 /;
 239             {
 240                 if (length() + 3 < $indent) {
 241                     my $paratag = $_;
 242                     $_ = <IN>;
 243                     if (/^=/) {  # tricked!
 244                         local($indent) = $indent[$#index - 1] || $DEF_INDENT;
 245                         output($paratag);
 246                         redo POD_DIRECTIVE;
 247                     }
 248                     &prepare_for_output;
 249                     IP_output($paratag, $_);
 250                 } else {
 251                     local($indent) = $indent[$#index - 1] || $DEF_INDENT;
 252                     output($_);
 253                 }
 254             }
 255         }
 256         else {
 257             warn "Unrecognized directive: $Cmd\n";
 258         }
 259     }
 260     else {
 261         # clear_noremap(1);
 262         makespace();
 263         output($_, 1);
 264     }
 265 }
 266
 267 close(IN);
 268
 269 }
 270
 271 #########################################################################
 272
 273 sub makespace {
 274     if ($needspace) {
 275         print OUTPUT "\n";
 276         $needspace = 0;
 277     }
 278 }
 279
 280 sub bold {
 281     my $line = shift;
 282     return $line if $use_format;
 283     if($termcap) {
 284         $line = "$BOLD$line$NORM";
 285     } else {
 286             $line =~ s/(.)/$1\b$1/g;
 287         }
 288 #    $line = "$BOLD$line$NORM" if $ansify;
 289     return $line;
 290 }
 291
 292 sub italic {
 293     my $line = shift;
 294     return $line if $use_format;
 295     if($termcap) {
 296         $line = "$UNDL$line$NORM";
 297     } else {
 298             $line =~ s/(.)/$1\b_/g;
 299     }
 300 #    $line = "$UNDL$line$NORM" if $ansify;
 301     return $line;
 302 }
 303
 304 # Fill a paragraph including underlined and overstricken chars.
 305 # It's not perfect for words longer than the margin, and it's probably
 306 # slow, but it works.
 307 sub fill {
 308     local $_ = shift;
 309     my $par = "";
 310     my $indent_space = " " x $indent;
 311     my $marg = $SCREEN-$indent;
 312     my $line = $indent_space;
 313     my $line_length;
 314     foreach (split) {
 315         my $word_length = length;
 316         $word_length -= 2 while /\010/g;  # Subtract backspaces
 317
 318         if ($line_length + $word_length > $marg) {
 319             $par .= $line . "\n";
 320             $line= $indent_space . $_;
 321             $line_length = $word_length;
 322         }
 323         else {
 324             if ($line_length) {
 325                 $line_length++;
 326                 $line .= " ";
 327             }
 328             $line_length += $word_length;
 329             $line .= $_;
 330         }
 331     }
 332     $par .= "$line\n" if $line;
 333     $par .= "\n";
 334     return $par;
 335 }
 336
 337 sub IP_output {
 338     local($tag, $_) = @_;
 339     local($tag_indent) = $indent[$#index - 1] || $DEF_INDENT;
 340     $tag_cols = $SCREEN - $tag_indent;
 341     $cols = $SCREEN - $indent;
 342     $tag =~ s/\s*$//;
 343     s/\s+/ /g;
 344     s/^ //;
 345     $str = "format OUTPUT = \n"
 346         . (" " x ($tag_indent))
 347         . '@' . ('<' x ($indent - $tag_indent - 1))
 348         . "^" .  ("<" x ($cols - 1)) . "\n"
 349         . '$tag, $_'
 350         . "\n~~"
 351         . (" " x ($indent-2))
 352         . "^" .  ("<" x ($cols - 5)) . "\n"
 353         . '$_' . "\n\n.\n1";
 354     #warn $str; warn "tag is $tag, _ is $_";
 355     eval $str || die;
 356     write OUTPUT;
 357 }
 358
 359 sub output {
 360     local($_, $reformat) = @_;
 361     if ($reformat) {
 362         $cols = $SCREEN - $indent;
 363         s/\s+/ /g;
 364         s/^ //;
 365         $str = "format OUTPUT = \n~~"
 366             . (" " x ($indent-2))
 367             . "^" .  ("<" x ($cols - 5)) . "\n"
 368             . '$_' . "\n\n.\n1";
 369         eval $str || die;
 370         write OUTPUT;
 371     } else {
 372         s/^/' ' x $indent/gem;
 373         s/^\s+\n$/\n/gm;
 374         print OUTPUT;
 375     }
 376 }
 377
 378 sub noremap {
 379     local($thing_to_hide) = shift;
 380     $thing_to_hide =~ tr/\000-\177/\200-\377/;
 381     return $thing_to_hide;
 382 }
 383
 384 sub init_noremap {
 385     die "unmatched init" if $mapready++;
 386     #mask off high bit characters in input stream
 387     s/([\200-\377])/"E<".ord($1).">"/ge;
 388 }
 389
 390 sub clear_noremap {
 391     my $ready_to_print = $_[0];
 392     die "unmatched clear" unless $mapready--;
 393     tr/\200-\377/\000-\177/;
 394     # now for the E<>s, which have been hidden until now
 395     # otherwise the interative \w<> processing would have
 396     # been hosed by the E<gt>
 397     s {
 398             E<
 399             (
 400                 ( \d+ )
 401                 | ( [A-Za-z]+ )
 402             )
 403             >
 404     } {
 405          do {
 406                 defined $2
 407                 ? chr($2)
 408                 :
 409              defined $HTML_Escapes{$3}
 410                 ? do { $HTML_Escapes{$3} }
 411                 : do {
 412                     warn "Unknown escape: E<$1> in $_";
 413                     "E<$1>";
 414                 }
 415          }
 416     }egx if $ready_to_print;
 417 }
 418
 419 sub internal_lrefs {
 420     local($_) = shift;
 421     s{L</([^>]+)>}{$1}g;
 422     my(@items) = split( /(?:,?\s+(?:and\s+)?)/ );
 423     my $retstr = "the ";
 424     my $i;
 425     for ($i = 0; $i <= $#items; $i++) {
 426         $retstr .= "C<$items[$i]>";
 427         $retstr .= ", " if @items > 2 && $i != $#items;
 428         $retstr .= " and " if $i+2 == @items;
 429     }
 430
 431     $retstr .= " entr" . ( @items > 1  ? "ies" : "y" )
 432             .  " elsewhere in this document ";
 433
 434     return $retstr;
 435
 436 }
 437
 438 BEGIN {
 439
 440 %HTML_Escapes = (
 441     'amp'       =>      '&',    #   ampersand
 442     'lt'        =>      '<',    #   left chevron, less-than
 443     'gt'        =>      '>',    #   right chevron, greater-than
 444     'quot'      =>      '"',    #   double quote
 445
 446     "Aacute"    =>      "\xC1", #   capital A, acute accent
 447     "aacute"    =>      "\xE1", #   small a, acute accent
 448     "Acirc"     =>      "\xC2", #   capital A, circumflex accent
 449     "acirc"     =>      "\xE2", #   small a, circumflex accent
 450     "AElig"     =>      "\xC6", #   capital AE diphthong (ligature)
 451     "aelig"     =>      "\xE6", #   small ae diphthong (ligature)
 452     "Agrave"    =>      "\xC0", #   capital A, grave accent
 453     "agrave"    =>      "\xE0", #   small a, grave accent
 454     "Aring"     =>      "\xC5", #   capital A, ring
 455     "aring"     =>      "\xE5", #   small a, ring
 456     "Atilde"    =>      "\xC3", #   capital A, tilde
 457     "atilde"    =>      "\xE3", #   small a, tilde
 458     "Auml"      =>      "\xC4", #   capital A, dieresis or umlaut mark
 459     "auml"      =>      "\xE4", #   small a, dieresis or umlaut mark
 460     "Ccedil"    =>      "\xC7", #   capital C, cedilla
 461     "ccedil"    =>      "\xE7", #   small c, cedilla
 462     "Eacute"    =>      "\xC9", #   capital E, acute accent
 463     "eacute"    =>      "\xE9", #   small e, acute accent
 464     "Ecirc"     =>      "\xCA", #   capital E, circumflex accent
 465     "ecirc"     =>      "\xEA", #   small e, circumflex accent
 466     "Egrave"    =>      "\xC8", #   capital E, grave accent
 467     "egrave"    =>      "\xE8", #   small e, grave accent
 468     "ETH"       =>      "\xD0", #   capital Eth, Icelandic
 469     "eth"       =>      "\xF0", #   small eth, Icelandic
 470     "Euml"      =>      "\xCB", #   capital E, dieresis or umlaut mark
 471     "euml"      =>      "\xEB", #   small e, dieresis or umlaut mark
 472     "Iacute"    =>      "\xCD", #   capital I, acute accent
 473     "iacute"    =>      "\xED", #   small i, acute accent
 474     "Icirc"     =>      "\xCE", #   capital I, circumflex accent
 475     "icirc"     =>      "\xEE", #   small i, circumflex accent
 476     "Igrave"    =>      "\xCD", #   capital I, grave accent
 477     "igrave"    =>      "\xED", #   small i, grave accent
 478     "Iuml"      =>      "\xCF", #   capital I, dieresis or umlaut mark
 479     "iuml"      =>      "\xEF", #   small i, dieresis or umlaut mark
 480     "Ntilde"    =>      "\xD1",         #   capital N, tilde
 481     "ntilde"    =>      "\xF1",         #   small n, tilde
 482     "Oacute"    =>      "\xD3", #   capital O, acute accent
 483     "oacute"    =>      "\xF3", #   small o, acute accent
 484     "Ocirc"     =>      "\xD4", #   capital O, circumflex accent
 485     "ocirc"     =>      "\xF4", #   small o, circumflex accent
 486     "Ograve"    =>      "\xD2", #   capital O, grave accent
 487     "ograve"    =>      "\xF2", #   small o, grave accent
 488     "Oslash"    =>      "\xD8", #   capital O, slash
 489     "oslash"    =>      "\xF8", #   small o, slash
 490     "Otilde"    =>      "\xD5", #   capital O, tilde
 491     "otilde"    =>      "\xF5", #   small o, tilde
 492     "Ouml"      =>      "\xD6", #   capital O, dieresis or umlaut mark
 493     "ouml"      =>      "\xF6", #   small o, dieresis or umlaut mark
 494     "szlig"     =>      "\xDF",         #   small sharp s, German (sz ligature)
 495     "THORN"     =>      "\xDE", #   capital THORN, Icelandic
 496     "thorn"     =>      "\xFE", #   small thorn, Icelandic
 497     "Uacute"    =>      "\xDA", #   capital U, acute accent
 498     "uacute"    =>      "\xFA", #   small u, acute accent
 499     "Ucirc"     =>      "\xDB", #   capital U, circumflex accent
 500     "ucirc"     =>      "\xFB", #   small u, circumflex accent
 501     "Ugrave"    =>      "\xD9", #   capital U, grave accent
 502     "ugrave"    =>      "\xF9", #   small u, grave accent
 503     "Uuml"      =>      "\xDC", #   capital U, dieresis or umlaut mark
 504     "uuml"      =>      "\xFC", #   small u, dieresis or umlaut mark
 505     "Yacute"    =>      "\xDD", #   capital Y, acute accent
 506     "yacute"    =>      "\xFD", #   small y, acute accent
 507     "yuml"      =>      "\xFF", #   small y, dieresis or umlaut mark
 508
 509     "lchevron"  =>      "\xAB", #   left chevron (double less than)
 510     "rchevron"  =>      "\xBB", #   right chevron (double greater than)
 511 );
 512 }
 513
 514 1;