pod/pod2text.PL

   1 #!/usr/local/bin/perl
   2
   3 $SCREEN = ($ARGV[0] =~ /^-(\d+)/ && (shift, $1))
   4        || ($ENV{TERMCAP} =~ /co#(\d+)/)[0]
   5        ||  $ENV{COLUMNS}
   6        || (`stty -a 2>/dev/null` =~ /(\d+) columns/)[0]
   7        || 72;
   8
   9 $/ = "";
  10
  11 $FANCY = 0;
  12
  13 $cutting = 1;
  14 $DEF_INDENT = 4;
  15 $indent = $DEF_INDENT;
  16 $needspace = 0;
  17
  18 POD_DIRECTIVE: while (<>) {
  19     if ($cutting) {
  20         next unless /^=/;
  21         $cutting = 0;
  22     }
  23     1 while s{^(.*?)(\t+)(.*)$}{
  24         $1
  25         . (' ' x (length($2) * 8 - length($1) % 8))
  26         . $3
  27     }me;
  28     # Translate verbatim paragraph
  29     if (/^\s/) {
  30         $needspace = 1;
  31         output($_);
  32         next;
  33     }
  34
  35 sub prepare_for_output {
  36
  37     s/\s*$/\n/;
  38     &init_noremap;
  39
  40     # need to hide E<> first; they're processed in clear_noremap
  41     s/(E<[^<>]+>)/noremap($1)/ge;
  42     $maxnest = 10;
  43     while ($maxnest-- && /[A-Z]</) {
  44         unless ($FANCY) {
  45             s/C<(.*?)>/`$1'/g;
  46         } else {
  47             s/C<(.*?)>/noremap("E<lchevron>${1}E<rchevron>")/ge;
  48         }
  49         # s/[IF]<(.*?)>/italic($1)/ge;
  50         s/I<(.*?)>/*$1*/g;
  51         # s/[CB]<(.*?)>/bold($1)/ge;
  52         s/X<.*?>//g;
  53         # LREF: a manpage(3f)
  54         s:L<([a-zA-Z][^\s\/]+)(\([^\)]+\))?>:the $1$2 manpage:g;
  55         # LREF: an =item on another manpage
  56         s{
  57             L<
  58                 ([^/]+)
  59                 /
  60                 (
  61                     [:\w]+
  62                     (\(\))?
  63                 )
  64             >
  65         } {the "$2" entry in the $1 manpage}gx;
  66
  67         # LREF: an =item on this manpage
  68         s{
  69            ((?:
  70             L<
  71                 /
  72                 (
  73                     [:\w]+
  74                     (\(\))?
  75                 )
  76             >
  77             (,?\s+(and\s+)?)?
  78           )+)
  79         } { internal_lrefs($1) }gex;
  80
  81         # LREF: a =head2 (head1?), maybe on a manpage, maybe right here
  82         # the "func" can disambiguate
  83         s{
  84             L<
  85                 (?:
  86                     ([a-zA-Z]\S+?) /
  87                 )?
  88                 "?(.*?)"?
  89             >
  90         }{
  91             do {
  92                 $1      # if no $1, assume it means on this page.
  93                     ?  "the section on \"$2\" in the $1 manpage"
  94                     :  "the section on \"$2\""
  95             }
  96         }gex;
  97
  98         s/[A-Z]<(.*?)>/$1/g;
  99     }
 100     clear_noremap(1);
 101 }
 102
 103     &prepare_for_output;
 104
 105     if (s/^=//) {
 106         # $needspace = 0;               # Assume this.
 107         # s/\n/ /g;
 108         ($Cmd, $_) = split(' ', $_, 2);
 109         # clear_noremap(1);
 110         if ($Cmd eq 'cut') {
 111             $cutting = 1;
 112         }
 113         elsif ($Cmd eq 'head1') {
 114             makespace();
 115             print;
 116             #print uc($_);
 117         }
 118         elsif ($Cmd eq 'head2') {
 119             makespace();
 120             # s/(\w+)/\u\L$1/g;
 121             #print ' ' x $DEF_INDENT, $_;
 122             # print "\xA7";
 123             s/(\w)/\xA7 $1/ if $FANCY;
 124             print ' ' x ($DEF_INDENT/2), $_, "\n";
 125         }
 126         elsif ($Cmd eq 'over') {
 127             push(@indent,$indent);
 128             $indent += ($_ + 0) || $DEF_INDENT;
 129         }
 130         elsif ($Cmd eq 'back') {
 131             $indent = pop(@indent);
 132             warn "Unmatched =back\n" unless defined $indent;
 133             $needspace = 1;
 134         }
 135         elsif ($Cmd eq 'item') {
 136             makespace();
 137             # s/\A(\s*)\*/$1\xb7/ if $FANCY;
 138             # s/^(\s*\*\s+)/$1 /;
 139             {
 140                 if (length() + 3 < $indent) {
 141                     my $paratag = $_;
 142                     $_ = <>;
 143                     if (/^=/) {  # tricked!
 144                         local($indent) = $indent[$#index - 1] || $DEF_INDENT;
 145                         output($paratag);
 146                         redo POD_DIRECTIVE;
 147                     }
 148                     &prepare_for_output;
 149                     IP_output($paratag, $_);
 150                 } else {
 151                     local($indent) = $indent[$#index - 1] || $DEF_INDENT;
 152                     output($_);
 153                 }
 154             }
 155         }
 156         else {
 157             warn "Unrecognized directive: $Cmd\n";
 158         }
 159     }
 160     else {
 161         # clear_noremap(1);
 162         makespace();
 163         output($_, 1);
 164     }
 165 }
 166
 167 #########################################################################
 168
 169 sub makespace {
 170     if ($needspace) {
 171         print "\n";
 172         $needspace = 0;
 173     }
 174 }
 175
 176 sub bold {
 177     my $line = shift;
 178     $line =~ s/(.)/$1\b$1/g;
 179     return $line;
 180 }
 181
 182 sub italic {
 183     my $line = shift;
 184     $line =~ s/(.)/_\b$1/g;
 185     return $line;
 186 }
 187
 188 sub IP_output {
 189     local($tag, $_) = @_;
 190     local($tag_indent) = $indent[$#index - 1] || $DEF_INDENT;
 191     $tag_cols = $SCREEN - $tag_indent;
 192     $cols = $SCREEN - $indent;
 193     $tag =~ s/\s*$//;
 194     s/\s+/ /g;
 195     s/^ //;
 196     $str = "format STDOUT = \n"
 197         . (" " x ($tag_indent))
 198         . '@' . ('<' x ($indent - $tag_indent - 1))
 199         . "^" .  ("<" x ($cols - 1)) . "\n"
 200         . '$tag, $_'
 201         . "\n~~"
 202         . (" " x ($indent-2))
 203         . "^" .  ("<" x ($cols - 5)) . "\n"
 204         . '$_' . "\n\n.\n1";
 205     #warn $str; warn "tag is $tag, _ is $_";
 206     eval $str || die;
 207     write;
 208 }
 209
 210 sub output {
 211     local($_, $reformat) = @_;
 212     if ($reformat) {
 213         $cols = $SCREEN - $indent;
 214         s/\s+/ /g;
 215         s/^ //;
 216         $str = "format STDOUT = \n~~"
 217             . (" " x ($indent-2))
 218             . "^" .  ("<" x ($cols - 5)) . "\n"
 219             . '$_' . "\n\n.\n1";
 220         eval $str || die;
 221         write;
 222     } else {
 223         s/^/' ' x $indent/gem;
 224         s/^\s+\n$/\n/gm;
 225         print;
 226     }
 227 }
 228
 229 sub noremap {
 230     local($thing_to_hide) = shift;
 231     $thing_to_hide =~ tr/\000-\177/\200-\377/;
 232     return $thing_to_hide;
 233 }
 234
 235 sub init_noremap {
 236     die "unmatched init" if $mapready++;
 237     if ( /[\200-\377]/ ) {
 238         warn "hit bit char in input stream";
 239     }
 240 }
 241
 242 sub clear_noremap {
 243     my $ready_to_print = $_[0];
 244     die "unmatched clear" unless $mapready--;
 245     tr/\200-\377/\000-\177/;
 246     # now for the E<>s, which have been hidden until now
 247     # otherwise the interative \w<> processing would have
 248     # been hosed by the E<gt>
 249     s {
 250             E<
 251             ( [A-Za-z]+ )
 252             >
 253     } {
 254          do {
 255              defined $HTML_Escapes{$1}
 256                 ? do { $HTML_Escapes{$1} }
 257                 : do {
 258                     warn "Unknown escape: $& in $_";
 259                     "E<$1>";
 260                 }
 261          }
 262     }egx if $ready_to_print;
 263 }
 264
 265 sub internal_lrefs {
 266     local($_) = shift;
 267     s{L</([^>]+)>}{$1}g;
 268     my(@items) = split( /(?:,?\s+(?:and\s+)?)/ );
 269     my $retstr = "the ";
 270     my $i;
 271     for ($i = 0; $i <= $#items; $i++) {
 272         $retstr .= "C<$items[$i]>";
 273         $retstr .= ", " if @items > 2 && $i != $#items;
 274         $retstr .= " and " if $i+2 == @items;
 275     }
 276
 277     $retstr .= " entr" . ( @items > 1  ? "ies" : "y" )
 278             .  " elsewhere in this document ";
 279
 280     return $retstr;
 281
 282 }
 283
 284 BEGIN {
 285
 286 %HTML_Escapes = (
 287     'amp'       =>      '&',    #   ampersand
 288     'lt'        =>      '<',    #   left chevron, less-than
 289     'gt'        =>      '>',    #   right chevron, greater-than
 290     'quot'      =>      '"',    #   double quote
 291
 292     "Aacute"    =>      "\xC1", #   capital A, acute accent
 293     "aacute"    =>      "\xE1", #   small a, acute accent
 294     "Acirc"     =>      "\xC2", #   capital A, circumflex accent
 295     "acirc"     =>      "\xE2", #   small a, circumflex accent
 296     "AElig"     =>      "\xC6", #   capital AE diphthong (ligature)
 297     "aelig"     =>      "\xE6", #   small ae diphthong (ligature)
 298     "Agrave"    =>      "\xC0", #   capital A, grave accent
 299     "agrave"    =>      "\xE0", #   small a, grave accent
 300     "Aring"     =>      "\xC5", #   capital A, ring
 301     "aring"     =>      "\xE5", #   small a, ring
 302     "Atilde"    =>      "\xC3", #   capital A, tilde
 303     "atilde"    =>      "\xE3", #   small a, tilde
 304     "Auml"      =>      "\xC4", #   capital A, dieresis or umlaut mark
 305     "auml"      =>      "\xE4", #   small a, dieresis or umlaut mark
 306     "Ccedil"    =>      "\xC7", #   capital C, cedilla
 307     "ccedil"    =>      "\xE7", #   small c, cedilla
 308     "Eacute"    =>      "\xC9", #   capital E, acute accent
 309     "eacute"    =>      "\xE9", #   small e, acute accent
 310     "Ecirc"     =>      "\xCA", #   capital E, circumflex accent
 311     "ecirc"     =>      "\xEA", #   small e, circumflex accent
 312     "Egrave"    =>      "\xC8", #   capital E, grave accent
 313     "egrave"    =>      "\xE8", #   small e, grave accent
 314     "ETH"       =>      "\xD0", #   capital Eth, Icelandic
 315     "eth"       =>      "\xF0", #   small eth, Icelandic
 316     "Euml"      =>      "\xCB", #   capital E, dieresis or umlaut mark
 317     "euml"      =>      "\xEB", #   small e, dieresis or umlaut mark
 318     "Iacute"    =>      "\xCD", #   capital I, acute accent
 319     "iacute"    =>      "\xED", #   small i, acute accent
 320     "Icirc"     =>      "\xCE", #   capital I, circumflex accent
 321     "icirc"     =>      "\xEE", #   small i, circumflex accent
 322     "Igrave"    =>      "\xCD", #   capital I, grave accent
 323     "igrave"    =>      "\xED", #   small i, grave accent
 324     "Iuml"      =>      "\xCF", #   capital I, dieresis or umlaut mark
 325     "iuml"      =>      "\xEF", #   small i, dieresis or umlaut mark
 326     "Ntilde"    =>      "\xD1",         #   capital N, tilde
 327     "ntilde"    =>      "\xF1",         #   small n, tilde
 328     "Oacute"    =>      "\xD3", #   capital O, acute accent
 329     "oacute"    =>      "\xF3", #   small o, acute accent
 330     "Ocirc"     =>      "\xD4", #   capital O, circumflex accent
 331     "ocirc"     =>      "\xF4", #   small o, circumflex accent
 332     "Ograve"    =>      "\xD2", #   capital O, grave accent
 333     "ograve"    =>      "\xF2", #   small o, grave accent
 334     "Oslash"    =>      "\xD8", #   capital O, slash
 335     "oslash"    =>      "\xF8", #   small o, slash
 336     "Otilde"    =>      "\xD5", #   capital O, tilde
 337     "otilde"    =>      "\xF5", #   small o, tilde
 338     "Ouml"      =>      "\xD6", #   capital O, dieresis or umlaut mark
 339     "ouml"      =>      "\xF6", #   small o, dieresis or umlaut mark
 340     "szlig"     =>      "\xDF",         #   small sharp s, German (sz ligature)
 341     "THORN"     =>      "\xDE", #   capital THORN, Icelandic
 342     "thorn"     =>      "\xFE", #   small thorn, Icelandic
 343     "Uacute"    =>      "\xDA", #   capital U, acute accent
 344     "uacute"    =>      "\xFA", #   small u, acute accent
 345     "Ucirc"     =>      "\xDB", #   capital U, circumflex accent
 346     "ucirc"     =>      "\xFB", #   small u, circumflex accent
 347     "Ugrave"    =>      "\xD9", #   capital U, grave accent
 348     "ugrave"    =>      "\xF9", #   small u, grave accent
 349     "Uuml"      =>      "\xDC", #   capital U, dieresis or umlaut mark
 350     "uuml"      =>      "\xFC", #   small u, dieresis or umlaut mark
 351     "Yacute"    =>      "\xDD", #   capital Y, acute accent
 352     "yacute"    =>      "\xFD", #   small y, acute accent
 353     "yuml"      =>      "\xFF", #   small y, dieresis or umlaut mark
 354
 355     "lchevron"  =>      "\xAB", #   left chevron (double less than)
 356     "rchevron"  =>      "\xBB", #   right chevron (double greater than)
 357 );
 358 }
 359