~ isn't valid in VMS filenames
[p5sagit/p5-mst-13.2.git] / lib / Pod / Text.pm
CommitLineData
69e00e79 1package Pod::Text;
2
69e00e79 3=head1 NAME
4
5Pod::Text - convert POD data to formatted ASCII text
6
7=head1 SYNOPSIS
8
9 use Pod::Text;
10
11 pod2text("perlfunc.pod");
12
13Also:
14
f2506fb2 15 pod2text [B<-a>] [B<->I<width>] < input.pod
69e00e79 16
17=head1 DESCRIPTION
18
19Pod::Text is a module that can convert documentation in the POD format (such
20as can be found throughout the Perl distribution) into formatted ASCII.
21Termcap is optionally supported for boldface/underline, and can enabled via
22C<$Pod::Text::termcap=1>. If termcap has not been enabled, then backspaces
23will be used to simulate bold and underlined text.
24
25A separate F<pod2text> program is included that is primarily a wrapper for
26Pod::Text.
27
f2506fb2 28The single function C<pod2text()> can take the optional options B<-a>
29for an alternative output format, then a B<->I<width> option with the
30max terminal width, followed by one or two arguments. The first
1fef88e7 31should be the name of a file to read the pod from, or "E<lt>&STDIN" to read from
69e00e79 32STDIN. A second argument, if provided, should be a filehandle glob where
33output should be sent.
34
35=head1 AUTHOR
36
1fef88e7 37Tom Christiansen E<lt>F<tchrist@mox.perl.com>E<gt>
69e00e79 38
39=head1 TODO
40
41Cleanup work. The input and output locations need to be more flexible,
42termcap shouldn't be a global variable, and the terminal speed needs to
43be properly calculated.
44
45=cut
46
47use Term::Cap;
48require Exporter;
49@ISA = Exporter;
50@EXPORT = qw(pod2text);
51
f02a87df 52use vars qw($VERSION);
fe1d48e4 53$VERSION = "1.0204";
f02a87df 54
3ec07288 55use locale; # make \w work right in non-ASCII lands
56
69e00e79 57$termcap=0;
58
f2506fb2 59$opt_alt_format = 0;
60
69e00e79 61#$use_format=1;
62
63$UNDL = "\x1b[4m";
64$INV = "\x1b[7m";
65$BOLD = "\x1b[1m";
66$NORM = "\x1b[0m";
67
68sub pod2text {
f2506fb2 69shift if $opt_alt_format = ($_[0] eq '-a');
69e00e79 70
71if($termcap and !$setuptermcap) {
72 $setuptermcap=1;
73
74 my($term) = Tgetent Term::Cap { TERM => undef, OSPEED => 9600 };
75 $UNDL = $term->{'_us'};
76 $INV = $term->{'_mr'};
77 $BOLD = $term->{'_md'};
78 $NORM = $term->{'_me'};
79}
80
81$SCREEN = ($_[0] =~ /^-(\d+)/ && (shift, $1))
69e00e79 82 || $ENV{COLUMNS}
36477c24 83 || ($ENV{TERMCAP} =~ /co#(\d+)/)[0]
39e571d4 84 || ($^O ne 'MSWin32' && $^O ne 'dos' && (`stty -a 2>/dev/null` =~ /(\d+) columns/)[0])
69e00e79 85 || 72;
86
f2506fb2 87@_ = ("<&STDIN") unless @_;
88local($file,*OUTPUT) = @_;
89*OUTPUT = *STDOUT if @_<2;
90
91local $: = $:;
92$: = " \n" if $opt_alt_format; # Do not break ``-L/lib/'' into ``- L/lib/''.
93
69e00e79 94$/ = "";
95
96$FANCY = 0;
97
98$cutting = 1;
99$DEF_INDENT = 4;
100$indent = $DEF_INDENT;
101$needspace = 0;
8c634b6e 102$begun = "";
69e00e79 103
2e917f57 104open(IN, $file) || die "Couldn't open $file: $!";
69e00e79 105
106POD_DIRECTIVE: while (<IN>) {
107 if ($cutting) {
108 next unless /^=/;
109 $cutting = 0;
110 }
8c634b6e 111 if ($begun) {
112 if (/^=end\s+$begun/) {
113 $begun = "";
114 }
115 elsif ($begun eq "text") {
78ff9ed7 116 print OUTPUT $_;
8c634b6e 117 }
118 next;
119 }
69e00e79 120 1 while s{^(.*?)(\t+)(.*)$}{
121 $1
122 . (' ' x (length($2) * 8 - length($1) % 8))
123 . $3
124 }me;
125 # Translate verbatim paragraph
126 if (/^\s/) {
69e00e79 127 output($_);
128 next;
129 }
130
f02a87df 131 if (/^=for\s+(\S+)\s*(.*)/s) {
8c634b6e 132 if ($1 eq "text") {
78ff9ed7 133 print OUTPUT $2,"";
8c634b6e 134 } else {
135 # ignore unknown for
136 }
137 next;
138 }
f02a87df 139 elsif (/^=begin\s+(\S+)\s*(.*)/s) {
8c634b6e 140 $begun = $1;
141 if ($1 eq "text") {
78ff9ed7 142 print OUTPUT $2."";
8c634b6e 143 }
144 next;
145 }
146
69e00e79 147sub prepare_for_output {
148
149 s/\s*$/\n/;
150 &init_noremap;
151
152 # need to hide E<> first; they're processed in clear_noremap
153 s/(E<[^<>]+>)/noremap($1)/ge;
154 $maxnest = 10;
155 while ($maxnest-- && /[A-Z]</) {
156 unless ($FANCY) {
f2506fb2 157 if ($opt_alt_format) {
158 s/[BC]<(.*?)>/``$1''/sg;
159 s/F<(.*?)>/"$1"/sg;
160 } else {
161 s/C<(.*?)>/`$1'/sg;
162 }
69e00e79 163 } else {
55497cff 164 s/C<(.*?)>/noremap("E<lchevron>${1}E<rchevron>")/sge;
69e00e79 165 }
166 # s/[IF]<(.*?)>/italic($1)/ge;
55497cff 167 s/I<(.*?)>/*$1*/sg;
69e00e79 168 # s/[CB]<(.*?)>/bold($1)/ge;
55497cff 169 s/X<.*?>//sg;
b74bceb9 170
171 # LREF: a la HREF L<show this text|man/section>
172 s:L<([^|>]+)\|[^>]+>:$1:g;
173
69e00e79 174 # LREF: a manpage(3f)
175 s:L<([a-zA-Z][^\s\/]+)(\([^\)]+\))?>:the $1$2 manpage:g;
176 # LREF: an =item on another manpage
177 s{
178 L<
179 ([^/]+)
180 /
181 (
182 [:\w]+
183 (\(\))?
184 )
185 >
186 } {the "$2" entry in the $1 manpage}gx;
187
188 # LREF: an =item on this manpage
189 s{
190 ((?:
191 L<
192 /
193 (
194 [:\w]+
195 (\(\))?
196 )
197 >
198 (,?\s+(and\s+)?)?
199 )+)
200 } { internal_lrefs($1) }gex;
201
202 # LREF: a =head2 (head1?), maybe on a manpage, maybe right here
203 # the "func" can disambiguate
204 s{
205 L<
206 (?:
207 ([a-zA-Z]\S+?) /
208 )?
209 "?(.*?)"?
210 >
211 }{
212 do {
213 $1 # if no $1, assume it means on this page.
214 ? "the section on \"$2\" in the $1 manpage"
215 : "the section on \"$2\""
216 }
55497cff 217 }sgex;
69e00e79 218
55497cff 219 s/[A-Z]<(.*?)>/$1/sg;
69e00e79 220 }
221 clear_noremap(1);
222}
223
224 &prepare_for_output;
225
226 if (s/^=//) {
227 # $needspace = 0; # Assume this.
228 # s/\n/ /g;
229 ($Cmd, $_) = split(' ', $_, 2);
230 # clear_noremap(1);
231 if ($Cmd eq 'cut') {
232 $cutting = 1;
233 }
78ff9ed7 234 elsif ($Cmd eq 'pod') {
235 $cutting = 0;
236 }
69e00e79 237 elsif ($Cmd eq 'head1') {
238 makespace();
f2506fb2 239 if ($opt_alt_format) {
240 print OUTPUT "\n";
241 s/^(.+?)[ \t]*$/==== $1 ====/;
242 }
69e00e79 243 print OUTPUT;
244 # print OUTPUT uc($_);
f2506fb2 245 $needspace = $opt_alt_format;
69e00e79 246 }
247 elsif ($Cmd eq 'head2') {
248 makespace();
249 # s/(\w+)/\u\L$1/g;
250 #print ' ' x $DEF_INDENT, $_;
251 # print "\xA7";
252 s/(\w)/\xA7 $1/ if $FANCY;
f2506fb2 253 if ($opt_alt_format) {
254 s/^(.+?)[ \t]*$/== $1 ==/;
255 print OUTPUT "\n", $_;
256 } else {
257 print OUTPUT ' ' x ($DEF_INDENT/2), $_, "\n";
258 }
259 $needspace = $opt_alt_format;
69e00e79 260 }
261 elsif ($Cmd eq 'over') {
262 push(@indent,$indent);
263 $indent += ($_ + 0) || $DEF_INDENT;
264 }
265 elsif ($Cmd eq 'back') {
266 $indent = pop(@indent);
267 warn "Unmatched =back\n" unless defined $indent;
69e00e79 268 }
269 elsif ($Cmd eq 'item') {
270 makespace();
271 # s/\A(\s*)\*/$1\xb7/ if $FANCY;
272 # s/^(\s*\*\s+)/$1 /;
273 {
274 if (length() + 3 < $indent) {
275 my $paratag = $_;
2e917f57 276 $_ = <IN>;
fe1d48e4 277 if (/^[=\s]/) { # tricked!, or verbatim paragraph
73875a17 278 local($indent) = $indent[$#indent - 1] || $DEF_INDENT;
69e00e79 279 output($paratag);
280 redo POD_DIRECTIVE;
281 }
282 &prepare_for_output;
283 IP_output($paratag, $_);
284 } else {
73875a17 285 local($indent) = $indent[$#indent - 1] || $DEF_INDENT;
f2506fb2 286 output($_, 0);
69e00e79 287 }
288 }
289 }
290 else {
291 warn "Unrecognized directive: $Cmd\n";
292 }
293 }
294 else {
295 # clear_noremap(1);
296 makespace();
297 output($_, 1);
298 }
299}
300
301close(IN);
302
303}
304
305#########################################################################
306
307sub makespace {
308 if ($needspace) {
309 print OUTPUT "\n";
310 $needspace = 0;
311 }
312}
313
314sub bold {
315 my $line = shift;
316 return $line if $use_format;
317 if($termcap) {
318 $line = "$BOLD$line$NORM";
319 } else {
320 $line =~ s/(.)/$1\b$1/g;
321 }
322# $line = "$BOLD$line$NORM" if $ansify;
323 return $line;
324}
325
326sub italic {
327 my $line = shift;
328 return $line if $use_format;
329 if($termcap) {
330 $line = "$UNDL$line$NORM";
331 } else {
332 $line =~ s/(.)/$1\b_/g;
333 }
334# $line = "$UNDL$line$NORM" if $ansify;
335 return $line;
336}
337
338# Fill a paragraph including underlined and overstricken chars.
339# It's not perfect for words longer than the margin, and it's probably
340# slow, but it works.
341sub fill {
342 local $_ = shift;
343 my $par = "";
344 my $indent_space = " " x $indent;
345 my $marg = $SCREEN-$indent;
346 my $line = $indent_space;
347 my $line_length;
348 foreach (split) {
349 my $word_length = length;
350 $word_length -= 2 while /\010/g; # Subtract backspaces
351
352 if ($line_length + $word_length > $marg) {
353 $par .= $line . "\n";
354 $line= $indent_space . $_;
355 $line_length = $word_length;
356 }
357 else {
358 if ($line_length) {
359 $line_length++;
360 $line .= " ";
361 }
362 $line_length += $word_length;
363 $line .= $_;
364 }
365 }
366 $par .= "$line\n" if $line;
367 $par .= "\n";
368 return $par;
369}
370
371sub IP_output {
372 local($tag, $_) = @_;
73875a17 373 local($tag_indent) = $indent[$#indent - 1] || $DEF_INDENT;
69e00e79 374 $tag_cols = $SCREEN - $tag_indent;
375 $cols = $SCREEN - $indent;
376 $tag =~ s/\s*$//;
377 s/\s+/ /g;
378 s/^ //;
379 $str = "format OUTPUT = \n"
f2506fb2 380 . (($opt_alt_format && $tag_indent > 1)
381 ? ":" . " " x ($tag_indent - 1)
382 : " " x ($tag_indent))
69e00e79 383 . '@' . ('<' x ($indent - $tag_indent - 1))
384 . "^" . ("<" x ($cols - 1)) . "\n"
385 . '$tag, $_'
386 . "\n~~"
387 . (" " x ($indent-2))
388 . "^" . ("<" x ($cols - 5)) . "\n"
389 . '$_' . "\n\n.\n1";
390 #warn $str; warn "tag is $tag, _ is $_";
391 eval $str || die;
392 write OUTPUT;
393}
394
395sub output {
396 local($_, $reformat) = @_;
397 if ($reformat) {
398 $cols = $SCREEN - $indent;
399 s/\s+/ /g;
400 s/^ //;
401 $str = "format OUTPUT = \n~~"
402 . (" " x ($indent-2))
403 . "^" . ("<" x ($cols - 5)) . "\n"
404 . '$_' . "\n\n.\n1";
405 eval $str || die;
406 write OUTPUT;
407 } else {
408 s/^/' ' x $indent/gem;
409 s/^\s+\n$/\n/gm;
f2506fb2 410 s/^ /: /s if defined($reformat) && $opt_alt_format;
69e00e79 411 print OUTPUT;
412 }
413}
414
415sub noremap {
416 local($thing_to_hide) = shift;
417 $thing_to_hide =~ tr/\000-\177/\200-\377/;
418 return $thing_to_hide;
419}
420
421sub init_noremap {
422 die "unmatched init" if $mapready++;
26fb054b 423 #mask off high bit characters in input stream
424 s/([\200-\377])/"E<".ord($1).">"/ge;
69e00e79 425}
426
427sub clear_noremap {
428 my $ready_to_print = $_[0];
429 die "unmatched clear" unless $mapready--;
430 tr/\200-\377/\000-\177/;
431 # now for the E<>s, which have been hidden until now
432 # otherwise the interative \w<> processing would have
433 # been hosed by the E<gt>
434 s {
26fb054b 435 E<
436 (
437 ( \d+ )
438 | ( [A-Za-z]+ )
439 )
69e00e79 440 >
441 } {
442 do {
26fb054b 443 defined $2
444 ? chr($2)
445 :
446 defined $HTML_Escapes{$3}
447 ? do { $HTML_Escapes{$3} }
69e00e79 448 : do {
f02a87df 449 warn "Unknown escape: E<$1> in $_";
69e00e79 450 "E<$1>";
451 }
452 }
453 }egx if $ready_to_print;
454}
455
456sub internal_lrefs {
457 local($_) = shift;
458 s{L</([^>]+)>}{$1}g;
459 my(@items) = split( /(?:,?\s+(?:and\s+)?)/ );
460 my $retstr = "the ";
461 my $i;
462 for ($i = 0; $i <= $#items; $i++) {
463 $retstr .= "C<$items[$i]>";
464 $retstr .= ", " if @items > 2 && $i != $#items;
465 $retstr .= " and " if $i+2 == @items;
466 }
467
468 $retstr .= " entr" . ( @items > 1 ? "ies" : "y" )
469 . " elsewhere in this document ";
470
471 return $retstr;
472
473}
474
475BEGIN {
476
5491a304 477if (ord("\t") == 9) {
69e00e79 478%HTML_Escapes = (
479 'amp' => '&', # ampersand
480 'lt' => '<', # left chevron, less-than
481 'gt' => '>', # right chevron, greater-than
482 'quot' => '"', # double quote
483
484 "Aacute" => "\xC1", # capital A, acute accent
485 "aacute" => "\xE1", # small a, acute accent
486 "Acirc" => "\xC2", # capital A, circumflex accent
487 "acirc" => "\xE2", # small a, circumflex accent
488 "AElig" => "\xC6", # capital AE diphthong (ligature)
489 "aelig" => "\xE6", # small ae diphthong (ligature)
490 "Agrave" => "\xC0", # capital A, grave accent
491 "agrave" => "\xE0", # small a, grave accent
492 "Aring" => "\xC5", # capital A, ring
493 "aring" => "\xE5", # small a, ring
494 "Atilde" => "\xC3", # capital A, tilde
495 "atilde" => "\xE3", # small a, tilde
496 "Auml" => "\xC4", # capital A, dieresis or umlaut mark
497 "auml" => "\xE4", # small a, dieresis or umlaut mark
498 "Ccedil" => "\xC7", # capital C, cedilla
499 "ccedil" => "\xE7", # small c, cedilla
500 "Eacute" => "\xC9", # capital E, acute accent
501 "eacute" => "\xE9", # small e, acute accent
502 "Ecirc" => "\xCA", # capital E, circumflex accent
503 "ecirc" => "\xEA", # small e, circumflex accent
504 "Egrave" => "\xC8", # capital E, grave accent
505 "egrave" => "\xE8", # small e, grave accent
506 "ETH" => "\xD0", # capital Eth, Icelandic
507 "eth" => "\xF0", # small eth, Icelandic
508 "Euml" => "\xCB", # capital E, dieresis or umlaut mark
509 "euml" => "\xEB", # small e, dieresis or umlaut mark
510 "Iacute" => "\xCD", # capital I, acute accent
511 "iacute" => "\xED", # small i, acute accent
512 "Icirc" => "\xCE", # capital I, circumflex accent
513 "icirc" => "\xEE", # small i, circumflex accent
514 "Igrave" => "\xCD", # capital I, grave accent
515 "igrave" => "\xED", # small i, grave accent
516 "Iuml" => "\xCF", # capital I, dieresis or umlaut mark
517 "iuml" => "\xEF", # small i, dieresis or umlaut mark
518 "Ntilde" => "\xD1", # capital N, tilde
519 "ntilde" => "\xF1", # small n, tilde
520 "Oacute" => "\xD3", # capital O, acute accent
521 "oacute" => "\xF3", # small o, acute accent
522 "Ocirc" => "\xD4", # capital O, circumflex accent
523 "ocirc" => "\xF4", # small o, circumflex accent
524 "Ograve" => "\xD2", # capital O, grave accent
525 "ograve" => "\xF2", # small o, grave accent
526 "Oslash" => "\xD8", # capital O, slash
527 "oslash" => "\xF8", # small o, slash
528 "Otilde" => "\xD5", # capital O, tilde
529 "otilde" => "\xF5", # small o, tilde
530 "Ouml" => "\xD6", # capital O, dieresis or umlaut mark
531 "ouml" => "\xF6", # small o, dieresis or umlaut mark
532 "szlig" => "\xDF", # small sharp s, German (sz ligature)
533 "THORN" => "\xDE", # capital THORN, Icelandic
534 "thorn" => "\xFE", # small thorn, Icelandic
535 "Uacute" => "\xDA", # capital U, acute accent
536 "uacute" => "\xFA", # small u, acute accent
537 "Ucirc" => "\xDB", # capital U, circumflex accent
538 "ucirc" => "\xFB", # small u, circumflex accent
539 "Ugrave" => "\xD9", # capital U, grave accent
540 "ugrave" => "\xF9", # small u, grave accent
541 "Uuml" => "\xDC", # capital U, dieresis or umlaut mark
542 "uuml" => "\xFC", # small u, dieresis or umlaut mark
543 "Yacute" => "\xDD", # capital Y, acute accent
544 "yacute" => "\xFD", # small y, acute accent
545 "yuml" => "\xFF", # small y, dieresis or umlaut mark
546
547 "lchevron" => "\xAB", # left chevron (double less than)
548 "rchevron" => "\xBB", # right chevron (double greater than)
549);
5491a304 550
551}
552else {
553
554# This hash assumes code page IBM-1047:
555%HTML_Escapes = (
556 'amp' => '&', # ampersand
557 'lt' => '<', # left chevron, less-than
558 'gt' => '>', # right chevron, greater-than
559 'quot' => '"', # double quote
560
561 "Aacute" => "\x65", # capital A, acute accent
562 "aacute" => "\x45", # small a, acute accent
563 "Acirc" => "\x62", # capital A, circumflex accent
564 "acirc" => "\x42", # small a, circumflex accent
565 "AElig" => "\x9E", # capital AE diphthong (ligature)
566 "aelig" => "\x9C", # small ae diphthong (ligature)
567 "Agrave" => "\x64", # capital A, grave accent
568 "agrave" => "\x44", # small a, grave accent
569 "Aring" => "\x67", # capital A, ring
570 "aring" => "\x47", # small a, ring
571 "Atilde" => "\x66", # capital A, tilde
572 "atilde" => "\x46", # small a, tilde
573 "Auml" => "\x63", # capital A, dieresis or umlaut mark
574 "auml" => "\x43", # small a, dieresis or umlaut mark
575 "Ccedil" => "\x68", # capital C, cedilla
576 "ccedil" => "\x48", # small c, cedilla
577 "Eacute" => "\x71", # capital E, acute accent
578 "eacute" => "\x51", # small e, acute accent
579 "Ecirc" => "\x72", # capital E, circumflex accent
580 "ecirc" => "\x52", # small e, circumflex accent
581 "Egrave" => "\x74", # capital E, grave accent
582 "egrave" => "\x54", # small e, grave accent
583 "ETH" => "\xAC", # capital Eth, Icelandic
584 "eth" => "\x8C", # small eth, Icelandic
585 "Euml" => "\x73", # capital E, dieresis or umlaut mark
586 "euml" => "\x53", # small e, dieresis or umlaut mark
587 "Iacute" => "\x75", # capital I, acute accent
588 "iacute" => "\x55", # small i, acute accent
589 "Icirc" => "\x76", # capital I, circumflex accent
590 "icirc" => "\x56", # small i, circumflex accent
591 "Igrave" => "\x75", # capital I, grave accent
592 "igrave" => "\x55", # small i, grave accent
593 "Iuml" => "\x77", # capital I, dieresis or umlaut mark
594 "iuml" => "\x57", # small i, dieresis or umlaut mark
595 "Ntilde" => "\x69", # capital N, tilde
596 "ntilde" => "\x49", # small n, tilde
597 "Oacute" => "\xEE", # capital O, acute accent
598 "oacute" => "\xCE", # small o, acute accent
599 "Ocirc" => "\xEB", # capital O, circumflex accent
600 "ocirc" => "\xCB", # small o, circumflex accent
601 "Ograve" => "\xED", # capital O, grave accent
602 "ograve" => "\xCD", # small o, grave accent
603 "Oslash" => "\x80", # capital O, slash
604 "oslash" => "\x70", # small o, slash
605 "Otilde" => "\xEF", # capital O, tilde
606 "otilde" => "\xCF", # small o, tilde
607 "Ouml" => "\xEC", # capital O, dieresis or umlaut mark
608 "ouml" => "\xCC", # small o, dieresis or umlaut mark
609 "szlig" => "\x59", # small sharp s, German (sz ligature)
610 "THORN" => "\xAE", # capital THORN, Icelandic
611 "thorn" => "\x8E", # small thorn, Icelandic
612 "Uacute" => "\xFE", # capital U, acute accent
613 "uacute" => "\xDE", # small u, acute accent
614 "Ucirc" => "\xFB", # capital U, circumflex accent
615 "ucirc" => "\xDB", # small u, circumflex accent
616 "Ugrave" => "\xFD", # capital U, grave accent
617 "ugrave" => "\xDD", # small u, grave accent
618 "Uuml" => "\xFC", # capital U, dieresis or umlaut mark
619 "uuml" => "\xDC", # small u, dieresis or umlaut mark
620 "Yacute" => "\xBA", # capital Y, acute accent
621 "yacute" => "\x8D", # small y, acute accent
622 "yuml" => "\xDF", # small y, dieresis or umlaut mark
623
624 "lchevron" => "\x8A", # left chevron (double less than)
625 "rchevron" => "\x8B", # right chevron (double greater than)
626);
627
628}
69e00e79 629}
630
6311;