Commit | Line | Data |
69e00e79 |
1 | package Pod::Text; |
2 | |
69e00e79 |
3 | =head1 NAME |
4 | |
5 | Pod::Text - convert POD data to formatted ASCII text |
6 | |
7 | =head1 SYNOPSIS |
8 | |
9 | use Pod::Text; |
10 | |
11 | pod2text("perlfunc.pod"); |
12 | |
13 | Also: |
14 | |
f2506fb2 |
15 | pod2text [B<-a>] [B<->I<width>] < input.pod |
69e00e79 |
16 | |
17 | =head1 DESCRIPTION |
18 | |
19 | Pod::Text is a module that can convert documentation in the POD format (such |
20 | as can be found throughout the Perl distribution) into formatted ASCII. |
21 | Termcap is optionally supported for boldface/underline, and can enabled via |
22 | C<$Pod::Text::termcap=1>. If termcap has not been enabled, then backspaces |
23 | will be used to simulate bold and underlined text. |
24 | |
25 | A separate F<pod2text> program is included that is primarily a wrapper for |
26 | Pod::Text. |
27 | |
f2506fb2 |
28 | The single function C<pod2text()> can take the optional options B<-a> |
29 | for an alternative output format, then a B<->I<width> option with the |
30 | max terminal width, followed by one or two arguments. The first |
1fef88e7 |
31 | should be the name of a file to read the pod from, or "E<lt>&STDIN" to read from |
69e00e79 |
32 | STDIN. A second argument, if provided, should be a filehandle glob where |
33 | output should be sent. |
34 | |
35 | =head1 AUTHOR |
36 | |
1fef88e7 |
37 | Tom Christiansen E<lt>F<tchrist@mox.perl.com>E<gt> |
69e00e79 |
38 | |
39 | =head1 TODO |
40 | |
41 | Cleanup work. The input and output locations need to be more flexible, |
42 | termcap shouldn't be a global variable, and the terminal speed needs to |
43 | be properly calculated. |
44 | |
45 | =cut |
46 | |
47 | use Term::Cap; |
48 | require Exporter; |
49 | @ISA = Exporter; |
50 | @EXPORT = qw(pod2text); |
51 | |
f02a87df |
52 | use vars qw($VERSION); |
fe1d48e4 |
53 | $VERSION = "1.0204"; |
f02a87df |
54 | |
3ec07288 |
55 | use locale; # make \w work right in non-ASCII lands |
56 | |
69e00e79 |
57 | $termcap=0; |
58 | |
f2506fb2 |
59 | $opt_alt_format = 0; |
60 | |
69e00e79 |
61 | #$use_format=1; |
62 | |
63 | $UNDL = "\x1b[4m"; |
64 | $INV = "\x1b[7m"; |
65 | $BOLD = "\x1b[1m"; |
66 | $NORM = "\x1b[0m"; |
67 | |
68 | sub pod2text { |
f2506fb2 |
69 | shift if $opt_alt_format = ($_[0] eq '-a'); |
69e00e79 |
70 | |
71 | if($termcap and !$setuptermcap) { |
72 | $setuptermcap=1; |
73 | |
74 | my($term) = Tgetent Term::Cap { TERM => undef, OSPEED => 9600 }; |
75 | $UNDL = $term->{'_us'}; |
76 | $INV = $term->{'_mr'}; |
77 | $BOLD = $term->{'_md'}; |
78 | $NORM = $term->{'_me'}; |
79 | } |
80 | |
81 | $SCREEN = ($_[0] =~ /^-(\d+)/ && (shift, $1)) |
69e00e79 |
82 | || $ENV{COLUMNS} |
36477c24 |
83 | || ($ENV{TERMCAP} =~ /co#(\d+)/)[0] |
39e571d4 |
84 | || ($^O ne 'MSWin32' && $^O ne 'dos' && (`stty -a 2>/dev/null` =~ /(\d+) columns/)[0]) |
69e00e79 |
85 | || 72; |
86 | |
f2506fb2 |
87 | @_ = ("<&STDIN") unless @_; |
88 | local($file,*OUTPUT) = @_; |
89 | *OUTPUT = *STDOUT if @_<2; |
90 | |
91 | local $: = $:; |
92 | $: = " \n" if $opt_alt_format; # Do not break ``-L/lib/'' into ``- L/lib/''. |
93 | |
69e00e79 |
94 | $/ = ""; |
95 | |
96 | $FANCY = 0; |
97 | |
98 | $cutting = 1; |
99 | $DEF_INDENT = 4; |
100 | $indent = $DEF_INDENT; |
101 | $needspace = 0; |
8c634b6e |
102 | $begun = ""; |
69e00e79 |
103 | |
2e917f57 |
104 | open(IN, $file) || die "Couldn't open $file: $!"; |
69e00e79 |
105 | |
106 | POD_DIRECTIVE: while (<IN>) { |
107 | if ($cutting) { |
108 | next unless /^=/; |
109 | $cutting = 0; |
110 | } |
8c634b6e |
111 | if ($begun) { |
112 | if (/^=end\s+$begun/) { |
113 | $begun = ""; |
114 | } |
115 | elsif ($begun eq "text") { |
78ff9ed7 |
116 | print OUTPUT $_; |
8c634b6e |
117 | } |
118 | next; |
119 | } |
69e00e79 |
120 | 1 while s{^(.*?)(\t+)(.*)$}{ |
121 | $1 |
122 | . (' ' x (length($2) * 8 - length($1) % 8)) |
123 | . $3 |
124 | }me; |
125 | # Translate verbatim paragraph |
126 | if (/^\s/) { |
69e00e79 |
127 | output($_); |
128 | next; |
129 | } |
130 | |
f02a87df |
131 | if (/^=for\s+(\S+)\s*(.*)/s) { |
8c634b6e |
132 | if ($1 eq "text") { |
78ff9ed7 |
133 | print OUTPUT $2,""; |
8c634b6e |
134 | } else { |
135 | # ignore unknown for |
136 | } |
137 | next; |
138 | } |
f02a87df |
139 | elsif (/^=begin\s+(\S+)\s*(.*)/s) { |
8c634b6e |
140 | $begun = $1; |
141 | if ($1 eq "text") { |
78ff9ed7 |
142 | print OUTPUT $2.""; |
8c634b6e |
143 | } |
144 | next; |
145 | } |
146 | |
69e00e79 |
147 | sub prepare_for_output { |
148 | |
149 | s/\s*$/\n/; |
150 | &init_noremap; |
151 | |
152 | # need to hide E<> first; they're processed in clear_noremap |
153 | s/(E<[^<>]+>)/noremap($1)/ge; |
154 | $maxnest = 10; |
155 | while ($maxnest-- && /[A-Z]</) { |
156 | unless ($FANCY) { |
f2506fb2 |
157 | if ($opt_alt_format) { |
158 | s/[BC]<(.*?)>/``$1''/sg; |
159 | s/F<(.*?)>/"$1"/sg; |
160 | } else { |
161 | s/C<(.*?)>/`$1'/sg; |
162 | } |
69e00e79 |
163 | } else { |
55497cff |
164 | s/C<(.*?)>/noremap("E<lchevron>${1}E<rchevron>")/sge; |
69e00e79 |
165 | } |
166 | # s/[IF]<(.*?)>/italic($1)/ge; |
55497cff |
167 | s/I<(.*?)>/*$1*/sg; |
69e00e79 |
168 | # s/[CB]<(.*?)>/bold($1)/ge; |
55497cff |
169 | s/X<.*?>//sg; |
b74bceb9 |
170 | |
171 | # LREF: a la HREF L<show this text|man/section> |
172 | s:L<([^|>]+)\|[^>]+>:$1:g; |
173 | |
69e00e79 |
174 | # LREF: a manpage(3f) |
175 | s:L<([a-zA-Z][^\s\/]+)(\([^\)]+\))?>:the $1$2 manpage:g; |
176 | # LREF: an =item on another manpage |
177 | s{ |
178 | L< |
179 | ([^/]+) |
180 | / |
181 | ( |
182 | [:\w]+ |
183 | (\(\))? |
184 | ) |
185 | > |
186 | } {the "$2" entry in the $1 manpage}gx; |
187 | |
188 | # LREF: an =item on this manpage |
189 | s{ |
190 | ((?: |
191 | L< |
192 | / |
193 | ( |
194 | [:\w]+ |
195 | (\(\))? |
196 | ) |
197 | > |
198 | (,?\s+(and\s+)?)? |
199 | )+) |
200 | } { internal_lrefs($1) }gex; |
201 | |
202 | # LREF: a =head2 (head1?), maybe on a manpage, maybe right here |
203 | # the "func" can disambiguate |
204 | s{ |
205 | L< |
206 | (?: |
207 | ([a-zA-Z]\S+?) / |
208 | )? |
209 | "?(.*?)"? |
210 | > |
211 | }{ |
212 | do { |
213 | $1 # if no $1, assume it means on this page. |
214 | ? "the section on \"$2\" in the $1 manpage" |
215 | : "the section on \"$2\"" |
216 | } |
55497cff |
217 | }sgex; |
69e00e79 |
218 | |
55497cff |
219 | s/[A-Z]<(.*?)>/$1/sg; |
69e00e79 |
220 | } |
221 | clear_noremap(1); |
222 | } |
223 | |
224 | &prepare_for_output; |
225 | |
226 | if (s/^=//) { |
227 | # $needspace = 0; # Assume this. |
228 | # s/\n/ /g; |
229 | ($Cmd, $_) = split(' ', $_, 2); |
230 | # clear_noremap(1); |
231 | if ($Cmd eq 'cut') { |
232 | $cutting = 1; |
233 | } |
78ff9ed7 |
234 | elsif ($Cmd eq 'pod') { |
235 | $cutting = 0; |
236 | } |
69e00e79 |
237 | elsif ($Cmd eq 'head1') { |
238 | makespace(); |
f2506fb2 |
239 | if ($opt_alt_format) { |
240 | print OUTPUT "\n"; |
241 | s/^(.+?)[ \t]*$/==== $1 ====/; |
242 | } |
69e00e79 |
243 | print OUTPUT; |
244 | # print OUTPUT uc($_); |
f2506fb2 |
245 | $needspace = $opt_alt_format; |
69e00e79 |
246 | } |
247 | elsif ($Cmd eq 'head2') { |
248 | makespace(); |
249 | # s/(\w+)/\u\L$1/g; |
250 | #print ' ' x $DEF_INDENT, $_; |
251 | # print "\xA7"; |
252 | s/(\w)/\xA7 $1/ if $FANCY; |
f2506fb2 |
253 | if ($opt_alt_format) { |
254 | s/^(.+?)[ \t]*$/== $1 ==/; |
255 | print OUTPUT "\n", $_; |
256 | } else { |
257 | print OUTPUT ' ' x ($DEF_INDENT/2), $_, "\n"; |
258 | } |
259 | $needspace = $opt_alt_format; |
69e00e79 |
260 | } |
261 | elsif ($Cmd eq 'over') { |
262 | push(@indent,$indent); |
263 | $indent += ($_ + 0) || $DEF_INDENT; |
264 | } |
265 | elsif ($Cmd eq 'back') { |
266 | $indent = pop(@indent); |
267 | warn "Unmatched =back\n" unless defined $indent; |
69e00e79 |
268 | } |
269 | elsif ($Cmd eq 'item') { |
270 | makespace(); |
271 | # s/\A(\s*)\*/$1\xb7/ if $FANCY; |
272 | # s/^(\s*\*\s+)/$1 /; |
273 | { |
274 | if (length() + 3 < $indent) { |
275 | my $paratag = $_; |
2e917f57 |
276 | $_ = <IN>; |
fe1d48e4 |
277 | if (/^[=\s]/) { # tricked!, or verbatim paragraph |
73875a17 |
278 | local($indent) = $indent[$#indent - 1] || $DEF_INDENT; |
69e00e79 |
279 | output($paratag); |
280 | redo POD_DIRECTIVE; |
281 | } |
282 | &prepare_for_output; |
283 | IP_output($paratag, $_); |
284 | } else { |
73875a17 |
285 | local($indent) = $indent[$#indent - 1] || $DEF_INDENT; |
f2506fb2 |
286 | output($_, 0); |
69e00e79 |
287 | } |
288 | } |
289 | } |
290 | else { |
291 | warn "Unrecognized directive: $Cmd\n"; |
292 | } |
293 | } |
294 | else { |
295 | # clear_noremap(1); |
296 | makespace(); |
297 | output($_, 1); |
298 | } |
299 | } |
300 | |
301 | close(IN); |
302 | |
303 | } |
304 | |
305 | ######################################################################### |
306 | |
307 | sub makespace { |
308 | if ($needspace) { |
309 | print OUTPUT "\n"; |
310 | $needspace = 0; |
311 | } |
312 | } |
313 | |
314 | sub bold { |
315 | my $line = shift; |
316 | return $line if $use_format; |
317 | if($termcap) { |
318 | $line = "$BOLD$line$NORM"; |
319 | } else { |
320 | $line =~ s/(.)/$1\b$1/g; |
321 | } |
322 | # $line = "$BOLD$line$NORM" if $ansify; |
323 | return $line; |
324 | } |
325 | |
326 | sub italic { |
327 | my $line = shift; |
328 | return $line if $use_format; |
329 | if($termcap) { |
330 | $line = "$UNDL$line$NORM"; |
331 | } else { |
332 | $line =~ s/(.)/$1\b_/g; |
333 | } |
334 | # $line = "$UNDL$line$NORM" if $ansify; |
335 | return $line; |
336 | } |
337 | |
338 | # Fill a paragraph including underlined and overstricken chars. |
339 | # It's not perfect for words longer than the margin, and it's probably |
340 | # slow, but it works. |
341 | sub fill { |
342 | local $_ = shift; |
343 | my $par = ""; |
344 | my $indent_space = " " x $indent; |
345 | my $marg = $SCREEN-$indent; |
346 | my $line = $indent_space; |
347 | my $line_length; |
348 | foreach (split) { |
349 | my $word_length = length; |
350 | $word_length -= 2 while /\010/g; # Subtract backspaces |
351 | |
352 | if ($line_length + $word_length > $marg) { |
353 | $par .= $line . "\n"; |
354 | $line= $indent_space . $_; |
355 | $line_length = $word_length; |
356 | } |
357 | else { |
358 | if ($line_length) { |
359 | $line_length++; |
360 | $line .= " "; |
361 | } |
362 | $line_length += $word_length; |
363 | $line .= $_; |
364 | } |
365 | } |
366 | $par .= "$line\n" if $line; |
367 | $par .= "\n"; |
368 | return $par; |
369 | } |
370 | |
371 | sub IP_output { |
372 | local($tag, $_) = @_; |
73875a17 |
373 | local($tag_indent) = $indent[$#indent - 1] || $DEF_INDENT; |
69e00e79 |
374 | $tag_cols = $SCREEN - $tag_indent; |
375 | $cols = $SCREEN - $indent; |
376 | $tag =~ s/\s*$//; |
377 | s/\s+/ /g; |
378 | s/^ //; |
379 | $str = "format OUTPUT = \n" |
f2506fb2 |
380 | . (($opt_alt_format && $tag_indent > 1) |
381 | ? ":" . " " x ($tag_indent - 1) |
382 | : " " x ($tag_indent)) |
69e00e79 |
383 | . '@' . ('<' x ($indent - $tag_indent - 1)) |
384 | . "^" . ("<" x ($cols - 1)) . "\n" |
385 | . '$tag, $_' |
386 | . "\n~~" |
387 | . (" " x ($indent-2)) |
388 | . "^" . ("<" x ($cols - 5)) . "\n" |
389 | . '$_' . "\n\n.\n1"; |
390 | #warn $str; warn "tag is $tag, _ is $_"; |
391 | eval $str || die; |
392 | write OUTPUT; |
393 | } |
394 | |
395 | sub output { |
396 | local($_, $reformat) = @_; |
397 | if ($reformat) { |
398 | $cols = $SCREEN - $indent; |
399 | s/\s+/ /g; |
400 | s/^ //; |
401 | $str = "format OUTPUT = \n~~" |
402 | . (" " x ($indent-2)) |
403 | . "^" . ("<" x ($cols - 5)) . "\n" |
404 | . '$_' . "\n\n.\n1"; |
405 | eval $str || die; |
406 | write OUTPUT; |
407 | } else { |
408 | s/^/' ' x $indent/gem; |
409 | s/^\s+\n$/\n/gm; |
f2506fb2 |
410 | s/^ /: /s if defined($reformat) && $opt_alt_format; |
69e00e79 |
411 | print OUTPUT; |
412 | } |
413 | } |
414 | |
415 | sub noremap { |
416 | local($thing_to_hide) = shift; |
417 | $thing_to_hide =~ tr/\000-\177/\200-\377/; |
418 | return $thing_to_hide; |
419 | } |
420 | |
421 | sub init_noremap { |
422 | die "unmatched init" if $mapready++; |
26fb054b |
423 | #mask off high bit characters in input stream |
424 | s/([\200-\377])/"E<".ord($1).">"/ge; |
69e00e79 |
425 | } |
426 | |
427 | sub clear_noremap { |
428 | my $ready_to_print = $_[0]; |
429 | die "unmatched clear" unless $mapready--; |
430 | tr/\200-\377/\000-\177/; |
431 | # now for the E<>s, which have been hidden until now |
432 | # otherwise the interative \w<> processing would have |
433 | # been hosed by the E<gt> |
434 | s { |
26fb054b |
435 | E< |
436 | ( |
437 | ( \d+ ) |
438 | | ( [A-Za-z]+ ) |
439 | ) |
69e00e79 |
440 | > |
441 | } { |
442 | do { |
26fb054b |
443 | defined $2 |
444 | ? chr($2) |
445 | : |
446 | defined $HTML_Escapes{$3} |
447 | ? do { $HTML_Escapes{$3} } |
69e00e79 |
448 | : do { |
f02a87df |
449 | warn "Unknown escape: E<$1> in $_"; |
69e00e79 |
450 | "E<$1>"; |
451 | } |
452 | } |
453 | }egx if $ready_to_print; |
454 | } |
455 | |
456 | sub internal_lrefs { |
457 | local($_) = shift; |
458 | s{L</([^>]+)>}{$1}g; |
459 | my(@items) = split( /(?:,?\s+(?:and\s+)?)/ ); |
460 | my $retstr = "the "; |
461 | my $i; |
462 | for ($i = 0; $i <= $#items; $i++) { |
463 | $retstr .= "C<$items[$i]>"; |
464 | $retstr .= ", " if @items > 2 && $i != $#items; |
465 | $retstr .= " and " if $i+2 == @items; |
466 | } |
467 | |
468 | $retstr .= " entr" . ( @items > 1 ? "ies" : "y" ) |
469 | . " elsewhere in this document "; |
470 | |
471 | return $retstr; |
472 | |
473 | } |
474 | |
475 | BEGIN { |
476 | |
477 | %HTML_Escapes = ( |
478 | 'amp' => '&', # ampersand |
479 | 'lt' => '<', # left chevron, less-than |
480 | 'gt' => '>', # right chevron, greater-than |
481 | 'quot' => '"', # double quote |
482 | |
483 | "Aacute" => "\xC1", # capital A, acute accent |
484 | "aacute" => "\xE1", # small a, acute accent |
485 | "Acirc" => "\xC2", # capital A, circumflex accent |
486 | "acirc" => "\xE2", # small a, circumflex accent |
487 | "AElig" => "\xC6", # capital AE diphthong (ligature) |
488 | "aelig" => "\xE6", # small ae diphthong (ligature) |
489 | "Agrave" => "\xC0", # capital A, grave accent |
490 | "agrave" => "\xE0", # small a, grave accent |
491 | "Aring" => "\xC5", # capital A, ring |
492 | "aring" => "\xE5", # small a, ring |
493 | "Atilde" => "\xC3", # capital A, tilde |
494 | "atilde" => "\xE3", # small a, tilde |
495 | "Auml" => "\xC4", # capital A, dieresis or umlaut mark |
496 | "auml" => "\xE4", # small a, dieresis or umlaut mark |
497 | "Ccedil" => "\xC7", # capital C, cedilla |
498 | "ccedil" => "\xE7", # small c, cedilla |
499 | "Eacute" => "\xC9", # capital E, acute accent |
500 | "eacute" => "\xE9", # small e, acute accent |
501 | "Ecirc" => "\xCA", # capital E, circumflex accent |
502 | "ecirc" => "\xEA", # small e, circumflex accent |
503 | "Egrave" => "\xC8", # capital E, grave accent |
504 | "egrave" => "\xE8", # small e, grave accent |
505 | "ETH" => "\xD0", # capital Eth, Icelandic |
506 | "eth" => "\xF0", # small eth, Icelandic |
507 | "Euml" => "\xCB", # capital E, dieresis or umlaut mark |
508 | "euml" => "\xEB", # small e, dieresis or umlaut mark |
509 | "Iacute" => "\xCD", # capital I, acute accent |
510 | "iacute" => "\xED", # small i, acute accent |
511 | "Icirc" => "\xCE", # capital I, circumflex accent |
512 | "icirc" => "\xEE", # small i, circumflex accent |
513 | "Igrave" => "\xCD", # capital I, grave accent |
514 | "igrave" => "\xED", # small i, grave accent |
515 | "Iuml" => "\xCF", # capital I, dieresis or umlaut mark |
516 | "iuml" => "\xEF", # small i, dieresis or umlaut mark |
517 | "Ntilde" => "\xD1", # capital N, tilde |
518 | "ntilde" => "\xF1", # small n, tilde |
519 | "Oacute" => "\xD3", # capital O, acute accent |
520 | "oacute" => "\xF3", # small o, acute accent |
521 | "Ocirc" => "\xD4", # capital O, circumflex accent |
522 | "ocirc" => "\xF4", # small o, circumflex accent |
523 | "Ograve" => "\xD2", # capital O, grave accent |
524 | "ograve" => "\xF2", # small o, grave accent |
525 | "Oslash" => "\xD8", # capital O, slash |
526 | "oslash" => "\xF8", # small o, slash |
527 | "Otilde" => "\xD5", # capital O, tilde |
528 | "otilde" => "\xF5", # small o, tilde |
529 | "Ouml" => "\xD6", # capital O, dieresis or umlaut mark |
530 | "ouml" => "\xF6", # small o, dieresis or umlaut mark |
531 | "szlig" => "\xDF", # small sharp s, German (sz ligature) |
532 | "THORN" => "\xDE", # capital THORN, Icelandic |
533 | "thorn" => "\xFE", # small thorn, Icelandic |
534 | "Uacute" => "\xDA", # capital U, acute accent |
535 | "uacute" => "\xFA", # small u, acute accent |
536 | "Ucirc" => "\xDB", # capital U, circumflex accent |
537 | "ucirc" => "\xFB", # small u, circumflex accent |
538 | "Ugrave" => "\xD9", # capital U, grave accent |
539 | "ugrave" => "\xF9", # small u, grave accent |
540 | "Uuml" => "\xDC", # capital U, dieresis or umlaut mark |
541 | "uuml" => "\xFC", # small u, dieresis or umlaut mark |
542 | "Yacute" => "\xDD", # capital Y, acute accent |
543 | "yacute" => "\xFD", # small y, acute accent |
544 | "yuml" => "\xFF", # small y, dieresis or umlaut mark |
545 | |
546 | "lchevron" => "\xAB", # left chevron (double less than) |
547 | "rchevron" => "\xBB", # right chevron (double greater than) |
548 | ); |
549 | } |
550 | |
551 | 1; |