[p5sagit/p5-mst-13.2.git] / lib / Pod / Text.pm

package Pod::Text;

=head1 NAME

Pod::Text - convert POD data to formatted ASCII text

=head1 SYNOPSIS

	use Pod::Text;

	pod2text("perlfunc.pod");

Also:

	pod2text [B<-a>] [B<->I<width>] < input.pod

=head1 DESCRIPTION

Pod::Text is a module that can convert documentation in the POD format (such
as can be found throughout the Perl distribution) into formatted ASCII.
Termcap is optionally supported for boldface/underline, and can be enabled via
C<$Pod::Text::termcap=1>. If termcap has not been enabled, then backspaces
will be used to simulate bold and underlined text.

A separate F<pod2text> program is included that is primarily a wrapper for
Pod::Text.

The single function C<pod2text()> can take the optional options B<-a>
for an alternative output format, then a B<->I<width> option with the
max terminal width, followed by one or two arguments. The first
should be the name of a file to read the pod from, or "E<lt>&STDIN" to read from
STDIN. A second argument, if provided, should be a filehandle glob where
output should be sent.

=head1 AUTHOR

Tom Christiansen E<lt>F<tchrist@mox.perl.com>E<gt>

=head1 TODO

Cleanup work. The input and output locations need to be more flexible,
termcap shouldn't be a global variable, and the terminal speed needs to
be properly calculated.

=cut

use Term::Cap;
require Exporter;
@ISA = Exporter;
@EXPORT = qw(pod2text);

use vars qw($VERSION);
$VERSION = "1.0204";

use locale;	# make \w work right in non-ASCII lands

# When true, pod2text() looks up the terminal's own control sequences via
# Term::Cap (once per process) instead of using the defaults below.
$termcap=0;

# Set by pod2text() from its optional leading '-a' argument; selects the
# alternative output format throughout the formatting routines.
$opt_alt_format = 0;

# When true, bold() and italic() return their argument unchanged.
#$use_format=1;

# Default terminal control strings (ANSI escape sequences).  pod2text()
# replaces them with the termcap entries when $termcap is enabled.
$UNDL = "\x1b[4m";
$INV = "\x1b[7m";
$BOLD = "\x1b[1m";
$NORM = "\x1b[0m";

# pod2text( ['-a',] ['-WIDTH',] [FILE [, *OUTPUT]] )
#
# Reads POD from FILE and prints a plain-text rendering to OUTPUT.
#
#   '-a'      optional first argument; selects the alternative output format
#             (sets $opt_alt_format).
#   '-WIDTH'  optional next argument; a dash followed by digits gives the
#             screen width.  Without it the width falls back, in order, to
#             $ENV{COLUMNS}, the "co#" entry of $ENV{TERMCAP}, the output of
#             `stty -a` (not on MSWin32/dos), and finally 72.
#   FILE      passed as-is to two-argument open(), so mode strings such as
#             "<&STDIN" (the default when no arguments remain) work.
#   *OUTPUT   glob of the filehandle to print to; defaults to *STDOUT.
#
# Dies if FILE cannot be opened.  Warns about unmatched =back and about
# unrecognized directives.
#
# The paragraph being processed lives in $_ and is shared with the helper
# subs (prepare_for_output, output, IP_output, ...) through that global.
sub pod2text {
    shift if $opt_alt_format = ($_[0] eq '-a');

    # One-time lookup of the terminal's underline/reverse/bold/normal strings.
    if ($termcap and !$setuptermcap) {
        $setuptermcap = 1;

        my($term) = Term::Cap->Tgetent({ TERM => undef, OSPEED => 9600 });
        $UNDL = $term->{'_us'};
        $INV = $term->{'_mr'};
        $BOLD = $term->{'_md'};
        $NORM = $term->{'_me'};
    }

    # An explicit -WIDTH argument is consumed here; otherwise guess.
    $SCREEN = ($_[0] =~ /^-(\d+)/ && (shift, $1))
           ||  $ENV{COLUMNS}
           || ($ENV{TERMCAP} =~ /co#(\d+)/)[0]
           || ($^O ne 'MSWin32' && $^O ne 'dos' && (`stty -a 2>/dev/null` =~ /(\d+) columns/)[0])
           || 72;

    @_ = ("<&STDIN") unless @_;
    local($file,*OUTPUT) = @_;
    *OUTPUT = *STDOUT if @_<2;

    local $: = $:;
    $: = " \n" if $opt_alt_format;	# Do not break ``-L/lib/'' into ``- L/lib/''.

    # Read the input one paragraph at a time.  Both $/ and $_ are localized
    # so that the caller's record separator and topic variable survive the
    # call (the read loop below assigns to $_ and leaves it undef at EOF).
    local $/ = "";
    local $_;

    $FANCY = 0;

    $cutting = 1;		# true while skipping non-POD text
    $DEF_INDENT = 4;
    $indent = $DEF_INDENT;
    $needspace = 0;
    $begun = "";		# format name of the open =begin block, if any

    open(IN, $file) || die "Couldn't open $file: $!";

    POD_DIRECTIVE: while (<IN>) {
        if ($cutting) {
            next unless /^=/;
            $cutting = 0;
        }
        if ($begun) {
            # Inside =begin/=end: pass "text" blocks through verbatim and
            # drop everything else.  The format name is matched literally.
            if (/^=end\s+\Q$begun\E/) {
                $begun = "";
            }
            elsif ($begun eq "text") {
                print OUTPUT $_;
            }
            next;
        }
        # Expand tabs to 8-column tab stops, line by line.
        1 while s{^(.*?)(\t+)(.*)$}{
            $1
            . (' ' x (length($2) * 8 - length($1) % 8))
            . $3
        }me;
        # Translate verbatim paragraph
        if (/^\s/) {
            output($_);
            next;
        }

        if (/^=for\s+(\S+)\s*(.*)/s) {
            if ($1 eq "text") {
                print OUTPUT $2,"";
            } else {
                # ignore unknown for
            }
            next;
        }
        elsif (/^=begin\s+(\S+)\s*(.*)/s) {
            $begun = $1;
            if ($1 eq "text") {
                print OUTPUT $2."";
            }
            next;
        }

        prepare_for_output();

        if (s/^=//) {
            # $needspace = 0;		# Assume this.
            # s/\n/ /g;
            ($Cmd, $_) = split(' ', $_, 2);
            # clear_noremap(1);
            if ($Cmd eq 'cut') {
                $cutting = 1;
            }
            elsif ($Cmd eq 'pod') {
                $cutting = 0;
            }
            elsif ($Cmd eq 'head1') {
                makespace();
                if ($opt_alt_format) {
                    print OUTPUT "\n";
                    s/^(.+?)[ \t]*$/==== $1 ====/;
                }
                print OUTPUT;
                # print OUTPUT uc($_);
                $needspace = $opt_alt_format;
            }
            elsif ($Cmd eq 'head2') {
                makespace();
                # s/(\w+)/\u\L$1/g;
                #print ' ' x $DEF_INDENT, $_;
                # print "\xA7";
                s/(\w)/\xA7 $1/ if $FANCY;
                if ($opt_alt_format) {
                    s/^(.+?)[ \t]*$/==   $1   ==/;
                    print OUTPUT "\n", $_;
                } else {
                    print OUTPUT ' ' x ($DEF_INDENT/2), $_, "\n";
                }
                $needspace = $opt_alt_format;
            }
            elsif ($Cmd eq 'over') {
                push(@indent,$indent);
                $indent += ($_ + 0) || $DEF_INDENT;
            }
            elsif ($Cmd eq 'back') {
                $indent = pop(@indent);
                warn "Unmatched =back\n" unless defined $indent;
            }
            elsif ($Cmd eq 'item') {
                makespace();
                # s/\A(\s*)\*/$1\xb7/ if $FANCY;
                # s/^(\s*\*\s+)/$1 /;
                if (length() + 3 < $indent) {
                    # Short tag: try to put it on the same line as the
                    # paragraph that follows, so peek at that paragraph.
                    my $paratag = $_;
                    $_ = <IN>;
                    if (/^[=\s]/) {  # tricked!, or verbatim paragraph
                        local($indent) = $indent[$#indent - 1] || $DEF_INDENT;
                        output($paratag);
                        # Reprocess the paragraph just read as a fresh one.
                        redo POD_DIRECTIVE;
                    }
                    prepare_for_output();
                    IP_output($paratag, $_);
                } else {
                    local($indent) = $indent[$#indent - 1] || $DEF_INDENT;
                    output($_, 0);
                }
            }
            else {
                warn "Unrecognized directive: $Cmd\n";
            }
        }
        else {
            # clear_noremap(1);
            makespace();
            output($_, 1);
        }
    }

    close(IN);

}

# prepare_for_output()
#
# Rewrites the POD paragraph held in $_ in place: normalizes the trailing
# whitespace to a single newline, converts the interior sequences (C<>, I<>,
# X<>, L<>, ...) to plain text and finally expands E<> escapes.  E<> escapes
# are hidden with noremap() while the other sequences are processed, so that
# an expanded E<gt> cannot be mistaken for a closing bracket.
sub prepare_for_output {

    s/\s*$/\n/;
    init_noremap();

    # need to hide E<> first; they're processed in clear_noremap
    s/(E<[^<>]+>)/noremap($1)/ge;
    # Sequences may nest; each pass peels one level, up to 10 levels.
    $maxnest = 10;
    while ($maxnest-- && /[A-Z]</) {
        unless ($FANCY) {
            if ($opt_alt_format) {
                s/[BC]<(.*?)>/``$1''/sg;
                s/F<(.*?)>/"$1"/sg;
            } else {
                s/C<(.*?)>/`$1'/sg;
            }
        } else {
            s/C<(.*?)>/noremap("E<lchevron>${1}E<rchevron>")/sge;
        }
        # s/[IF]<(.*?)>/italic($1)/ge;
        s/I<(.*?)>/*$1*/sg;
        # s/[CB]<(.*?)>/bold($1)/ge;
        s/X<.*?>//sg;

        # LREF: a la HREF L<show this text|man/section>
        s:L<([^|>]+)\|[^>]+>:$1:g;

        # LREF: a manpage(3f)
        s:L<([a-zA-Z][^\s\/]+)(\([^\)]+\))?>:the $1$2 manpage:g;
        # LREF: an =item on another manpage
        s{
            L<
                ([^/]+)
                /
                (
                    [:\w]+
                    (\(\))?
                )
            >
        } {the "$2" entry in the $1 manpage}gx;

        # LREF: an =item on this manpage
        s{
           ((?:
            L<
                /
                (
                    [:\w]+
                    (\(\))?
                )
            >
            (,?\s+(and\s+)?)?
          )+)
        } { internal_lrefs($1) }gex;

        # LREF: a =head2 (head1?), maybe on a manpage, maybe right here
        # the "func" can disambiguate
        s{
            L<
                (?:
                    ([a-zA-Z]\S+?) / 
                )?
                "?(.*?)"?
            >
        }{
            do {
                $1 	# if no $1, assume it means on this page.
                    ?  "the section on \"$2\" in the $1 manpage"
                    :  "the section on \"$2\""
            }
        }sgex;

        # Any remaining sequence type is simply unwrapped.
        s/[A-Z]<(.*?)>/$1/sg;
    }
    clear_noremap(1);
}

#########################################################################

# Emit the blank line owed from an earlier heading, if one is pending, and
# clear the $needspace flag.
sub makespace {
    return unless $needspace;	# nothing pending
    print OUTPUT "\n";
    $needspace = 0;
}

# Return the given text marked up as bold.
#   - unchanged when $use_format is set;
#   - wrapped in $BOLD ... $NORM when $termcap is set;
#   - otherwise every character except newline is overstruck with itself
#     (char, backspace, char).
sub bold {
    my ($text) = @_;
    return $text if $use_format;
    return "$BOLD$text$NORM" if $termcap;
    $text =~ s/(.)/$1\b$1/g;
    return $text;
}

# Return the given text marked up as italic (rendered as underline).
#   - unchanged when $use_format is set;
#   - wrapped in $UNDL ... $NORM when $termcap is set;
#   - otherwise every character except newline is followed by a backspace
#     and an underscore.
sub italic {
    my ($text) = @_;
    return $text if $use_format;
    return "$UNDL$text$NORM" if $termcap;
    $text =~ s/(.)/$1\b_/g;
    return $text;
}

# Fill a paragraph including underlined and overstricken chars.
#
# Takes the paragraph text, splits it on whitespace and re-flows the words
# into lines indented by $indent columns whose text part is at most
# $SCREEN - $indent columns wide.  Returns the filled paragraph followed by
# an empty line.
#
# A word longer than the margin is not broken; it is placed on a line of
# its own and overflows.  It's probably slow, but it works.
sub fill {
    local $_ = shift;
    my $par = "";
    my $indent_space = " " x $indent;
    my $marg = $SCREEN-$indent;
    my $line = $indent_space;
    my $line_length = 0;		# visible width of the text on $line
    foreach (split) {
	my $word_length = length;
	# Each backspace hides itself and the character it overstrikes.
	$word_length -= 2 while /\010/g;  # Subtract backspaces

	# Start a new line when this word plus its separating space would
	# not fit.  Never break before the first word of a line: that would
	# only emit an empty line and the word would overflow just the same.
	if ($line_length && $line_length + 1 + $word_length > $marg) {
	    $par .= $line . "\n";
	    $line= $indent_space . $_;
	    $line_length = $word_length;
	}
	else {
	    if ($line_length) {
		$line_length++;
		$line .= " ";
	    }
	    $line_length += $word_length;
	    $line .= $_;
	}
    }
    # Test the length, not the truth value, so a final line of "0" is kept.
    $par .= "$line\n" if length $line;
    $par .= "\n";
    return $par;
}

# IP_output($tag, $text)
#
# Prints an =item whose tag is short enough to share a line with the start
# of its paragraph.  A two-picture format for the OUTPUT handle is built and
# compiled at run time: the first picture line holds the tag (left-justified
# at the enclosing indent) followed by the first line of text, the second
# (repeated via ~~) holds the continuation lines at the current $indent.
# In the alternative format the tag line starts with ":" when the tag indent
# is greater than 1.
#
# Dies if the generated format does not compile.
sub IP_output {
    local($tag, $_) = @_;
    local($tag_indent) = $indent[$#indent - 1] || $DEF_INDENT;
    $tag_cols = $SCREEN - $tag_indent;
    $cols = $SCREEN - $indent;
    $tag =~ s/\s*$//;
    # Collapse all whitespace; the format does the line breaking.
    s/\s+/ /g;
    s/^ //;
    # The trailing "1" after the format terminator makes a successful eval
    # return a true value.
    $str = "format OUTPUT = \n"
	. (($opt_alt_format && $tag_indent > 1)
	   ? ":" . " " x ($tag_indent - 1)
	   : " " x ($tag_indent))
	. '@' . ('<' x ($indent - $tag_indent - 1))
	. "^" .  ("<" x ($cols - 1)) . "\n"
	. '$tag, $_'
	. "\n~~"
	. (" " x ($indent-2))
	. "^" .  ("<" x ($cols - 5)) . "\n"
	. '$_' . "\n\n.\n1";
    #warn $str; warn "tag is $tag, _ is $_";
    # Low-precedence "or": with "||" the die would bind to $str and a
    # compile failure would go unnoticed.
    eval $str or die "Couldn't compile item format: $@";
    write OUTPUT;
}

# output($text [, $reformat])
#
# Prints one paragraph to OUTPUT at the current $indent.
#   $reformat true   - whitespace is collapsed and the text is re-filled to
#                      the screen width through a format compiled at run time.
#   $reformat false  - the text is printed as-is with every line indented;
#                      whitespace-only lines are reduced to a bare newline.
#                      When $reformat is defined (i.e. 0 was passed, as
#                      opposed to no argument) and the alternative format is
#                      active, the first two leading spaces become ": ".
#
# Dies if the generated format does not compile.
sub output {
    local($_, $reformat) = @_;
    if ($reformat) {
	$cols = $SCREEN - $indent;
	s/\s+/ /g;
	s/^ //;
	# The trailing "1" after the format terminator makes a successful
	# eval return a true value.
	$str = "format OUTPUT = \n~~"
	    . (" " x ($indent-2))
	    . "^" .  ("<" x ($cols - 5)) . "\n"
	    . '$_' . "\n\n.\n1";
	# Low-precedence "or": with "||" the die would bind to $str and a
	# compile failure would go unnoticed.
	eval $str or die "Couldn't compile paragraph format: $@";
	write OUTPUT;
    } else {
	s/^/' ' x $indent/gem;
	s/^\s+\n$/\n/gm;
	s/^  /: /s if defined($reformat) && $opt_alt_format;
	print OUTPUT;
    }
}

# Hide a string from the interior-sequence processing by moving every
# 7-bit character into the high-bit range (adds 128 to each code).
# clear_noremap() reverses the mapping.  Returns the hidden copy.
sub noremap {
    my ($hidden) = @_;
    $hidden =~ tr/\000-\177/\200-\377/;
    return $hidden;
}

# Begin a noremap section for the paragraph in $_.  Dies if a section is
# already open (calls must be paired with clear_noremap).  Every high-bit
# character in $_ is replaced by a numeric E<nnn> escape so that the
# high-bit range is free for noremap() to use.
sub init_noremap {
    my $already_open = $mapready++;
    die "unmatched init" if $already_open;
    s/([\200-\377])/"E<".ord($1).">"/ge;
}

# clear_noremap($ready_to_print)
#
# End the noremap section opened by init_noremap (dies if none is open) and
# map the hidden high-bit characters in $_ back to their 7-bit originals.
# When $ready_to_print is true the E<> escapes are expanded as well:
# E<number> becomes chr(number), E<name> is looked up in %HTML_Escapes, and
# an unknown name produces a warning and is left in place.
sub clear_noremap {
    my ($ready_to_print) = @_;
    die "unmatched clear" unless $mapready--;
    tr/\200-\377/\000-\177/;
    # now for the E<>s, which have been hidden until now
    # otherwise the iterative \w<> processing would have
    # been hosed by the E<gt>
    s {
	    E<
	    (
	    	( \d+ )
	    	| ( [A-Za-z]+ )
	    )
	    >
    } {
	  defined $2                ? chr($2)
	: defined $HTML_Escapes{$3} ? $HTML_Escapes{$3}
	: do {
	      warn "Unknown escape: E<$1> in $_";
	      "E<$1>";
	  }
    }egx if $ready_to_print;
}

# internal_lrefs($links)
#
# Takes a run of same-page item links ("L</a>, L</b> and L</c>") and returns
# an English phrase naming them, each wrapped in C<>, followed by
# " entry elsewhere in this document " (or " entries ..." for several).
sub internal_lrefs {
    my ($links) = @_;
    $links =~ s{L</([^>]+)>}{$1}g;
    my @items = split( /(?:,?\s+(?:and\s+)?)/, $links );
    my $last = $#items;
    my $retstr = "the ";
    for my $pos (0 .. $last) {
	$retstr .= "C<$items[$pos]>";
	# Commas only for three or more items; "and" before the last one.
	$retstr .= ", " if @items > 2 && $pos != $last;
	$retstr .= " and " if $pos == $last - 1;
    }

    my $noun = " entr" . ( @items > 1  ? "ies" : "y" );
    $retstr .= $noun . " elsewhere in this document ";

    return $retstr;
}

# Populate %HTML_Escapes, the table clear_noremap() uses to expand named
# E<> escapes.  The values are single characters of the platform's native
# 8-bit character set: ISO 8859-1 where tab is character 9, and code page
# IBM-1047 otherwise.
BEGIN {

if (ord("\t") == 9) {
%HTML_Escapes = (
    'amp'	=>	'&',	#   ampersand
    'lt'	=>	'<',	#   left chevron, less-than
    'gt'	=>	'>',	#   right chevron, greater-than
    'quot'	=>	'"',	#   double quote

    "Aacute"	=>	"\xC1",	#   capital A, acute accent
    "aacute"	=>	"\xE1",	#   small a, acute accent
    "Acirc"	=>	"\xC2",	#   capital A, circumflex accent
    "acirc"	=>	"\xE2",	#   small a, circumflex accent
    "AElig"	=>	"\xC6",	#   capital AE diphthong (ligature)
    "aelig"	=>	"\xE6",	#   small ae diphthong (ligature)
    "Agrave"	=>	"\xC0",	#   capital A, grave accent
    "agrave"	=>	"\xE0",	#   small a, grave accent
    "Aring"	=>	"\xC5",	#   capital A, ring
    "aring"	=>	"\xE5",	#   small a, ring
    "Atilde"	=>	"\xC3",	#   capital A, tilde
    "atilde"	=>	"\xE3",	#   small a, tilde
    "Auml"	=>	"\xC4",	#   capital A, dieresis or umlaut mark
    "auml"	=>	"\xE4",	#   small a, dieresis or umlaut mark
    "Ccedil"	=>	"\xC7",	#   capital C, cedilla
    "ccedil"	=>	"\xE7",	#   small c, cedilla
    "Eacute"	=>	"\xC9",	#   capital E, acute accent
    "eacute"	=>	"\xE9",	#   small e, acute accent
    "Ecirc"	=>	"\xCA",	#   capital E, circumflex accent
    "ecirc"	=>	"\xEA",	#   small e, circumflex accent
    "Egrave"	=>	"\xC8",	#   capital E, grave accent
    "egrave"	=>	"\xE8",	#   small e, grave accent
    "ETH"	=>	"\xD0",	#   capital Eth, Icelandic
    "eth"	=>	"\xF0",	#   small eth, Icelandic
    "Euml"	=>	"\xCB",	#   capital E, dieresis or umlaut mark
    "euml"	=>	"\xEB",	#   small e, dieresis or umlaut mark
    "Iacute"	=>	"\xCD",	#   capital I, acute accent
    "iacute"	=>	"\xED",	#   small i, acute accent
    "Icirc"	=>	"\xCE",	#   capital I, circumflex accent
    "icirc"	=>	"\xEE",	#   small i, circumflex accent
    # 0xCC/0xEC are the grave-accent code points; 0xCD/0xED are I acute.
    "Igrave"	=>	"\xCC",	#   capital I, grave accent
    "igrave"	=>	"\xEC",	#   small i, grave accent
    "Iuml"	=>	"\xCF",	#   capital I, dieresis or umlaut mark
    "iuml"	=>	"\xEF",	#   small i, dieresis or umlaut mark
    "Ntilde"	=>	"\xD1",		#   capital N, tilde
    "ntilde"	=>	"\xF1",		#   small n, tilde
    "Oacute"	=>	"\xD3",	#   capital O, acute accent
    "oacute"	=>	"\xF3",	#   small o, acute accent
    "Ocirc"	=>	"\xD4",	#   capital O, circumflex accent
    "ocirc"	=>	"\xF4",	#   small o, circumflex accent
    "Ograve"	=>	"\xD2",	#   capital O, grave accent
    "ograve"	=>	"\xF2",	#   small o, grave accent
    "Oslash"	=>	"\xD8",	#   capital O, slash
    "oslash"	=>	"\xF8",	#   small o, slash
    "Otilde"	=>	"\xD5",	#   capital O, tilde
    "otilde"	=>	"\xF5",	#   small o, tilde
    "Ouml"	=>	"\xD6",	#   capital O, dieresis or umlaut mark
    "ouml"	=>	"\xF6",	#   small o, dieresis or umlaut mark
    "szlig"	=>	"\xDF",		#   small sharp s, German (sz ligature)
    "THORN"	=>	"\xDE",	#   capital THORN, Icelandic
    "thorn"	=>	"\xFE",	#   small thorn, Icelandic
    "Uacute"	=>	"\xDA",	#   capital U, acute accent
    "uacute"	=>	"\xFA",	#   small u, acute accent
    "Ucirc"	=>	"\xDB",	#   capital U, circumflex accent
    "ucirc"	=>	"\xFB",	#   small u, circumflex accent
    "Ugrave"	=>	"\xD9",	#   capital U, grave accent
    "ugrave"	=>	"\xF9",	#   small u, grave accent
    "Uuml"	=>	"\xDC",	#   capital U, dieresis or umlaut mark
    "uuml"	=>	"\xFC",	#   small u, dieresis or umlaut mark
    "Yacute"	=>	"\xDD",	#   capital Y, acute accent
    "yacute"	=>	"\xFD",	#   small y, acute accent
    "yuml"	=>	"\xFF",	#   small y, dieresis or umlaut mark

    "lchevron"	=>	"\xAB",	#   left chevron (double less than)
    "rchevron"	=>	"\xBB",	#   right chevron (double greater than)
);

}
else {

# This hash assumes code page IBM-1047:
%HTML_Escapes = (
    'amp'	=>	'&',    #   ampersand
    'lt'	=>	'<',    #   left chevron, less-than
    'gt'	=>	'>',    #   right chevron, greater-than
    'quot'	=>	'"',    #   double quote

    "Aacute"	=>	"\x65", #   capital A, acute accent
    "aacute"	=>	"\x45", #   small a, acute accent
    "Acirc"	=>	"\x62", #   capital A, circumflex accent
    "acirc"	=>	"\x42", #   small a, circumflex accent
    "AElig"	=>	"\x9E", #   capital AE diphthong (ligature)
    "aelig"	=>	"\x9C", #   small ae diphthong (ligature)
    "Agrave"	=>	"\x64", #   capital A, grave accent
    "agrave"	=>	"\x44", #   small a, grave accent
    "Aring"	=>	"\x67", #   capital A, ring
    "aring"	=>	"\x47", #   small a, ring
    "Atilde"	=>	"\x66", #   capital A, tilde
    "atilde"	=>	"\x46", #   small a, tilde
    "Auml"	=>	"\x63", #   capital A, dieresis or umlaut mark
    "auml"	=>	"\x43", #   small a, dieresis or umlaut mark
    "Ccedil"	=>	"\x68", #   capital C, cedilla
    "ccedil"	=>	"\x48", #   small c, cedilla
    "Eacute"	=>	"\x71", #   capital E, acute accent
    "eacute"	=>	"\x51", #   small e, acute accent
    "Ecirc"	=>	"\x72", #   capital E, circumflex accent
    "ecirc"	=>	"\x52", #   small e, circumflex accent
    "Egrave"	=>	"\x74", #   capital E, grave accent
    "egrave"	=>	"\x54", #   small e, grave accent
    "ETH"	=>	"\xAC", #   capital Eth, Icelandic
    "eth"	=>	"\x8C", #   small eth, Icelandic
    "Euml"	=>	"\x73", #   capital E, dieresis or umlaut mark
    "euml"	=>	"\x53", #   small e, dieresis or umlaut mark
    "Iacute"	=>	"\x75", #   capital I, acute accent
    "iacute"	=>	"\x55", #   small i, acute accent
    "Icirc"	=>	"\x76", #   capital I, circumflex accent
    "icirc"	=>	"\x56", #   small i, circumflex accent
    # 0x78/0x58 are the grave-accent code points; 0x75/0x55 are I acute.
    "Igrave"	=>	"\x78", #   capital I, grave accent
    "igrave"	=>	"\x58", #   small i, grave accent
    "Iuml"	=>	"\x77", #   capital I, dieresis or umlaut mark
    "iuml"	=>	"\x57", #   small i, dieresis or umlaut mark
    "Ntilde"	=>	"\x69", #   capital N, tilde
    "ntilde"	=>	"\x49", #   small n, tilde
    "Oacute"	=>	"\xEE", #   capital O, acute accent
    "oacute"	=>	"\xCE", #   small o, acute accent
    "Ocirc"	=>	"\xEB", #   capital O, circumflex accent
    "ocirc"	=>	"\xCB", #   small o, circumflex accent
    "Ograve"	=>	"\xED", #   capital O, grave accent
    "ograve"	=>	"\xCD", #   small o, grave accent
    "Oslash"	=>	"\x80", #   capital O, slash
    "oslash"	=>	"\x70", #   small o, slash
    "Otilde"	=>	"\xEF", #   capital O, tilde
    "otilde"	=>	"\xCF", #   small o, tilde
    "Ouml"	=>	"\xEC", #   capital O, dieresis or umlaut mark
    "ouml"	=>	"\xCC", #   small o, dieresis or umlaut mark
    "szlig"	=>	"\x59", #   small sharp s, German (sz ligature)
    "THORN"	=>	"\xAE", #   capital THORN, Icelandic
    "thorn"	=>	"\x8E", #   small thorn, Icelandic
    "Uacute"	=>	"\xFE", #   capital U, acute accent
    "uacute"	=>	"\xDE", #   small u, acute accent
    "Ucirc"	=>	"\xFB", #   capital U, circumflex accent
    "ucirc"	=>	"\xDB", #   small u, circumflex accent
    "Ugrave"	=>	"\xFD", #   capital U, grave accent
    "ugrave"	=>	"\xDD", #   small u, grave accent
    "Uuml"	=>	"\xFC", #   capital U, dieresis or umlaut mark
    "uuml"	=>	"\xDC", #   small u, dieresis or umlaut mark
    "Yacute"	=>	"\xBA", #   capital Y, acute accent
    "yacute"	=>	"\x8D", #   small y, acute accent
    "yuml"	=>	"\xDF", #   small y, dieresis or umlaut mark

    "lchevron"	=>	"\x8A", #   left chevron (double less than)
    "rchevron"	=>	"\x8B", #   right chevron (double greater than)
);
 
}
}

1;
Commit	Line	Data
69e00e79	1	package Pod::Text;
69e00e79	2
69e00e79	3	=head1 NAME
	4
	5	Pod::Text - convert POD data to formatted ASCII text
	6
	7	=head1 SYNOPSIS
	8
	9	use Pod::Text;
	10
	11	pod2text("perlfunc.pod");
	12
	13	Also:
	14
f2506fb2	15	pod2text [B<-a>] [B<->I<width>] < input.pod
69e00e79	16
	17	=head1 DESCRIPTION
	18
	19	Pod::Text is a module that can convert documentation in the POD format (such
	20	as can be found throughout the Perl distribution) into formatted ASCII.
	21	Termcap is optionally supported for boldface/underline, and can enabled via
	22	C<$Pod::Text::termcap=1>. If termcap has not been enabled, then backspaces
	23	will be used to simulate bold and underlined text.
	24
	25	A separate F<pod2text> program is included that is primarily a wrapper for
	26	Pod::Text.
	27
f2506fb2	28	The single function C<pod2text()> can take the optional options B<-a>
	29	for an alternative output format, then a B<->I<width> option with the
	30	max terminal width, followed by one or two arguments. The first
1fef88e7	31	should be the name of a file to read the pod from, or "E<lt>&STDIN" to read from
69e00e79	32	STDIN. A second argument, if provided, should be a filehandle glob where
	33	output should be sent.
	34
	35	=head1 AUTHOR
	36
1fef88e7	37	Tom Christiansen E<lt>F<tchrist@mox.perl.com>E<gt>
69e00e79	38
	39	=head1 TODO
	40
	41	Cleanup work. The input and output locations need to be more flexible,
	42	termcap shouldn't be a global variable, and the terminal speed needs to
	43	be properly calculated.
	44
	45	=cut
	46
	47	use Term::Cap;
	48	require Exporter;
	49	@ISA = Exporter;
	50	@EXPORT = qw(pod2text);
	51
f02a87df	52	use vars qw($VERSION);
fe1d48e4	53	$VERSION = "1.0204";
f02a87df	54
3ec07288	55	use locale; # make \w work right in non-ASCII lands
3ec07288	56
69e00e79	57	$termcap=0;
69e00e79	58
f2506fb2	59	$opt_alt_format = 0;
f2506fb2	60
69e00e79	61	#$use_format=1;
	62
	63	$UNDL = "\x1b[4m";
	64	$INV = "\x1b[7m";
	65	$BOLD = "\x1b[1m";
	66	$NORM = "\x1b[0m";
	67
	68	sub pod2text {
f2506fb2	69	shift if $opt_alt_format = ($_[0] eq '-a');
69e00e79	70
	71	if($termcap and !$setuptermcap) {
	72	$setuptermcap=1;
	73
	74	my($term) = Tgetent Term::Cap { TERM => undef, OSPEED => 9600 };
	75	$UNDL = $term->{'_us'};
	76	$INV = $term->{'_mr'};
	77	$BOLD = $term->{'_md'};
	78	$NORM = $term->{'_me'};
	79	}
	80
	81	$SCREEN = ($_[0] =~ /^-(\d+)/ && (shift, $1))
69e00e79	82	\|\| $ENV{COLUMNS}
36477c24	83	\|\| ($ENV{TERMCAP} =~ /co#(\d+)/)[0]
39e571d4	84	\|\| ($^O ne 'MSWin32' && $^O ne 'dos' && (`stty -a 2>/dev/null` =~ /(\d+) columns/)[0])
69e00e79	85	\|\| 72;
69e00e79	86
f2506fb2	87	@_ = ("<&STDIN") unless @_;
	88	local($file,*OUTPUT) = @_;
	89	OUTPUT = STDOUT if @_<2;
	90
	91	local $: = $:;
	92	$: = " \n" if $opt_alt_format; # Do not break ``-L/lib/'' into ``- L/lib/''.
	93
69e00e79	94	$/ = "";
	95
	96	$FANCY = 0;
	97
	98	$cutting = 1;
	99	$DEF_INDENT = 4;
	100	$indent = $DEF_INDENT;
	101	$needspace = 0;
8c634b6e	102	$begun = "";
69e00e79	103
2e917f57	104	open(IN, $file) \|\| die "Couldn't open $file: $!";
69e00e79	105
	106	POD_DIRECTIVE: while (<IN>) {
	107	if ($cutting) {
	108	next unless /^=/;
	109	$cutting = 0;
	110	}
8c634b6e	111	if ($begun) {
	112	if (/^=end\s+$begun/) {
	113	$begun = "";
	114	}
	115	elsif ($begun eq "text") {
78ff9ed7	116	print OUTPUT $_;
8c634b6e	117	}
	118	next;
	119	}
69e00e79	120	1 while s{^(.?)(\t+)(.)$}{
	121	$1
	122	. (' ' x (length($2) * 8 - length($1) % 8))
	123	. $3
	124	}me;
	125	# Translate verbatim paragraph
	126	if (/^\s/) {
69e00e79	127	output($_);
	128	next;
	129	}
	130
f02a87df	131	if (/^=for\s+(\S+)\s(.)/s) {
8c634b6e	132	if ($1 eq "text") {
78ff9ed7	133	print OUTPUT $2,"";
8c634b6e	134	} else {
	135	# ignore unknown for
	136	}
	137	next;
	138	}
f02a87df	139	elsif (/^=begin\s+(\S+)\s(.)/s) {
8c634b6e	140	$begun = $1;
8c634b6e	141	if ($1 eq "text") {
78ff9ed7	142	print OUTPUT $2."";
8c634b6e	143	}
	144	next;
	145	}
	146
69e00e79	147	sub prepare_for_output {
	148
	149	s/\s*$/\n/;
	150	&init_noremap;
	151
	152	# need to hide E<> first; they're processed in clear_noremap
	153	s/(E<[^<>]+>)/noremap($1)/ge;
	154	$maxnest = 10;
	155	while ($maxnest-- && /[A-Z]</) {
	156	unless ($FANCY) {
f2506fb2	157	if ($opt_alt_format) {
	158	s/[BC]<(.*?)>/``$1''/sg;
	159	s/F<(.*?)>/"$1"/sg;
	160	} else {
	161	s/C<(.*?)>/`$1'/sg;
	162	}
69e00e79	163	} else {
55497cff	164	s/C<(.*?)>/noremap("E<lchevron>${1}E<rchevron>")/sge;
69e00e79	165	}
69e00e79	166	# s/[IF]<(.*?)>/italic($1)/ge;
55497cff	167	s/I<(.?)>/$1*/sg;
69e00e79	168	# s/[CB]<(.*?)>/bold($1)/ge;
55497cff	169	s/X<.*?>//sg;
b74bceb9	170
	171	# LREF: a la HREF L<show this text\|man/section>
	172	s:L<([^\|>]+)\\|[^>]+>:$1:g;
	173
69e00e79	174	# LREF: a manpage(3f)
	175	s:L<([a-zA-Z][^\s\/]+)(\([^\)]+\))?>:the $1$2 manpage:g;
	176	# LREF: an =item on another manpage
	177	s{
	178	L<
	179	([^/]+)
	180	/
	181	(
	182	[:\w]+
	183	(\(\))?
	184	)
	185	>
	186	} {the "$2" entry in the $1 manpage}gx;
	187
	188	# LREF: an =item on this manpage
	189	s{
	190	((?:
	191	L<
	192	/
	193	(
	194	[:\w]+
	195	(\(\))?
	196	)
	197	>
	198	(,?\s+(and\s+)?)?
	199	)+)
	200	} { internal_lrefs($1) }gex;
	201
	202	# LREF: a =head2 (head1?), maybe on a manpage, maybe right here
	203	# the "func" can disambiguate
	204	s{
	205	L<
	206	(?:
	207	([a-zA-Z]\S+?) /
	208	)?
	209	"?(.*?)"?
	210	>
	211	}{
	212	do {
	213	$1 # if no $1, assume it means on this page.
	214	? "the section on \"$2\" in the $1 manpage"
	215	: "the section on \"$2\""
	216	}
55497cff	217	}sgex;
69e00e79	218
55497cff	219	s/[A-Z]<(.*?)>/$1/sg;
69e00e79	220	}
	221	clear_noremap(1);
	222	}
	223
	224	&prepare_for_output;
	225
	226	if (s/^=//) {
	227	# $needspace = 0; # Assume this.
	228	# s/\n/ /g;
	229	($Cmd, $_) = split(' ', $_, 2);
	230	# clear_noremap(1);
	231	if ($Cmd eq 'cut') {
	232	$cutting = 1;
	233	}
78ff9ed7	234	elsif ($Cmd eq 'pod') {
	235	$cutting = 0;
	236	}
69e00e79	237	elsif ($Cmd eq 'head1') {
69e00e79	238	makespace();
f2506fb2	239	if ($opt_alt_format) {
	240	print OUTPUT "\n";
	241	s/^(.+?)[ \t]*$/==== $1 ====/;
	242	}
69e00e79	243	print OUTPUT;
69e00e79	244	# print OUTPUT uc($_);
f2506fb2	245	$needspace = $opt_alt_format;
69e00e79	246	}
	247	elsif ($Cmd eq 'head2') {
	248	makespace();
	249	# s/(\w+)/\u\L$1/g;
	250	#print ' ' x $DEF_INDENT, $_;
	251	# print "\xA7";
	252	s/(\w)/\xA7 $1/ if $FANCY;
f2506fb2	253	if ($opt_alt_format) {
	254	s/^(.+?)[ \t]*$/== $1 ==/;
	255	print OUTPUT "\n", $_;
	256	} else {
	257	print OUTPUT ' ' x ($DEF_INDENT/2), $_, "\n";
	258	}
	259	$needspace = $opt_alt_format;
69e00e79	260	}
	261	elsif ($Cmd eq 'over') {
	262	push(@indent,$indent);
	263	$indent += ($_ + 0) \|\| $DEF_INDENT;
	264	}
	265	elsif ($Cmd eq 'back') {
	266	$indent = pop(@indent);
	267	warn "Unmatched =back\n" unless defined $indent;
69e00e79	268	}
	269	elsif ($Cmd eq 'item') {
	270	makespace();
	271	# s/\A(\s)\/$1\xb7/ if $FANCY;
	272	# s/^(\s\\s+)/$1 /;
	273	{
	274	if (length() + 3 < $indent) {
	275	my $paratag = $_;
2e917f57	276	$_ = <IN>;
fe1d48e4	277	if (/^[=\s]/) { # tricked!, or verbatim paragraph
73875a17	278	local($indent) = $indent[$#indent - 1] \|\| $DEF_INDENT;
69e00e79	279	output($paratag);
	280	redo POD_DIRECTIVE;
	281	}
	282	&prepare_for_output;
	283	IP_output($paratag, $_);
	284	} else {
73875a17	285	local($indent) = $indent[$#indent - 1] \|\| $DEF_INDENT;
f2506fb2	286	output($_, 0);
69e00e79	287	}
	288	}
	289	}
	290	else {
	291	warn "Unrecognized directive: $Cmd\n";
	292	}
	293	}
	294	else {
	295	# clear_noremap(1);
	296	makespace();
	297	output($_, 1);
	298	}
	299	}
	300
	301	close(IN);
	302
	303	}
	304
	305	#########################################################################
	306
	307	sub makespace {
	308	if ($needspace) {
	309	print OUTPUT "\n";
	310	$needspace = 0;
	311	}
	312	}
	313
	314	sub bold {
	315	my $line = shift;
	316	return $line if $use_format;
	317	if($termcap) {
	318	$line = "$BOLD$line$NORM";
	319	} else {
	320	$line =~ s/(.)/$1\b$1/g;
	321	}
	322	# $line = "$BOLD$line$NORM" if $ansify;
	323	return $line;
	324	}
	325
	326	sub italic {
	327	my $line = shift;
	328	return $line if $use_format;
	329	if($termcap) {
	330	$line = "$UNDL$line$NORM";
	331	} else {
	332	$line =~ s/(.)/$1\b_/g;
	333	}
	334	# $line = "$UNDL$line$NORM" if $ansify;
	335	return $line;
	336	}
	337
	338	# Fill a paragraph including underlined and overstricken chars.
	339	# It's not perfect for words longer than the margin, and it's probably
	340	# slow, but it works.
	341	sub fill {
	342	local $_ = shift;
	343	my $par = "";
	344	my $indent_space = " " x $indent;
	345	my $marg = $SCREEN-$indent;
	346	my $line = $indent_space;
	347	my $line_length;
	348	foreach (split) {
	349	my $word_length = length;
	350	$word_length -= 2 while /\010/g; # Subtract backspaces
351
352	if ($line_length + $word_length > $marg) {
353	$par .= $line . "\n";
354	$line= $indent_space . $_;
355	$line_length = $word_length;
356	}
357	else {
358	if ($line_length) {
359	$line_length++;
360	$line .= " ";
361	}
362	$line_length += $word_length;
363	$line .= $_;
364	}
365	}
366	$par .= "$line\n" if $line;
367	$par .= "\n";
368	return $par;
369	}
370
371	sub IP_output {
372	local($tag, $_) = @_;
73875a17	373	local($tag_indent) = $indent[$#indent - 1] \|\| $DEF_INDENT;
69e00e79	374	$tag_cols = $SCREEN - $tag_indent;
	375	$cols = $SCREEN - $indent;
	376	$tag =~ s/\s*$//;
	377	s/\s+/ /g;
	378	s/^ //;
	379	$str = "format OUTPUT = \n"
f2506fb2	380	. (($opt_alt_format && $tag_indent > 1)
	381	? ":" . " " x ($tag_indent - 1)
	382	: " " x ($tag_indent))
69e00e79	383	. '@' . ('<' x ($indent - $tag_indent - 1))
	384	. "^" . ("<" x ($cols - 1)) . "\n"
	385	. '$tag, $_'
	386	. "\n~~"
	387	. (" " x ($indent-2))
	388	. "^" . ("<" x ($cols - 5)) . "\n"
	389	. '$_' . "\n\n.\n1";
	390	#warn $str; warn "tag is $tag, _ is $_";
	391	eval $str \|\| die;
	392	write OUTPUT;
	393	}
	394
	395	sub output {
	396	local($_, $reformat) = @_;
	397	if ($reformat) {
	398	$cols = $SCREEN - $indent;
	399	s/\s+/ /g;
	400	s/^ //;
	401	$str = "format OUTPUT = \n~~"
	402	. (" " x ($indent-2))
	403	. "^" . ("<" x ($cols - 5)) . "\n"
	404	. '$_' . "\n\n.\n1";
	405	eval $str \|\| die;
	406	write OUTPUT;
	407	} else {
	408	s/^/' ' x $indent/gem;
	409	s/^\s+\n$/\n/gm;
f2506fb2	410	s/^ /: /s if defined($reformat) && $opt_alt_format;
69e00e79	411	print OUTPUT;
	412	}
	413	}
	414
	415	sub noremap {
	416	local($thing_to_hide) = shift;
	417	$thing_to_hide =~ tr/\000-\177/\200-\377/;
	418	return $thing_to_hide;
	419	}
	420
	421	sub init_noremap {
	422	die "unmatched init" if $mapready++;
26fb054b	423	#mask off high bit characters in input stream
26fb054b	424	s/([\200-\377])/"E<".ord($1).">"/ge;
69e00e79	425	}
	426
	427	sub clear_noremap {
	428	my $ready_to_print = $_[0];
	429	die "unmatched clear" unless $mapready--;
	430	tr/\200-\377/\000-\177/;
	431	# now for the E<>s, which have been hidden until now
	432	# otherwise the interative \w<> processing would have
	433	# been hosed by the E<gt>
	434	s {
26fb054b	435	E<
	436	(
	437	( \d+ )
	438	\| ( [A-Za-z]+ )
	439	)
69e00e79	440	>
	441	} {
	442	do {
26fb054b	443	defined $2
	444	? chr($2)
	445	:
	446	defined $HTML_Escapes{$3}
	447	? do { $HTML_Escapes{$3} }
69e00e79	448	: do {
f02a87df	449	warn "Unknown escape: E<$1> in $_";
69e00e79	450	"E<$1>";
	451	}
	452	}
	453	}egx if $ready_to_print;
	454	}
	455
	456	sub internal_lrefs {
	457	local($_) = shift;
	458	s{L</([^>]+)>}{$1}g;
	459	my(@items) = split( /(?:,?\s+(?:and\s+)?)/ );
	460	my $retstr = "the ";
	461	my $i;
	462	for ($i = 0; $i <= $#items; $i++) {
	463	$retstr .= "C<$items[$i]>";
	464	$retstr .= ", " if @items > 2 && $i != $#items;
	465	$retstr .= " and " if $i+2 == @items;
	466	}
	467
	468	$retstr .= " entr" . ( @items > 1 ? "ies" : "y" )
	469	. " elsewhere in this document ";
	470
	471	return $retstr;
	472
	473	}
	474
	475	BEGIN {
	476
5491a304	477	if (ord("\t") == 9) {
69e00e79	478	%HTML_Escapes = (
	479	'amp' => '&', # ampersand
	480	'lt' => '<', # left chevron, less-than
	481	'gt' => '>', # right chevron, greater-than
	482	'quot' => '"', # double quote
	483
	484	"Aacute" => "\xC1", # capital A, acute accent
	485	"aacute" => "\xE1", # small a, acute accent
	486	"Acirc" => "\xC2", # capital A, circumflex accent
	487	"acirc" => "\xE2", # small a, circumflex accent
	488	"AElig" => "\xC6", # capital AE diphthong (ligature)
	489	"aelig" => "\xE6", # small ae diphthong (ligature)
	490	"Agrave" => "\xC0", # capital A, grave accent
	491	"agrave" => "\xE0", # small a, grave accent
	492	"Aring" => "\xC5", # capital A, ring
	493	"aring" => "\xE5", # small a, ring
	494	"Atilde" => "\xC3", # capital A, tilde
	495	"atilde" => "\xE3", # small a, tilde
	496	"Auml" => "\xC4", # capital A, dieresis or umlaut mark
	497	"auml" => "\xE4", # small a, dieresis or umlaut mark
	498	"Ccedil" => "\xC7", # capital C, cedilla
	499	"ccedil" => "\xE7", # small c, cedilla
	500	"Eacute" => "\xC9", # capital E, acute accent
	501	"eacute" => "\xE9", # small e, acute accent
	502	"Ecirc" => "\xCA", # capital E, circumflex accent
	503	"ecirc" => "\xEA", # small e, circumflex accent
	504	"Egrave" => "\xC8", # capital E, grave accent
	505	"egrave" => "\xE8", # small e, grave accent
	506	"ETH" => "\xD0", # capital Eth, Icelandic
	507	"eth" => "\xF0", # small eth, Icelandic
	508	"Euml" => "\xCB", # capital E, dieresis or umlaut mark
	509	"euml" => "\xEB", # small e, dieresis or umlaut mark
	510	"Iacute" => "\xCD", # capital I, acute accent
	511	"iacute" => "\xED", # small i, acute accent
	512	"Icirc" => "\xCE", # capital I, circumflex accent
	513	"icirc" => "\xEE", # small i, circumflex accent
	514	"Igrave" => "\xCD", # capital I, grave accent
	515	"igrave" => "\xED", # small i, grave accent
	516	"Iuml" => "\xCF", # capital I, dieresis or umlaut mark
	517	"iuml" => "\xEF", # small i, dieresis or umlaut mark
	518	"Ntilde" => "\xD1", # capital N, tilde
	519	"ntilde" => "\xF1", # small n, tilde
	520	"Oacute" => "\xD3", # capital O, acute accent
	521	"oacute" => "\xF3", # small o, acute accent
	522	"Ocirc" => "\xD4", # capital O, circumflex accent
	523	"ocirc" => "\xF4", # small o, circumflex accent
	524	"Ograve" => "\xD2", # capital O, grave accent
	525	"ograve" => "\xF2", # small o, grave accent
	526	"Oslash" => "\xD8", # capital O, slash
	527	"oslash" => "\xF8", # small o, slash
	528	"Otilde" => "\xD5", # capital O, tilde
	529	"otilde" => "\xF5", # small o, tilde
	530	"Ouml" => "\xD6", # capital O, dieresis or umlaut mark
	531	"ouml" => "\xF6", # small o, dieresis or umlaut mark
	532	"szlig" => "\xDF", # small sharp s, German (sz ligature)
	533	"THORN" => "\xDE", # capital THORN, Icelandic
	534	"thorn" => "\xFE", # small thorn, Icelandic
	535	"Uacute" => "\xDA", # capital U, acute accent
	536	"uacute" => "\xFA", # small u, acute accent
	537	"Ucirc" => "\xDB", # capital U, circumflex accent
	538	"ucirc" => "\xFB", # small u, circumflex accent
	539	"Ugrave" => "\xD9", # capital U, grave accent
	540	"ugrave" => "\xF9", # small u, grave accent
	541	"Uuml" => "\xDC", # capital U, dieresis or umlaut mark
542	"uuml" => "\xFC", # small u, dieresis or umlaut mark
543	"Yacute" => "\xDD", # capital Y, acute accent
544	"yacute" => "\xFD", # small y, acute accent
545	"yuml" => "\xFF", # small y, dieresis or umlaut mark
546
547	"lchevron" => "\xAB", # left chevron (double less than)
548	"rchevron" => "\xBB", # right chevron (double greater than)
549	);
5491a304	550
	551	}
	552	else {
	553
	# Table of named HTML-style character escapes for EBCDIC platforms.
	# Keys are the escape names (case-sensitive, e.g. "Aacute" vs. "aacute");
	# values are the corresponding single-byte characters.  The hex values
	# below assume code page IBM-1047.
	#
	# Every accented letter must map to its own code point: in IBM-1047
	# I-grave is 0x78/0x58, distinct from I-acute at 0x75/0x55.
	%HTML_Escapes = (
		'amp'      => '&',    # ampersand
		'lt'       => '<',    # left chevron, less-than
		'gt'       => '>',    # right chevron, greater-than
		'quot'     => '"',    # double quote

		"Aacute"   => "\x65", # capital A, acute accent
		"aacute"   => "\x45", # small a, acute accent
		"Acirc"    => "\x62", # capital A, circumflex accent
		"acirc"    => "\x42", # small a, circumflex accent
		"AElig"    => "\x9E", # capital AE diphthong (ligature)
		"aelig"    => "\x9C", # small ae diphthong (ligature)
		"Agrave"   => "\x64", # capital A, grave accent
		"agrave"   => "\x44", # small a, grave accent
		"Aring"    => "\x67", # capital A, ring
		"aring"    => "\x47", # small a, ring
		"Atilde"   => "\x66", # capital A, tilde
		"atilde"   => "\x46", # small a, tilde
		"Auml"     => "\x63", # capital A, dieresis or umlaut mark
		"auml"     => "\x43", # small a, dieresis or umlaut mark
		"Ccedil"   => "\x68", # capital C, cedilla
		"ccedil"   => "\x48", # small c, cedilla
		"Eacute"   => "\x71", # capital E, acute accent
		"eacute"   => "\x51", # small e, acute accent
		"Ecirc"    => "\x72", # capital E, circumflex accent
		"ecirc"    => "\x52", # small e, circumflex accent
		"Egrave"   => "\x74", # capital E, grave accent
		"egrave"   => "\x54", # small e, grave accent
		"ETH"      => "\xAC", # capital Eth, Icelandic
		"eth"      => "\x8C", # small eth, Icelandic
		"Euml"     => "\x73", # capital E, dieresis or umlaut mark
		"euml"     => "\x53", # small e, dieresis or umlaut mark
		"Iacute"   => "\x75", # capital I, acute accent
		"iacute"   => "\x55", # small i, acute accent
		"Icirc"    => "\x76", # capital I, circumflex accent
		"icirc"    => "\x56", # small i, circumflex accent
		"Igrave"   => "\x78", # capital I, grave accent
		"igrave"   => "\x58", # small i, grave accent
		"Iuml"     => "\x77", # capital I, dieresis or umlaut mark
		"iuml"     => "\x57", # small i, dieresis or umlaut mark
		"Ntilde"   => "\x69", # capital N, tilde
		"ntilde"   => "\x49", # small n, tilde
		"Oacute"   => "\xEE", # capital O, acute accent
		"oacute"   => "\xCE", # small o, acute accent
		"Ocirc"    => "\xEB", # capital O, circumflex accent
		"ocirc"    => "\xCB", # small o, circumflex accent
		"Ograve"   => "\xED", # capital O, grave accent
		"ograve"   => "\xCD", # small o, grave accent
		"Oslash"   => "\x80", # capital O, slash
		"oslash"   => "\x70", # small o, slash
		"Otilde"   => "\xEF", # capital O, tilde
		"otilde"   => "\xCF", # small o, tilde
		"Ouml"     => "\xEC", # capital O, dieresis or umlaut mark
		"ouml"     => "\xCC", # small o, dieresis or umlaut mark
		"szlig"    => "\x59", # small sharp s, German (sz ligature)
		"THORN"    => "\xAE", # capital THORN, Icelandic
		"thorn"    => "\x8E", # small thorn, Icelandic
		"Uacute"   => "\xFE", # capital U, acute accent
		"uacute"   => "\xDE", # small u, acute accent
		"Ucirc"    => "\xFB", # capital U, circumflex accent
		"ucirc"    => "\xDB", # small u, circumflex accent
		"Ugrave"   => "\xFD", # capital U, grave accent
		"ugrave"   => "\xDD", # small u, grave accent
		"Uuml"     => "\xFC", # capital U, dieresis or umlaut mark
		"uuml"     => "\xDC", # small u, dieresis or umlaut mark
		"Yacute"   => "\xBA", # capital Y, acute accent
		"yacute"   => "\x8D", # small y, acute accent
		"yuml"     => "\xDF", # small y, dieresis or umlaut mark

		"lchevron" => "\x8A", # left chevron (double less than)
		"rchevron" => "\x8B", # right chevron (double greater than)
	);
627
628	}
69e00e79	629	}
	630
	631	1;