From: Gurusamy Sarathy Date: Sat, 22 Jan 2000 12:04:30 +0000 (+0000) Subject: heavy cleanup of Pod::Html bug fixes (from Wolfgang Laun X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=2a28b791c040b17cd26931dd9a159635db5fe244;p=p5sagit%2Fp5-mst-13.2.git heavy cleanup of Pod::Html bug fixes (from Wolfgang Laun ) p4raw-id: //depot/perl@4840 --- diff --git a/lib/Pod/Html.pm b/lib/Pod/Html.pm index 15757ec..1cb5267 100644 --- a/lib/Pod/Html.pm +++ b/lib/Pod/Html.pm @@ -5,7 +5,7 @@ use Getopt::Long; # package for handling command-line parameters use File::Spec::Unix; require Exporter; use vars qw($VERSION); -$VERSION = 1.02; +$VERSION = 1.03; @ISA = Exporter; @EXPORT = qw(pod2html htmlify); use Cwd; @@ -176,10 +176,6 @@ Uses $Config{pod2html} to setup default options. Tom Christiansen, Etchrist@perl.comE. -=head1 BUGS - -Has trouble with C<> etc in = commands. - =head1 SEE ALSO L @@ -216,13 +212,8 @@ my $quiet = 0; # not quiet by default my $verbose = 0; # not verbose by default my $doindex = 1; # non-zero if we should generate an index my $listlevel = 0; # current list depth -my @listitem = (); # stack of HTML commands to use when a =item is - # encountered. the top of the stack is the - # current list. -my @listdata = (); # similar to @listitem, but for the text after - # an =item -my @listend = (); # similar to @listitem, but the text to use to - # end the list. +my @listend = (); # the text to use to end the list. +my $after_lpar = 0; # set to true after a par in an =item my $ignore = 1; # whether or not to format text. we don't # format text until we hit our first pod # directive. @@ -236,11 +227,13 @@ my $top = 1; # true if we are at the top of the doc. used # to prevent the first
directive. my $paragraph; # which paragraph we're processing (used # for error messages) +my $ptQuote = 0; # status of double-quote conversion my %pages = (); # associative array used to find the location # of pages referenced by L<> links. my %sections = (); # sections within this page my %items = (); # associative array used to find the location # of =item directives referenced by C<> links +my %local_items = (); # local items - avoid destruction of %items my $Is83; # is dos with short filenames (8.3) sub init_globals { @@ -263,13 +256,8 @@ $quiet = 0; # not quiet by default $verbose = 0; # not verbose by default $doindex = 1; # non-zero if we should generate an index $listlevel = 0; # current list depth -@listitem = (); # stack of HTML commands to use when a =item is - # encountered. the top of the stack is the - # current list. -@listdata = (); # similar to @listitem, but for the text after - # an =item -@listend = (); # similar to @listitem, but the text to use to - # end the list. +@listend = (); # the text to use to end the list. +$after_lpar = 0; # set to true after a par in an =item $ignore = 1; # whether or not to format text. we don't # format text until we hit our first pod # directive. @@ -291,9 +279,28 @@ $paragraph = ''; # which paragraph we're processing (used # of pages referenced by L<> links. #%items = (); # associative array used to find the location # of =item directives referenced by C<> links +%local_items = (); $Is83=$^O eq 'dos'; } +# +# clean_data: global clean-up of pod data +# +sub clean_data($){ + my( $dataref ) = @_; + my $i; + for( $i = 0; $i <= $#$dataref; $i++ ){ + ${$dataref}[$i] =~ s/\s+\Z//; + + # have a look for all-space lines + if( ${$dataref}[$i] =~ /^\s+$/m ){ + my @chunks = split( /^\s+$/m, ${$dataref}[$i] ); + splice( @$dataref, $i, 1, @chunks ); + } + } +} + + sub pod2html { local(@ARGV) = @_; local($/); @@ -341,6 +348,7 @@ sub pod2html { $/ = ""; my @poddata = ; close(POD); + clean_data( \@poddata ); # scan the pod for =head[1-6] directives and build an index my $index = scan_headings(\%sections, @poddata); @@ -410,12 +418,13 @@ END_OF_HEAD get_cache($dircache, $itemcache, \@podpath, $podroot, $recurse); # scan the pod for =item directives - scan_items("", \%items, @poddata); + scan_items( \%local_items, "", @poddata); # put an index at the top of the file. note, if $doindex is 0 we # still generate an index, but surround it with an html comment. # that way some other program can extract it if desired. $index =~ s/--+/-/g; + print HTML "\n"; print HTML "\n"; print HTML "call emit_C($par) lev=$lev, par with BI=$x\n"; + + $res = emit_C( $text, $lev > 1 || ($par =~ /[BI] - convert to character + $$rstr =~ s/^(\w+)>//; + $res = "&$1;"; + + } elsif( $func eq 'F' ){ + # F - italizice + $res = '' . process_text1( $lev, $rstr ) . ''; + + } elsif( $func eq 'I' ){ + # I - italizice + $res = '' . process_text1( $lev, $rstr ) . ''; + + } elsif( $func eq 'L' ){ + # L - link + ## L => produce text, use cross-ref for linking + ## L => make text from cross-ref + ## need to extract text + my $par = go_ahead( $rstr, 'L' ); + + # some L<>'s that shouldn't be: + # a) full-blown URL's are emitted as-is + if( $par =~ m{^\w+://}s ){ + return make_URL_href( $par ); + } + # b) C<...> is stripped and treated as C<> + if( $par =~ /^C<(.*)>$/ ){ + my $text = depod( $1 ); + return emit_C( $text, $lev > 1 || ($par =~ /[BI] L<$par> to page $page, ident $ident\n"; + + } elsif( $par =~ m{^(.*?)/"?(.*?)"?$} ){ # [name]/"section" + # even though this should be a "section", we go for ident first + ( $page, $ident ) = ( $1, $2 ); + ### print STDERR "--> L<$par> to page $page, section $section\n"; + + } elsif( $par =~ /\s/ ){ # this must be a section with missing quotes + ( $page, $section ) = ( '', $par ); + ### print STDERR "--> L<$par> to void page, section $section\n"; + + } else { + ( $page, $section ) = ( $par, '' ); + ### print STDERR "--> L<$par> to page $par, void section\n"; + } + + # now, either $section or $ident is defined. the convoluted logic + # below tries to resolve L<> according to what the user specified. + # failing this, we try to find the next best thing... + my( $url, $ltext, $fid ); + + RESOLVE: { + if( defined $ident ){ + ## try to resolve $ident as an item + ( $url, $fid ) = coderef( $page, $ident ); + if( $url ){ + if( ! defined( $linktext ) ){ + $linktext = $ident; + $linktext .= " in " if $ident && $page; + $linktext .= "the $page manpage" if $page; + } + ### print STDERR "got coderef url=$url\n"; + last RESOLVE; + } + ## no luck: go for a section (auto-quoting!) + $section = $ident; + } + ## now go for a section + my $htmlsection = htmlify( $section ); + $url = page_sect( $page, $htmlsection ); + if( $url ){ + if( ! defined( $linktext ) ){ + $linktext = $section; + $linktext .= " in " if $section && $page; + $linktext .= "the $page manpage" if $page; + } + ### print STDERR "got page/section url=$url\n"; + last RESOLVE; + } + ## no luck: go for an ident + if( $section ){ + $ident = $section; + } else { + $ident = $page; + $page = undef(); + } + ( $url, $fid ) = coderef( $page, $ident ); + if( $url ){ + if( ! defined( $linktext ) ){ + $linktext = $ident; + $linktext .= " in " if $ident && $page; + $linktext .= "the $page manpage" if $page; + } + ### print STDERR "got section=>coderef url=$url\n"; + last RESOLVE; + } + + # warning; show some text. + $linktext = $opar unless defined $linktext; + warn "$0: $podfile: cannot resolve L<$opar> in paragraph $paragraph."; + } + + # now we have an URL or just plain code + $$rstr = $linktext . '>' . $$rstr; + if( defined( $url ) ){ + $res = "" . process_text1( $lev, $rstr ) . ''; + } else { + $res = '' . process_text1( $lev, $rstr ) . ''; + } + + } elsif( $func eq 'S' ){ + # S - non-breaking spaces + $res = process_text1( $lev, $rstr ); + $res =~ s/ / /g; + + } elsif( $func eq 'X' ){ + # X<> - ignore + $$rstr =~ s/^[^>]*>//; + + } elsif( $func eq 'Z' ){ + # Z<> - empty + warn "$0: $podfile: invalid X<> in paragraph $paragraph." + unless $$rstr =~ s/^>//; + + } else { + while( $$rstr =~ s/\A(.*?)([BCEFILSXZ]<|>)//s ){ + # all others: either recurse into new function or + # terminate at closing angle bracket + my $pt = $1; + $pt .= '>' if $2 eq '>' && $lev == 1; + $res .= $lev == 1 ? pure_text( $pt ) : inIS_text( $pt ); + return $res if $2 eq '>' && $lev > 1; + if( $2 ne '>' ){ + $res .= process_text1( $lev, $rstr, substr($2,0,1) ); + } + + } + if( $lev == 1 ){ + $res .= pure_text( $$rstr ); } else { - $result .= "$rest\n"; - $rest = ""; + warn "$0: $podfile: undelimited $func<> in paragraph $paragraph."; } } - $result .= $rest if $rest; - - # restore the leading and trailing white-space - $result = "$lead$result$trail"; + return $res; +} - return $result; +# +# go_ahead: extract text of an IS (can be nested) +# +sub go_ahead($$){ + my( $rstr, $func ) = @_; + my $res = ''; + my $level = 1; + while( $$rstr =~ s/\A(.*?)([BCEFILSXZ]<|>)//s ){ + $res .= $1; + if( $2 eq '>' ){ + return $res if --$level == 0; + } else { + ++$level; + } + $res .= $2; + } + warn "$0: $podfile: undelimited $func<> in paragraph $paragraph."; + return $res; } # -# pre_escape - convert & in text to $amp; +# emit_C - output result of C +# $text is the depod-ed text # -sub pre_escape { - my($str) = @_; - $$str =~ s/&(?!\w+;|#)/&/g; # XXX not bulletproof +sub emit_C($;$){ + my( $text, $nocode ) = @_; + my $res; + my( $url, $fid ) = coderef( undef(), $text ); + + # need HTML-safe text + my $linktext = html_escape( $text ); + + if( defined( $url ) && + (!defined( $EmittedItem ) || $EmittedItem ne $fid ) ){ + $res = "$linktext"; + } elsif( 0 && $nocode ){ + $res = $linktext; + } else { + $res = "$linktext"; + } + return $res; } # +# html_escape: make text safe for HTML +# +sub html_escape { + my $rest = $_[0]; + $rest =~ s/&/&/g; + $rest =~ s//>/g; + $rest =~ s/"/"/g; + return $rest; +} + + +# # dosify - convert filenames to 8.3 # sub dosify { @@ -1460,54 +1643,24 @@ sub dosify { } # -# process_L - convert a pod L<> directive to a corresponding HTML link. -# most of the links made are inferred rather than known about directly -# (i.e it's not known whether the =head\d section exists in the target file, -# or whether a .pod file exists in the case of split files). however, the -# guessing usually works. +# page_sect - make an URL from the text of a L<> # -# Unlike the other directives, this should be called with an unprocessed -# string, else tags in the link won't be matched. -# -sub process_L { - my($str) = @_; - my($s1, $s2, $linktext, $page, $page83, $section, $link); # work strings - - $str =~ s/\n/ /g; # undo word-wrapped tags - $s1 = $str; - for ($s1) { - # LREF: a la HREF L - $linktext = $1 if s:^([^|]+)\|::; - - # make sure sections start with a / - s,^",/",g; - s,^,/,g if (!m,/, && / /); - - # check if there's a section specified - if (m,^(.*?)/"?(.*?)"?$,) { # yes - ($page, $section) = ($1, $2); - } else { # no - ($page, $section) = ($_, ""); - } - - # check if we know that this is a section in this page - if (!defined $pages{$page} && defined $sections{$page}) { - $section = $page; - $page = ""; - } - - # remove trailing punctuation, like () - $section =~ s/\W*$// ; +sub page_sect($$) { + my( $page, $section ) = @_; + my( $linktext, $page83, $link); # work strings + + # check if we know that this is a section in this page + if (!defined $pages{$page} && defined $sections{$page}) { + $section = $page; + $page = ""; + ### print STDERR "reset page='', section=$section\n"; } $page83=dosify($page); $page=$page83 if (defined $pages{$page83}); if ($page eq "") { - $link = "#" . htmlify(0,$section); - $linktext = $section unless defined($linktext); + $link = "#" . htmlify( $section ); } elsif ( $page =~ /::/ ) { - $linktext = ($section ? "$section" : "$page") - unless defined($linktext); $page =~ s,::,/,g; # Search page cache for an entry keyed under the html page name, # then look to see what directory that page might be in. NOTE: @@ -1529,45 +1682,42 @@ sub process_L { # but A::C is found in lib/A/C.pm, then A::B is assumed to be in # lib/A/B.pm. This is also limited, but it's an improvement. # Maybe a hints file so that the links point to the correct places - # non-theless? - # Also, maybe put a warn "$0: cannot resolve..." here. + # nonetheless? + } $link = "$htmlroot/$page.html"; - $link .= "#" . htmlify(0,$section) if ($section); + $link .= "#" . htmlify( $section ) if ($section); } elsif (!defined $pages{$page}) { - warn "$0: $podfile: cannot resolve L<$str> in paragraph $paragraph: no such page '$page'\n" unless $quiet; $link = ""; - $linktext = $page unless defined($linktext); } else { - $linktext = ($section ? "$section" : "the $page manpage") unless defined($linktext); - $section = htmlify(0,$section) if $section ne ""; + $section = htmlify( $section ) if $section ne ""; + ### print STDERR "...section=$section\n"; # if there is a directory by the name of the page, then assume that an # appropriate section will exist in the subdirectory # if ($section ne "" && $pages{$page} =~ /([^:]*[^(\.pod|\.pm)]):/) { if ($section ne "" && $pages{$page} =~ /([^:]*(?$linktext"; } else { - $s1 = "$linktext"; + return undef(); } - return $s1; } # @@ -1626,110 +1773,63 @@ sub relativize_url { return $rel_path ; } -# -# process_BFI - process any of the B<>, F<>, or I<> pod-escapes and -# convert them to corresponding HTML directives. -# -sub process_BFI { - my($tag, $str) = @_; - my($s1); # work string - my(%repltext) = ( 'B' => 'STRONG', - 'F' => 'EM', - 'I' => 'EM'); - - # extract the modified text and convert to HTML - $s1 = "<$repltext{$tag}>$str"; - return $s1; -} # -# process_C - process the C<> pod-escape. +# coderef - make URL from the text of a C<> # -sub process_C { - my($str, $doref) = @_; - my($s1, $s2); +sub coderef($$){ + my( $page, $item ) = @_; + my( $url ); + + my $fid = fragment_id( $item ); + + if( defined( $page ) ){ + # we have been given a $page... + $page =~ s{::}{/}g; + + # Do we take it? Item could be a section! + my $base = $items{$fid}; + $base =~ s{[^/]*/}{}; + if( $base ne "$page.html" ){ + ### print STDERR "coderef( $page, $item ): items{$fid} = $items{$fid} = $base => discard page!\n"; + $page = undef(); + } - $s1 = $str; - $s1 =~ s/\([^()]*\)//g; # delete parentheses - $s2 = $s1; - $s1 =~ s/\W//g; # delete bogus characters - $str = html_escape($str); + } else { + # no page - local items precede cached items + if( exists $local_items{$fid} ){ + $page = $local_items{$fid}; + } else { + $page = $items{$fid}; + } + } # if there was a pod file that we found earlier with an appropriate # =item directive, then create a link to that page. - if ($doref && defined $items{$s1}) { - if ( $items{$s1} ) { - my $link = "$htmlroot/$items{$s1}#item_" . htmlify(0,$s2) ; - # Here, we take advantage of the knowledge that $htmlfileurl ne '' - # implies $htmlroot eq ''. - my $url ; - if ( $htmlfileurl ne '' ) { - $link = "$htmldir$link" ; - $url = relativize_url( $link, $htmlfileurl ) ; - } - else { - $url = $link ; + if( defined $page ){ + if( $page ){ + if( $pages{$page} =~ /([^:.]*)\.[^:]*:/){ + $page = $1 . '.html'; } - $s1 = "$str" ; - } - else { - $s1 = "$str" ; - } - $s1 =~ s,(perl\w+/(\S+)\.html)#item_\2\b,$1,; - confess "s1 has space: $s1" if $s1 =~ /HREF="[^"]*\s[^"]*"/; - } else { - $s1 = "$str"; - # warn "$0: $podfile: cannot resolve C<$str> in paragraph $paragraph\n" if $verbose - } - - - return $s1; -} - -# -# process_E - process the E<> pod directive which seems to escape a character. -# -sub process_E { - my($str) = @_; - - for ($str) { - s,([^/].*),\&$1\;,g; - } - - return $str; -} - -# -# process_Z - process the Z<> pod directive which really just amounts to -# ignoring it. this allows someone to start a paragraph with an = -# -sub process_Z { - my($str) = @_; - - # there is no equivalent in HTML for this so just ignore it. - $str = ""; - return $str; -} + my $link = "$htmlroot/$page#item_$fid"; -# -# process_S - process the S<> pod directive which means to convert all -# spaces in the string to non-breaking spaces (in HTML-eze). -# -sub process_S { - my($str) = @_; + # Here, we take advantage of the knowledge that $htmlfileurl + # ne '' implies $htmlroot eq ''. + if ( $htmlfileurl ne '' ) { + $link = "$htmldir$link" ; + $url = relativize_url( $link, $htmlfileurl ) ; + } else { + $url = $link ; + } + } else { + $url = "#item_" . $fid; + } - # convert all spaces in the text to non-breaking spaces in HTML. - $str =~ s/ / /g; - return $str; + confess "url has space: $url" if $url =~ /"[^"]*\s[^"]*"/; + } + return( $url, $fid ); } -# -# process_X - this is supposed to make an index entry. we'll just -# ignore it. -# -sub process_X { - return ''; -} # @@ -1757,29 +1857,126 @@ sub finish_list { # # htmlify - converts a pod section specification to a suitable section -# specification for HTML. if first arg is 1, only takes 1st word. +# specification for HTML. Note that we keep spaces and special characters +# except ", ? (Netscape problem) and the hyphen (writer's problem...). # sub htmlify { - my($compact, $heading) = @_; + my( $heading) = @_; + $heading =~ s/(\s+)/ /g; + $heading =~ s/\s+\Z//; + $heading =~ s/\A\s+//; + # The hyphen is a disgrace to the English language. + $heading =~ s/[-"?]//g; + $heading = lc( $heading ); + return $heading; +} - if ($compact) { - $heading =~ /^(\w+)/; - $heading = $1; - } +# +# depod - convert text by eliminating all interior sequences +# Note: can be called with copy or modify semantics +# +my %E2c; +$E2c{lt} = '<'; +$E2c{gt} = '>'; +$E2c{sol} = '/'; +$E2c{verbar} = '|'; + +sub depod($){ + my $string; + if( ref( $_[0] ) ){ + $string = ${$_[0]}; + ${$_[0]} = depod1( \$string ); + } else { + $string = $_[0]; + depod1( \$string ); + } +} - # $heading = lc($heading); - $heading =~ s/[^\w\s]/_/g; - $heading =~ s/(\s+)/ /g; - $heading =~ s/^\s*(.*?)\s*$/$1/s; - $heading =~ s/ /_/g; - $heading =~ s/\A(.{32}).*\Z/$1/s; - $heading =~ s/\s+\Z//; - $heading =~ s/_{2,}/_/g; +sub depod1($;$){ + my( $rstr, $func ) = @_; + my $res = ''; + if( ! defined( $func ) ){ + # skip to next begin of an interior sequence + while( $$rstr =~ s/\A(.*?)([BCEFILSXZ]) - convert to character + $$rstr =~ s/^(\w+)>//; + $res .= $E2c{$1}; + } elsif( $func eq 'X' ){ + # X<> - ignore + $$rstr =~ s/^[^>]*>//; + } elsif( $func eq 'Z' ){ + # Z<> - empty + $$rstr =~ s/^>//; + } else { + # all others: either recurse into new function or + # terminate at closing angle bracket + while( $$rstr =~ s/\A(.*?)([BCEFILSXZ]<|>)// ){ + $res .= $1; + last if $2 eq '>'; + $res .= depod1( $rstr, substr($2,0,1) ); + } + ## If we're here and $2 ne '>': undelimited interior sequence. + ## Ignored, as this is called without proper indication of where we are. + ## Rely on process_text to produce diagnostics. + } + return $res; +} - return $heading; +# +# fragment_id - construct a fragment identifier from: +# a) =item text +# b) contents of C<...> +# +my @hc; +sub fragment_id { + my $text = shift(); + $text =~ s/\s+\Z//s; + if( $text ){ + # a method or function? + return $1 if $text =~ /(\w+)\s*\(/; + return $1 if $text =~ /->\s*(\w+)\s*\(?/; + + # a variable name? + return $1 if $text =~ /^([$@%*]\S+)/; + + # some pattern matching operator? + return $1 if $text =~ m|^(\w+/).*/\w*$|; + + # fancy stuff... like "do { }" + return $1 if $text =~ m|^(\w+)\s*{.*}$|; + + # honour the perlfunc manpage: func [PAR[,[ ]PAR]...] + # and some funnies with ... Module ... + return $1 if $text =~ m{^([a-z\d]+)(\s+[A-Z\d,/& ]+)?$}; + return $1 if $text =~ m{^([a-z\d]+)\s+Module(\s+[A-Z\d,/& ]+)?$}; + + # text? normalize! + $text =~ s/\s+/_/sg; + $text =~ s{(\W)}{ + defined( $hc[ord($1)] ) ? $hc[ord($1)] + : ( $hc[ord($1)] = sprintf( "%%%02X", ord($1) ) ) }gxe; + $text = substr( $text, 0, 50 ); + } else { + return undef(); + } } -BEGIN { +# +# make_URL_href - generate HTML href from URL +# Special treatment for CGI queries. +# +sub make_URL_href($){ + my( $url ) = @_; + if( $url !~ + s{^(http:[-\w/#~:.+=&%@!]+)(\?.*)?$}{$1}i ){ + $url = "$url"; + } + return $url; } 1;