X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=catagits%2FGitalist.git;a=blobdiff_plain;f=local-lib5%2Fman%2Fman3%2FHTML%3A%3ATokeParser.3pm;fp=local-lib5%2Fman%2Fman3%2FHTML%3A%3ATokeParser.3pm;h=13d14393b415df8f19e6eace50b71594fb56b21d;hp=0000000000000000000000000000000000000000;hb=3fea05b9fbf95091f4522528b9980a33e0235603;hpb=af746827daa7a8feccee889e1d12ebc74cc9201e

diff --git a/local-lib5/man/man3/HTML::TokeParser.3pm b/local-lib5/man/man3/HTML::TokeParser.3pm
new file mode 100644
index 0000000..13d1439
--- /dev/null
+++ b/local-lib5/man/man3/HTML::TokeParser.3pm
@@ -0,0 +1,369 @@
+.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.3
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sh \" Subsection heading
+.br
+.if t .Sp
+.ne 5
+.PP
+\fB\\$1\fR
+.PP
+..
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  | will give a
+.\" real vertical bar.  \*(C+ will give a nicer C++.  Capital omega is used to
+.\" do unbreakable dashes and therefore won't be available.  \*(C` and \*(C'
+.\" expand to `' in nroff, nothing in troff, for use with C<>.
+.tr \(*W-|\(bv\*(Tr
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+'br\}
+.\"
+.\" If the F register is turned on, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.if \nF \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    nr % 0
+.    rr F
+.\}
+.\"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.hy 0
+.if n .na
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "HTML::TokeParser 3"
+.TH HTML::TokeParser 3 "2008-04-04" "perl v5.8.7" "User Contributed Perl Documentation"
+.SH "NAME"
+HTML::TokeParser \- Alternative HTML::Parser interface
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 4
+\& require HTML::TokeParser;
+\& $p = HTML::TokeParser\->new("index.html") ||
+\&      die "Can't open: $!";
+\& $p\->empty_element_tags(1);  # configure its behaviour
+.Ve
+.PP
+.Vb 3
+\& while (my $token = $p\->get_token) {
+\&     #...
+\& }
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+The \f(CW\*(C`HTML::TokeParser\*(C'\fR is an alternative interface to the
+\&\f(CW\*(C`HTML::Parser\*(C'\fR class.  It is an \f(CW\*(C`HTML::PullParser\*(C'\fR subclass with a
+predeclared set of token types.  If you wish the tokens to be reported
+differently you probably want to use the \f(CW\*(C`HTML::PullParser\*(C'\fR directly.
+.PP
+The following methods are available:
+.ie n .IP "$p = HTML::TokeParser\->new( $filename\fR, \f(CW%opt );" 4
+.el .IP "$p = HTML::TokeParser\->new( \f(CW$filename\fR, \f(CW%opt\fR );" 4
+.IX Item "$p = HTML::TokeParser->new( $filename, %opt );"
+.PD 0
+.ie n .IP "$p = HTML::TokeParser\->new( $filehandle\fR, \f(CW%opt );" 4
+.el .IP "$p = HTML::TokeParser\->new( \f(CW$filehandle\fR, \f(CW%opt\fR );" 4
+.IX Item "$p = HTML::TokeParser->new( $filehandle, %opt );"
+.ie n .IP "$p = HTML::TokeParser\->new( \e$document, %opt );" 4
+.el .IP "$p = HTML::TokeParser\->new( \e$document, \f(CW%opt\fR );" 4
+.IX Item "$p = HTML::TokeParser->new( $document, %opt );"
+.PD
+The object constructor argument is either a file name, a file handle
+object, or the complete document to be parsed.  Extra options can be
+provided as key/value pairs and are processed as documented by the base
+classes.
+.Sp
+If the argument is a plain scalar, then it is taken as the name of a
+file to be opened and parsed.  If the file can't be opened for
+reading, then the constructor will return \f(CW\*(C`undef\*(C'\fR and $! will tell
+you why it failed.
+.Sp
+If the argument is a reference to a plain scalar, then this scalar is
+taken to be the literal document to parse.  The value of this
+scalar should not be changed before all tokens have been extracted.
+.Sp
+Otherwise the argument is taken to be some object that the
+\&\f(CW\*(C`HTML::TokeParser\*(C'\fR can \fIread()\fR from when it needs more data.  Typically
+it will be a filehandle of some kind.  The stream will be \fIread()\fR until
+\&\s-1EOF\s0, but not closed.
+.Sp
+A newly constructed \f(CW\*(C`HTML::TokeParser\*(C'\fR differs from its base classes
+by having the \f(CW\*(C`unbroken_text\*(C'\fR attribute enabled by default. See
+HTML::Parser for a description of this and other attributes that
+influence how the document is parsed. It is often a good idea to enable
+\&\f(CW\*(C`empty_element_tags\*(C'\fR behaviour.
+.Sp
+Note that the parsing result will likely not be valid if raw undecoded
+\&\s-1UTF\-8\s0 is used as a source.  When parsing \s-1UTF\-8\s0 encoded files turn
+on \s-1UTF\-8\s0 decoding:
+.Sp
+.Vb 3
+\&   open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
+\&   my $p = HTML::TokeParser\->new( $fh );
+\&   # ...
+.Ve
+.Sp
+If a \f(CW$filename\fR is passed to the constructor the file will be opened in
+raw mode and the parsing result will only be valid if its content is
+Latin\-1 or pure \s-1ASCII\s0.
+.Sp
+If parsing from a \s-1UTF\-8\s0 encoded string buffer, decode it first:
+.Sp
+.Vb 3
+\&   utf8::decode($document);
+\&   my $p = HTML::TokeParser\->new( \e$document );
+\&   # ...
+.Ve
+.IP "$p\->get_token" 4
+.IX Item "$p->get_token"
+This method will return the next \fItoken\fR found in the \s-1HTML\s0 document,
+or \f(CW\*(C`undef\*(C'\fR at the end of the document.  The token is returned as an
+array reference.  The first element of the array will be a string
+denoting the type of this token: \*(L"S\*(R" for start tag, \*(L"E\*(R" for end tag,
+\&\*(L"T\*(R" for text, \*(L"C\*(R" for comment, \*(L"D\*(R" for declaration, and \*(L"\s-1PI\s0\*(R" for
+processing instructions.  The rest of the token array depends on the type
+like this:
+.Sp
+.Vb 6
+\&  ["S",  $tag, $attr, $attrseq, $text]
+\&  ["E",  $tag, $text]
+\&  ["T",  $text, $is_data]
+\&  ["C",  $text]
+\&  ["D",  $text]
+\&  ["PI", $token0, $text]
+.Ve
+.Sp
+where \f(CW$attr\fR is a hash reference, \f(CW$attrseq\fR is an array reference and
+the rest are plain scalars.  The \*(L"Argspec\*(R" in HTML::Parser explains the
+details.
+.ie n .IP "$p\->unget_token( @tokens )" 4
+.el .IP "$p\->unget_token( \f(CW@tokens\fR )" 4
+.IX Item "$p->unget_token( @tokens )"
+If you find you have read too many tokens you can push them back,
+so that they are returned the next time \f(CW$p\fR\->get_token is called.
+.IP "$p\->get_tag" 4
+.IX Item "$p->get_tag"
+.PD 0
+.ie n .IP "$p\->get_tag( @tags )" 4
+.el .IP "$p\->get_tag( \f(CW@tags\fR )" 4
+.IX Item "$p->get_tag( @tags )"
+.PD
+This method returns the next start or end tag (skipping any other
+tokens), or \f(CW\*(C`undef\*(C'\fR if there are no more tags in the document.  If
+one or more arguments are given, then we skip tokens until one of the
+specified tag types is found.  For example:
+.Sp
+.Vb 1
+\&   $p\->get_tag("font", "/font");
+.Ve
+.Sp
+will find the next start or end tag for a font\-element.
+.Sp
+The tag information is returned as an array reference in the same form
+as for \f(CW$p\fR\->get_token above, but the type code (first element) is
+missing. A start tag will be returned like this:
+.Sp
+.Vb 1
+\&  [$tag, $attr, $attrseq, $text]
+.Ve
+.Sp
+The tag name of an end tag is prefixed with \*(L"/\*(R", i.e. an end tag is
+returned like this:
+.Sp
+.Vb 1
+\&  ["/$tag", $text]
+.Ve
+.IP "$p\->get_text" 4
+.IX Item "$p->get_text"
+.PD 0
+.ie n .IP "$p\->get_text( @endtags )" 4
+.el .IP "$p\->get_text( \f(CW@endtags\fR )" 4
+.IX Item "$p->get_text( @endtags )"
+.PD
+This method returns all text found at the current position. It will
+return a zero length string if the next token is not text. Any
+entities will be converted to their corresponding character.
+.Sp
+If one or more arguments are given, then we return all text occurring
+before the first of the specified tags found. For example:
+.Sp
+.Vb 1
+\&   $p\->get_text("p", "br");
+.Ve
+.Sp
+will return the text up to either a paragraph or linebreak element.
+.Sp
+The text might span tags that should be \fItextified\fR.  This is
+controlled by the \f(CW$p\fR\->{textify} attribute, which is a hash that
+defines how certain tags can be treated as text.  If the name of a
+start tag matches a key in this hash then this tag is converted to
+text.  The hash value is used to specify which tag attribute to obtain
+the text from.  If this tag attribute is missing, then the upper case
+name of the tag enclosed in brackets is returned, e.g. \*(L"[\s-1IMG\s0]\*(R".  The
+hash value can also be a subroutine reference.  In this case the
+routine is called with the start tag token content as its argument and
+the return value is treated as the text.
+.Sp
+The default \f(CW$p\fR\->{textify} value is:
+.Sp
+.Vb 1
+\&  {img => "alt", applet => "alt"}
+.Ve
+.Sp
+This means that <\s-1IMG\s0> and <\s-1APPLET\s0> tags are treated as text, and that
+the text to substitute can be found in the \s-1ALT\s0 attribute.
+.IP "$p\->get_trimmed_text" 4
+.IX Item "$p->get_trimmed_text"
+.PD 0
+.ie n .IP "$p\->get_trimmed_text( @endtags )" 4
+.el .IP "$p\->get_trimmed_text( \f(CW@endtags\fR )" 4
+.IX Item "$p->get_trimmed_text( @endtags )"
+.PD
+Same as \f(CW$p\fR\->get_text above, but will collapse any sequences of white
+space to a single space character.  Leading and trailing white space is
+removed.
+.IP "$p\->get_phrase" 4
+.IX Item "$p->get_phrase"
+This will return all text found at the current position ignoring any
+phrasal-level tags.  Text is extracted until the first non
+phrasal-level tag.  Textification of tags is the same as for
+\&\fIget_text()\fR.  This method will collapse white space in the same way as
+\&\fIget_trimmed_text()\fR does.
+.Sp
+The definition of \fIphrasal\-level tags\fR is obtained from the
+HTML::Tagset module.
+.SH "EXAMPLES"
+.IX Header "EXAMPLES"
+This example extracts all links from a document.  It will print one
+line for each link, containing the \s-1URL\s0 and the textual description
+between the <A>...</A> tags:
+.PP
+.Vb 2
+\&  use HTML::TokeParser;
+\&  $p = HTML::TokeParser\->new(shift||"index.html");
+.Ve
+.PP
+.Vb 5
+\&  while (my $token = $p\->get_tag("a")) {
+\&      my $url = $token\->[1]{href} || "\-";
+\&      my $text = $p\->get_trimmed_text("/a");
+\&      print "$url\et$text\en";
+\&  }
+.Ve
+.PP
+This example extracts the <\s-1TITLE\s0> from the document:
+.PP
+.Vb 6
+\&  use HTML::TokeParser;
+\&  $p = HTML::TokeParser\->new(shift||"index.html");
+\&  if ($p\->get_tag("title")) {
+\&      my $title = $p\->get_trimmed_text;
+\&      print "Title: $title\en";
+\&  }
+.Ve
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+HTML::PullParser, HTML::Parser
+.SH "COPYRIGHT"
+.IX Header "COPYRIGHT"
+Copyright 1998\-2005 Gisle Aas.
+.PP
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.