X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=catagits%2FGitalist.git;a=blobdiff_plain;f=local-lib5%2Fman%2Fman3%2FHTML%3A%3ATokeParser.3pm;fp=local-lib5%2Fman%2Fman3%2FHTML%3A%3ATokeParser.3pm;h=13d14393b415df8f19e6eace50b71594fb56b21d;hp=0000000000000000000000000000000000000000;hb=3fea05b9fbf95091f4522528b9980a33e0235603;hpb=af746827daa7a8feccee889e1d12ebc74cc9201e diff --git a/local-lib5/man/man3/HTML::TokeParser.3pm b/local-lib5/man/man3/HTML::TokeParser.3pm new file mode 100644 index 0000000..13d1439 --- /dev/null +++ b/local-lib5/man/man3/HTML::TokeParser.3pm @@ -0,0 +1,369 @@ +.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.3 +.\" +.\" Standard preamble: +.\" ======================================================================== +.de Sh \" Subsection heading +.br +.if t .Sp +.ne 5 +.PP +\fB\\$1\fR +.PP +.. +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Vb \" Begin verbatim text +.ft CW +.nf +.ne \\$1 +.. +.de Ve \" End verbatim text +.ft R +.fi +.. +.\" Set up some character translations and predefined strings. \*(-- will +.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left +.\" double quote, and \*(R" will give a right double quote. | will give a +.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to +.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' +.\" expand to `' in nroff, nothing in troff, for use with C<>. +.tr \(*W-|\(bv\*(Tr +.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' +.ie n \{\ +. ds -- \(*W- +. ds PI pi +. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch +. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch +. ds L" "" +. ds R" "" +. ds C` "" +. ds C' "" +'br\} +.el\{\ +. ds -- \|\(em\| +. ds PI \(*p +. ds L" `` +. 
ds R" '' +'br\} +.\" +.\" If the F register is turned on, we'll generate index entries on stderr for +.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index +.\" entries marked with X<> in POD. Of course, you'll have to process the +.\" output yourself in some meaningful fashion. +.if \nF \{\ +. de IX +. tm Index:\\$1\t\\n%\t"\\$2" +.. +. nr % 0 +. rr F +.\} +.\" +.\" For nroff, turn off justification. Always turn off hyphenation; it makes +.\" way too many mistakes in technical documents. +.hy 0 +.if n .na +.\" +.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). +.\" Fear. Run. Save yourself. No user-serviceable parts. +. \" fudge factors for nroff and troff +.if n \{\ +. ds #H 0 +. ds #V .8m +. ds #F .3m +. ds #[ \f1 +. ds #] \fP +.\} +.if t \{\ +. ds #H ((1u-(\\\\n(.fu%2u))*.13m) +. ds #V .6m +. ds #F 0 +. ds #[ \& +. ds #] \& +.\} +. \" simple accents for nroff and troff +.if n \{\ +. ds ' \& +. ds ` \& +. ds ^ \& +. ds , \& +. ds ~ ~ +. ds / +.\} +.if t \{\ +. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" +. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' +. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' +. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' +. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' +. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' +.\} +. \" troff and (daisy-wheel) nroff accents +.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' +.ds 8 \h'\*(#H'\(*b\h'-\*(#H' +.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] +.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' +.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' +.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] +.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] +.ds ae a\h'-(\w'a'u*4/10)'e +.ds Ae A\h'-(\w'A'u*4/10)'E +. 
\" corrections for vroff +.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' +.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' +. \" for low resolution devices (crt and lpr) +.if \n(.H>23 .if \n(.V>19 \ +\{\ +. ds : e +. ds 8 ss +. ds o a +. ds d- d\h'-1'\(ga +. ds D- D\h'-1'\(hy +. ds th \o'bp' +. ds Th \o'LP' +. ds ae ae +. ds Ae AE +.\} +.rm #[ #] #H #V #F C +.\" ======================================================================== +.\" +.IX Title "HTML::TokeParser 3" +.TH HTML::TokeParser 3 "2008-04-04" "perl v5.8.7" "User Contributed Perl Documentation" +.SH "NAME" +HTML::TokeParser \- Alternative HTML::Parser interface +.SH "SYNOPSIS" +.IX Header "SYNOPSIS" +.Vb 4 +\& require HTML::TokeParser; +\& $p = HTML::TokeParser\->new("index.html") || +\& die "Can't open: $!"; +\& $p\->empty_element_tags(1); # configure its behaviour +.Ve +.PP +.Vb 3 +\& while (my $token = $p\->get_token) { +\& #... +\& } +.Ve +.SH "DESCRIPTION" +.IX Header "DESCRIPTION" +The \f(CW\*(C`HTML::TokeParser\*(C'\fR is an alternative interface to the +\&\f(CW\*(C`HTML::Parser\*(C'\fR class. It is an \f(CW\*(C`HTML::PullParser\*(C'\fR subclass with a +predeclared set of token types. If you wish the tokens to be reported +differently you probably want to use the \f(CW\*(C`HTML::PullParser\*(C'\fR directly. 
+.PP +The following methods are available: +.ie n .IP "$p = HTML::TokeParser\->new( $filename\fR, \f(CW%opt );" 4 +.el .IP "$p = HTML::TokeParser\->new( \f(CW$filename\fR, \f(CW%opt\fR );" 4 +.IX Item "$p = HTML::TokeParser->new( $filename, %opt );" +.PD 0 +.ie n .IP "$p = HTML::TokeParser\->new( $filehandle\fR, \f(CW%opt );" 4 +.el .IP "$p = HTML::TokeParser\->new( \f(CW$filehandle\fR, \f(CW%opt\fR );" 4 +.IX Item "$p = HTML::TokeParser->new( $filehandle, %opt );" +.ie n .IP "$p = HTML::TokeParser\->new( \e$document, %opt );" 4 +.el .IP "$p = HTML::TokeParser\->new( \e$document, \f(CW%opt\fR );" 4 +.IX Item "$p = HTML::TokeParser->new( $document, %opt );" +.PD +The object constructor argument is either a file name, a file handle +object, or the complete document to be parsed. Extra options can be +provided as key/value pairs and are processed as documented by the base +classes. +.Sp +If the argument is a plain scalar, then it is taken as the name of a +file to be opened and parsed. If the file can't be opened for +reading, then the constructor will return \f(CW\*(C`undef\*(C'\fR and $! will tell +you why it failed. +.Sp +If the argument is a reference to a plain scalar, then this scalar is +taken to be the literal document to parse. The value of this +scalar should not be changed before all tokens have been extracted. +.Sp +Otherwise the argument is taken to be some object that the +\&\f(CW\*(C`HTML::TokeParser\*(C'\fR can \fIread()\fR from when it needs more data. Typically +it will be a filehandle of some kind. The stream will be \fIread()\fR until +\&\s-1EOF\s0, but not closed. +.Sp +A newly constructed \f(CW\*(C`HTML::TokeParser\*(C'\fR differs from its base classes +by having the \f(CW\*(C`unbroken_text\*(C'\fR attribute enabled by default. See +HTML::Parser for a description of this and other attributes that +influence how the document is parsed. It is often a good idea to enable +\&\f(CW\*(C`empty_element_tags\*(C'\fR behaviour. 
+.Sp +Note that the parsing result will likely not be valid if raw undecoded +\&\s-1UTF\-8\s0 is used as a source. When parsing \s-1UTF\-8\s0 encoded files turn +on \s-1UTF\-8\s0 decoding: +.Sp +.Vb 3 +\& open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!"; +\& my $p = HTML::TokeParser\->new( $fh ); +\& # ... +.Ve +.Sp +If a \f(CW$filename\fR is passed to the constructor the file will be opened in +raw mode and the parsing result will only be valid if its content is +Latin\-1 or pure \s-1ASCII\s0. +.Sp +If parsing from a \s-1UTF\-8\s0 encoded string buffer decode it first: +.Sp +.Vb 3 +\& utf8::decode($document); +\& my $p = HTML::TokeParser\->new( \e$document ); +\& # ... +.Ve +.IP "$p\->get_token" 4 +.IX Item "$p->get_token" +This method will return the next \fItoken\fR found in the \s-1HTML\s0 document, +or \f(CW\*(C`undef\*(C'\fR at the end of the document. The token is returned as an +array reference. The first element of the array will be a string +denoting the type of this token: \*(L"S\*(R" for start tag, \*(L"E\*(R" for end tag, +\&\*(L"T\*(R" for text, \*(L"C\*(R" for comment, \*(L"D\*(R" for declaration, and \*(L"\s-1PI\s0\*(R" for +processing instructions. The rest of the token array depends on the type +like this: +.Sp +.Vb 6 +\& ["S", $tag, $attr, $attrseq, $text] +\& ["E", $tag, $text] +\& ["T", $text, $is_data] +\& ["C", $text] +\& ["D", $text] +\& ["PI", $token0, $text] +.Ve +.Sp +where \f(CW$attr\fR is a hash reference, \f(CW$attrseq\fR is an array reference and +the rest are plain scalars. The \*(L"Argspec\*(R" in HTML::Parser explains the +details. +.ie n .IP "$p\->unget_token( @tokens )" 4 +.el .IP "$p\->unget_token( \f(CW@tokens\fR )" 4 +.IX Item "$p->unget_token( @tokens )" +If you find you have read too many tokens you can push them back, +so that they are returned the next time \f(CW$p\fR\->get_token is called. 
+.IP "$p\->get_tag" 4 +.IX Item "$p->get_tag" +.PD 0 +.ie n .IP "$p\->get_tag( @tags )" 4 +.el .IP "$p\->get_tag( \f(CW@tags\fR )" 4 +.IX Item "$p->get_tag( @tags )" +.PD +This method returns the next start or end tag (skipping any other +tokens), or \f(CW\*(C`undef\*(C'\fR if there are no more tags in the document. If +one or more arguments are given, then we skip tokens until one of the +specified tag types is found. For example: +.Sp +.Vb 1 +\& $p\->get_tag("font", "/font"); +.Ve +.Sp +will find the next start or end tag for a font\-element. +.Sp +The tag information is returned as an array reference in the same form +as for \f(CW$p\fR\->get_token above, but the type code (first element) is +missing. A start tag will be returned like this: +.Sp +.Vb 1 +\& [$tag, $attr, $attrseq, $text] +.Ve +.Sp +The tag names of end tags are prefixed with \*(L"/\*(R", i.e. an end tag is +returned like this: +.Sp +.Vb 1 +\& ["/$tag", $text] +.Ve +.IP "$p\->get_text" 4 +.IX Item "$p->get_text" +.PD 0 +.ie n .IP "$p\->get_text( @endtags )" 4 +.el .IP "$p\->get_text( \f(CW@endtags\fR )" 4 +.IX Item "$p->get_text( @endtags )" +.PD +This method returns all text found at the current position. It will +return a zero length string if the next token is not text. Any +entities will be converted to their corresponding character. +.Sp +If one or more arguments are given, then we return all text occurring +before the first of the specified tags found. For example: +.Sp +.Vb 1 +\& $p\->get_text("p", "br"); +.Ve +.Sp +will return the text up to either a paragraph or linebreak element. +.Sp +The text might span tags that should be \fItextified\fR. This is +controlled by the \f(CW$p\fR\->{textify} attribute, which is a hash that +defines how certain tags can be treated as text. If the name of a +start tag matches a key in this hash then this tag is converted to +text. The hash value is used to specify which tag attribute to obtain +the text from. 
If this tag attribute is missing, then the upper case +name of the tag enclosed in brackets is returned, e.g. \*(L"[\s-1IMG\s0]\*(R". The +hash value can also be a subroutine reference. In this case the +routine is called with the start tag token content as its argument and +the return value is treated as the text. +.Sp +The default \f(CW$p\fR\->{textify} value is: +.Sp +.Vb 1 +\& {img => "alt", applet => "alt"} +.Ve +.Sp +This means that <\s-1IMG\s0> and <\s-1APPLET\s0> tags are treated as text, and that +the text to substitute can be found in the \s-1ALT\s0 attribute. +.IP "$p\->get_trimmed_text" 4 +.IX Item "$p->get_trimmed_text" +.PD 0 +.ie n .IP "$p\->get_trimmed_text( @endtags )" 4 +.el .IP "$p\->get_trimmed_text( \f(CW@endtags\fR )" 4 +.IX Item "$p->get_trimmed_text( @endtags )" +.PD +Same as \f(CW$p\fR\->get_text above, but will collapse any sequences of white +space to a single space character. Leading and trailing white space is +removed. +.IP "$p\->get_phrase" 4 +.IX Item "$p->get_phrase" +This will return all text found at the current position ignoring any +phrasal-level tags. Text is extracted until the first non +phrasal-level tag. Textification of tags is the same as for +\&\fIget_text()\fR. This method will collapse white space in the same way as +\&\fIget_trimmed_text()\fR does. +.Sp +The definition of phrasal\-level tags is obtained from the +HTML::Tagset module. +.SH "EXAMPLES" +.IX Header "EXAMPLES" +This example extracts all links from a document. It will print one +line for each link, containing the \s-1URL\s0 and the textual description +between the <A>...</A>
tags: +.PP +.Vb 2 +\& use HTML::TokeParser; +\& $p = HTML::TokeParser\->new(shift||"index.html"); +.Ve +.PP +.Vb 5 +\& while (my $token = $p\->get_tag("a")) { +\& my $url = $token\->[1]{href} || "\-"; +\& my $text = $p\->get_trimmed_text("/a"); +\& print "$url\et$text\en"; +\& } +.Ve +.PP +This example extracts the <\s-1TITLE\s0> from the document: +.PP +.Vb 6 +\& use HTML::TokeParser; +\& $p = HTML::TokeParser\->new(shift||"index.html"); +\& if ($p\->get_tag("title")) { +\& my $title = $p\->get_trimmed_text; +\& print "Title: $title\en"; +\& } +.Ve +.SH "SEE ALSO" +.IX Header "SEE ALSO" +HTML::PullParser, HTML::Parser +.SH "COPYRIGHT" +.IX Header "COPYRIGHT" +Copyright 1998\-2005 Gisle Aas. +.PP +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself.