Add built local::lib
[catagits/Gitalist.git] / local-lib5 / lib / perl5 / i486-linux-gnu-thread-multi / HTML / TokeParser.pm
CommitLineData
3fea05b9 1package HTML::TokeParser;
2
3require HTML::PullParser;
4@ISA=qw(HTML::PullParser);
5$VERSION = "3.57";
6
7use strict;
8use Carp ();
9use HTML::Entities qw(decode_entities);
10use HTML::Tagset ();
11
12my %ARGS =
13(
14 start => "'S',tagname,attr,attrseq,text",
15 end => "'E',tagname,text",
16 text => "'T',text,is_cdata",
17 process => "'PI',token0,text",
18 comment => "'C',text",
19 declaration => "'D',text",
20
21 # options that default on
22 unbroken_text => 1,
23);
24
25
26sub new
27{
28 my $class = shift;
29 my %cnf;
30 if (@_ == 1) {
31 my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
32 %cnf = ($type => $_[0]);
33 }
34 else {
35 %cnf = @_;
36 }
37
38 my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
39
40 my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
41
42 $self->{textify} = $textify;
43 $self;
44}
45
46
47sub get_tag
48{
49 my $self = shift;
50 my $token;
51 while (1) {
52 $token = $self->get_token || return undef;
53 my $type = shift @$token;
54 next unless $type eq "S" || $type eq "E";
55 substr($token->[0], 0, 0) = "/" if $type eq "E";
56 return $token unless @_;
57 for (@_) {
58 return $token if $token->[0] eq $_;
59 }
60 }
61}
62
63
64sub _textify {
65 my($self, $token) = @_;
66 my $tag = $token->[1];
67 return undef unless exists $self->{textify}{$tag};
68
69 my $alt = $self->{textify}{$tag};
70 my $text;
71 if (ref($alt)) {
72 $text = &$alt(@$token);
73 } else {
74 $text = $token->[2]{$alt || "alt"};
75 $text = "[\U$tag]" unless defined $text;
76 }
77 return $text;
78}
79
80
81sub get_text
82{
83 my $self = shift;
84 my @text;
85 while (my $token = $self->get_token) {
86 my $type = $token->[0];
87 if ($type eq "T") {
88 my $text = $token->[1];
89 decode_entities($text) unless $token->[2];
90 push(@text, $text);
91 } elsif ($type =~ /^[SE]$/) {
92 my $tag = $token->[1];
93 if ($type eq "S") {
94 if (defined(my $text = _textify($self, $token))) {
95 push(@text, $text);
96 next;
97 }
98 } else {
99 $tag = "/$tag";
100 }
101 if (!@_ || grep $_ eq $tag, @_) {
102 $self->unget_token($token);
103 last;
104 }
105 push(@text, " ")
106 if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
107 }
108 }
109 join("", @text);
110}
111
112
113sub get_trimmed_text
114{
115 my $self = shift;
116 my $text = $self->get_text(@_);
117 $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
118 $text;
119}
120
121sub get_phrase {
122 my $self = shift;
123 my @text;
124 while (my $token = $self->get_token) {
125 my $type = $token->[0];
126 if ($type eq "T") {
127 my $text = $token->[1];
128 decode_entities($text) unless $token->[2];
129 push(@text, $text);
130 } elsif ($type =~ /^[SE]$/) {
131 my $tag = $token->[1];
132 if ($type eq "S") {
133 if (defined(my $text = _textify($self, $token))) {
134 push(@text, $text);
135 next;
136 }
137 }
138 if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
139 $self->unget_token($token);
140 last;
141 }
142 push(@text, " ") if $tag eq "br";
143 }
144 }
145 my $text = join("", @text);
146 $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
147 $text;
148}
149
1501;
151
152
153__END__
154
155=head1 NAME
156
157HTML::TokeParser - Alternative HTML::Parser interface
158
159=head1 SYNOPSIS
160
161 require HTML::TokeParser;
162 $p = HTML::TokeParser->new("index.html") ||
163 die "Can't open: $!";
164 $p->empty_element_tags(1); # configure its behaviour
165
166 while (my $token = $p->get_token) {
167 #...
168 }
169
170=head1 DESCRIPTION
171
172The C<HTML::TokeParser> is an alternative interface to the
173C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a
174predeclared set of token types. If you wish the tokens to be reported
175differently you probably want to use the C<HTML::PullParser> directly.
176
177The following methods are available:
178
179=over 4
180
181=item $p = HTML::TokeParser->new( $filename, %opt );
182
183=item $p = HTML::TokeParser->new( $filehandle, %opt );
184
185=item $p = HTML::TokeParser->new( \$document, %opt );
186
187The object constructor argument is either a file name, a file handle
188object, or the complete document to be parsed. Extra options can be
189provided as key/value pairs and are processed as documented by the base
190classes.
191
192If the argument is a plain scalar, then it is taken as the name of a
193file to be opened and parsed. If the file can't be opened for
194reading, then the constructor will return C<undef> and $! will tell
195you why it failed.
196
197If the argument is a reference to a plain scalar, then this scalar is
198taken to be the literal document to parse. The value of this
199scalar should not be changed before all tokens have been extracted.
200
201Otherwise the argument is taken to be some object that the
202C<HTML::TokeParser> can read() from when it needs more data. Typically
203it will be a filehandle of some kind. The stream will be read() until
204EOF, but not closed.
205
206A newly constructed C<HTML::TokeParser> differ from its base classes
207by having the C<unbroken_text> attribute enabled by default. See
208L<HTML::Parser> for a description of this and other attributes that
209influence how the document is parsed. It is often a good idea to enable
210C<empty_element_tags> behaviour.
211
212Note that the parsing result will likely not be valid if raw undecoded
213UTF-8 is used as a source. When parsing UTF-8 encoded files turn
214on UTF-8 decoding:
215
216 open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
217 my $p = HTML::TokeParser->new( $fh );
218 # ...
219
220If a $filename is passed to the constructor the file will be opened in
221raw mode and the parsing result will only be valid if its content is
222Latin-1 or pure ASCII.
223
224If parsing from an UTF-8 encoded string buffer decode it first:
225
226 utf8::decode($document);
227 my $p = HTML::TokeParser->new( \$document );
228 # ...
229
230=item $p->get_token
231
232This method will return the next I<token> found in the HTML document,
233or C<undef> at the end of the document. The token is returned as an
234array reference. The first element of the array will be a string
235denoting the type of this token: "S" for start tag, "E" for end tag,
236"T" for text, "C" for comment, "D" for declaration, and "PI" for
237process instructions. The rest of the token array depend on the type
238like this:
239
240 ["S", $tag, $attr, $attrseq, $text]
241 ["E", $tag, $text]
242 ["T", $text, $is_data]
243 ["C", $text]
244 ["D", $text]
245 ["PI", $token0, $text]
246
247where $attr is a hash reference, $attrseq is an array reference and
248the rest are plain scalars. The L<HTML::Parser/Argspec> explains the
249details.
250
251=item $p->unget_token( @tokens )
252
253If you find you have read too many tokens you can push them back,
254so that they are returned the next time $p->get_token is called.
255
256=item $p->get_tag
257
258=item $p->get_tag( @tags )
259
260This method returns the next start or end tag (skipping any other
261tokens), or C<undef> if there are no more tags in the document. If
262one or more arguments are given, then we skip tokens until one of the
263specified tag types is found. For example:
264
265 $p->get_tag("font", "/font");
266
267will find the next start or end tag for a font-element.
268
269The tag information is returned as an array reference in the same form
270as for $p->get_token above, but the type code (first element) is
271missing. A start tag will be returned like this:
272
273 [$tag, $attr, $attrseq, $text]
274
275The tagname of end tags are prefixed with "/", i.e. end tag is
276returned like this:
277
278 ["/$tag", $text]
279
280=item $p->get_text
281
282=item $p->get_text( @endtags )
283
284This method returns all text found at the current position. It will
285return a zero length string if the next token is not text. Any
286entities will be converted to their corresponding character.
287
288If one or more arguments are given, then we return all text occurring
289before the first of the specified tags found. For example:
290
291 $p->get_text("p", "br");
292
293will return the text up to either a paragraph of linebreak element.
294
295The text might span tags that should be I<textified>. This is
296controlled by the $p->{textify} attribute, which is a hash that
297defines how certain tags can be treated as text. If the name of a
298start tag matches a key in this hash then this tag is converted to
299text. The hash value is used to specify which tag attribute to obtain
300the text from. If this tag attribute is missing, then the upper case
301name of the tag enclosed in brackets is returned, e.g. "[IMG]". The
302hash value can also be a subroutine reference. In this case the
303routine is called with the start tag token content as its argument and
304the return value is treated as the text.
305
306The default $p->{textify} value is:
307
308 {img => "alt", applet => "alt"}
309
310This means that <IMG> and <APPLET> tags are treated as text, and that
311the text to substitute can be found in the ALT attribute.
312
313=item $p->get_trimmed_text
314
315=item $p->get_trimmed_text( @endtags )
316
317Same as $p->get_text above, but will collapse any sequences of white
318space to a single space character. Leading and trailing white space is
319removed.
320
321=item $p->get_phrase
322
323This will return all text found at the current position ignoring any
324phrasal-level tags. Text is extracted until the first non
325phrasal-level tag. Textification of tags is the same as for
326get_text(). This method will collapse white space in the same way as
327get_trimmed_text() does.
328
329The definition of <i>phrasal-level tags</i> is obtained from the
330HTML::Tagset module.
331
332=back
333
334=head1 EXAMPLES
335
336This example extracts all links from a document. It will print one
337line for each link, containing the URL and the textual description
338between the <A>...</A> tags:
339
340 use HTML::TokeParser;
341 $p = HTML::TokeParser->new(shift||"index.html");
342
343 while (my $token = $p->get_tag("a")) {
344 my $url = $token->[1]{href} || "-";
345 my $text = $p->get_trimmed_text("/a");
346 print "$url\t$text\n";
347 }
348
349This example extract the <TITLE> from the document:
350
351 use HTML::TokeParser;
352 $p = HTML::TokeParser->new(shift||"index.html");
353 if ($p->get_tag("title")) {
354 my $title = $p->get_trimmed_text;
355 print "Title: $title\n";
356 }
357
358=head1 SEE ALSO
359
360L<HTML::PullParser>, L<HTML::Parser>
361
362=head1 COPYRIGHT
363
364Copyright 1998-2005 Gisle Aas.
365
366This library is free software; you can redistribute it and/or
367modify it under the same terms as Perl itself.
368
369=cut