Commit | Line | Data |
3fea05b9 |
1 | package HTML::TokeParser; |
2 | |
3 | require HTML::PullParser; |
4 | @ISA=qw(HTML::PullParser); |
5 | $VERSION = "3.57"; |
6 | |
7 | use strict; |
8 | use Carp (); |
9 | use HTML::Entities qw(decode_entities); |
10 | use HTML::Tagset (); |
11 | |
12 | my %ARGS = |
13 | ( |
14 | start => "'S',tagname,attr,attrseq,text", |
15 | end => "'E',tagname,text", |
16 | text => "'T',text,is_cdata", |
17 | process => "'PI',token0,text", |
18 | comment => "'C',text", |
19 | declaration => "'D',text", |
20 | |
21 | # options that default on |
22 | unbroken_text => 1, |
23 | ); |
24 | |
25 | |
26 | sub new |
27 | { |
28 | my $class = shift; |
29 | my %cnf; |
30 | if (@_ == 1) { |
31 | my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; |
32 | %cnf = ($type => $_[0]); |
33 | } |
34 | else { |
35 | %cnf = @_; |
36 | } |
37 | |
38 | my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; |
39 | |
40 | my $self = $class->SUPER::new(%cnf, %ARGS) || return undef; |
41 | |
42 | $self->{textify} = $textify; |
43 | $self; |
44 | } |
45 | |
46 | |
47 | sub get_tag |
48 | { |
49 | my $self = shift; |
50 | my $token; |
51 | while (1) { |
52 | $token = $self->get_token || return undef; |
53 | my $type = shift @$token; |
54 | next unless $type eq "S" || $type eq "E"; |
55 | substr($token->[0], 0, 0) = "/" if $type eq "E"; |
56 | return $token unless @_; |
57 | for (@_) { |
58 | return $token if $token->[0] eq $_; |
59 | } |
60 | } |
61 | } |
62 | |
63 | |
64 | sub _textify { |
65 | my($self, $token) = @_; |
66 | my $tag = $token->[1]; |
67 | return undef unless exists $self->{textify}{$tag}; |
68 | |
69 | my $alt = $self->{textify}{$tag}; |
70 | my $text; |
71 | if (ref($alt)) { |
72 | $text = &$alt(@$token); |
73 | } else { |
74 | $text = $token->[2]{$alt || "alt"}; |
75 | $text = "[\U$tag]" unless defined $text; |
76 | } |
77 | return $text; |
78 | } |
79 | |
80 | |
81 | sub get_text |
82 | { |
83 | my $self = shift; |
84 | my @text; |
85 | while (my $token = $self->get_token) { |
86 | my $type = $token->[0]; |
87 | if ($type eq "T") { |
88 | my $text = $token->[1]; |
89 | decode_entities($text) unless $token->[2]; |
90 | push(@text, $text); |
91 | } elsif ($type =~ /^[SE]$/) { |
92 | my $tag = $token->[1]; |
93 | if ($type eq "S") { |
94 | if (defined(my $text = _textify($self, $token))) { |
95 | push(@text, $text); |
96 | next; |
97 | } |
98 | } else { |
99 | $tag = "/$tag"; |
100 | } |
101 | if (!@_ || grep $_ eq $tag, @_) { |
102 | $self->unget_token($token); |
103 | last; |
104 | } |
105 | push(@text, " ") |
106 | if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]}; |
107 | } |
108 | } |
109 | join("", @text); |
110 | } |
111 | |
112 | |
113 | sub get_trimmed_text |
114 | { |
115 | my $self = shift; |
116 | my $text = $self->get_text(@_); |
117 | $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; |
118 | $text; |
119 | } |
120 | |
121 | sub get_phrase { |
122 | my $self = shift; |
123 | my @text; |
124 | while (my $token = $self->get_token) { |
125 | my $type = $token->[0]; |
126 | if ($type eq "T") { |
127 | my $text = $token->[1]; |
128 | decode_entities($text) unless $token->[2]; |
129 | push(@text, $text); |
130 | } elsif ($type =~ /^[SE]$/) { |
131 | my $tag = $token->[1]; |
132 | if ($type eq "S") { |
133 | if (defined(my $text = _textify($self, $token))) { |
134 | push(@text, $text); |
135 | next; |
136 | } |
137 | } |
138 | if (!$HTML::Tagset::isPhraseMarkup{$tag}) { |
139 | $self->unget_token($token); |
140 | last; |
141 | } |
142 | push(@text, " ") if $tag eq "br"; |
143 | } |
144 | } |
145 | my $text = join("", @text); |
146 | $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; |
147 | $text; |
148 | } |
149 | |
150 | 1; |
151 | |
152 | |
153 | __END__ |
154 | |
155 | =head1 NAME |
156 | |
157 | HTML::TokeParser - Alternative HTML::Parser interface |
158 | |
159 | =head1 SYNOPSIS |
160 | |
161 | require HTML::TokeParser; |
162 | $p = HTML::TokeParser->new("index.html") || |
163 | die "Can't open: $!"; |
164 | $p->empty_element_tags(1); # configure its behaviour |
165 | |
166 | while (my $token = $p->get_token) { |
167 | #... |
168 | } |
169 | |
170 | =head1 DESCRIPTION |
171 | |
172 | The C<HTML::TokeParser> is an alternative interface to the |
173 | C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a |
174 | predeclared set of token types. If you wish the tokens to be reported |
175 | differently you probably want to use the C<HTML::PullParser> directly. |
176 | |
177 | The following methods are available: |
178 | |
179 | =over 4 |
180 | |
181 | =item $p = HTML::TokeParser->new( $filename, %opt ); |
182 | |
183 | =item $p = HTML::TokeParser->new( $filehandle, %opt ); |
184 | |
185 | =item $p = HTML::TokeParser->new( \$document, %opt ); |
186 | |
187 | The object constructor argument is either a file name, a file handle |
188 | object, or the complete document to be parsed. Extra options can be |
189 | provided as key/value pairs and are processed as documented by the base |
190 | classes. |
191 | |
192 | If the argument is a plain scalar, then it is taken as the name of a |
193 | file to be opened and parsed. If the file can't be opened for |
194 | reading, then the constructor will return C<undef> and $! will tell |
195 | you why it failed. |
196 | |
197 | If the argument is a reference to a plain scalar, then this scalar is |
198 | taken to be the literal document to parse. The value of this |
199 | scalar should not be changed before all tokens have been extracted. |
200 | |
201 | Otherwise the argument is taken to be some object that the |
202 | C<HTML::TokeParser> can read() from when it needs more data. Typically |
203 | it will be a filehandle of some kind. The stream will be read() until |
204 | EOF, but not closed. |
205 | |
206 | A newly constructed C<HTML::TokeParser> differ from its base classes |
207 | by having the C<unbroken_text> attribute enabled by default. See |
208 | L<HTML::Parser> for a description of this and other attributes that |
209 | influence how the document is parsed. It is often a good idea to enable |
210 | C<empty_element_tags> behaviour. |
211 | |
212 | Note that the parsing result will likely not be valid if raw undecoded |
213 | UTF-8 is used as a source. When parsing UTF-8 encoded files turn |
214 | on UTF-8 decoding: |
215 | |
216 | open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!"; |
217 | my $p = HTML::TokeParser->new( $fh ); |
218 | # ... |
219 | |
220 | If a $filename is passed to the constructor the file will be opened in |
221 | raw mode and the parsing result will only be valid if its content is |
222 | Latin-1 or pure ASCII. |
223 | |
224 | If parsing from an UTF-8 encoded string buffer decode it first: |
225 | |
226 | utf8::decode($document); |
227 | my $p = HTML::TokeParser->new( \$document ); |
228 | # ... |
229 | |
230 | =item $p->get_token |
231 | |
232 | This method will return the next I<token> found in the HTML document, |
233 | or C<undef> at the end of the document. The token is returned as an |
234 | array reference. The first element of the array will be a string |
235 | denoting the type of this token: "S" for start tag, "E" for end tag, |
236 | "T" for text, "C" for comment, "D" for declaration, and "PI" for |
237 | process instructions. The rest of the token array depend on the type |
238 | like this: |
239 | |
240 | ["S", $tag, $attr, $attrseq, $text] |
241 | ["E", $tag, $text] |
242 | ["T", $text, $is_data] |
243 | ["C", $text] |
244 | ["D", $text] |
245 | ["PI", $token0, $text] |
246 | |
247 | where $attr is a hash reference, $attrseq is an array reference and |
248 | the rest are plain scalars. The L<HTML::Parser/Argspec> explains the |
249 | details. |
250 | |
251 | =item $p->unget_token( @tokens ) |
252 | |
253 | If you find you have read too many tokens you can push them back, |
254 | so that they are returned the next time $p->get_token is called. |
255 | |
256 | =item $p->get_tag |
257 | |
258 | =item $p->get_tag( @tags ) |
259 | |
260 | This method returns the next start or end tag (skipping any other |
261 | tokens), or C<undef> if there are no more tags in the document. If |
262 | one or more arguments are given, then we skip tokens until one of the |
263 | specified tag types is found. For example: |
264 | |
265 | $p->get_tag("font", "/font"); |
266 | |
267 | will find the next start or end tag for a font-element. |
268 | |
269 | The tag information is returned as an array reference in the same form |
270 | as for $p->get_token above, but the type code (first element) is |
271 | missing. A start tag will be returned like this: |
272 | |
273 | [$tag, $attr, $attrseq, $text] |
274 | |
275 | The tagname of end tags are prefixed with "/", i.e. end tag is |
276 | returned like this: |
277 | |
278 | ["/$tag", $text] |
279 | |
280 | =item $p->get_text |
281 | |
282 | =item $p->get_text( @endtags ) |
283 | |
284 | This method returns all text found at the current position. It will |
285 | return a zero length string if the next token is not text. Any |
286 | entities will be converted to their corresponding character. |
287 | |
288 | If one or more arguments are given, then we return all text occurring |
289 | before the first of the specified tags found. For example: |
290 | |
291 | $p->get_text("p", "br"); |
292 | |
293 | will return the text up to either a paragraph of linebreak element. |
294 | |
295 | The text might span tags that should be I<textified>. This is |
296 | controlled by the $p->{textify} attribute, which is a hash that |
297 | defines how certain tags can be treated as text. If the name of a |
298 | start tag matches a key in this hash then this tag is converted to |
299 | text. The hash value is used to specify which tag attribute to obtain |
300 | the text from. If this tag attribute is missing, then the upper case |
301 | name of the tag enclosed in brackets is returned, e.g. "[IMG]". The |
302 | hash value can also be a subroutine reference. In this case the |
303 | routine is called with the start tag token content as its argument and |
304 | the return value is treated as the text. |
305 | |
306 | The default $p->{textify} value is: |
307 | |
308 | {img => "alt", applet => "alt"} |
309 | |
310 | This means that <IMG> and <APPLET> tags are treated as text, and that |
311 | the text to substitute can be found in the ALT attribute. |
312 | |
313 | =item $p->get_trimmed_text |
314 | |
315 | =item $p->get_trimmed_text( @endtags ) |
316 | |
317 | Same as $p->get_text above, but will collapse any sequences of white |
318 | space to a single space character. Leading and trailing white space is |
319 | removed. |
320 | |
321 | =item $p->get_phrase |
322 | |
323 | This will return all text found at the current position ignoring any |
324 | phrasal-level tags. Text is extracted until the first non |
325 | phrasal-level tag. Textification of tags is the same as for |
326 | get_text(). This method will collapse white space in the same way as |
327 | get_trimmed_text() does. |
328 | |
329 | The definition of <i>phrasal-level tags</i> is obtained from the |
330 | HTML::Tagset module. |
331 | |
332 | =back |
333 | |
334 | =head1 EXAMPLES |
335 | |
336 | This example extracts all links from a document. It will print one |
337 | line for each link, containing the URL and the textual description |
338 | between the <A>...</A> tags: |
339 | |
340 | use HTML::TokeParser; |
341 | $p = HTML::TokeParser->new(shift||"index.html"); |
342 | |
343 | while (my $token = $p->get_tag("a")) { |
344 | my $url = $token->[1]{href} || "-"; |
345 | my $text = $p->get_trimmed_text("/a"); |
346 | print "$url\t$text\n"; |
347 | } |
348 | |
349 | This example extract the <TITLE> from the document: |
350 | |
351 | use HTML::TokeParser; |
352 | $p = HTML::TokeParser->new(shift||"index.html"); |
353 | if ($p->get_tag("title")) { |
354 | my $title = $p->get_trimmed_text; |
355 | print "Title: $title\n"; |
356 | } |
357 | |
358 | =head1 SEE ALSO |
359 | |
360 | L<HTML::PullParser>, L<HTML::Parser> |
361 | |
362 | =head1 COPYRIGHT |
363 | |
364 | Copyright 1998-2005 Gisle Aas. |
365 | |
366 | This library is free software; you can redistribute it and/or |
367 | modify it under the same terms as Perl itself. |
368 | |
369 | =cut |