Commit | Line | Data |
3fea05b9 |
1 | package HTML::Parser; |
2 | |
3 | # Copyright 1996-2009, Gisle Aas. |
4 | # Copyright 1999-2000, Michael A. Chase. |
5 | # |
6 | # This library is free software; you can redistribute it and/or |
7 | # modify it under the same terms as Perl itself. |
8 | |
9 | use strict; |
10 | use vars qw($VERSION @ISA); |
11 | |
12 | $VERSION = "3.64"; |
13 | |
14 | require HTML::Entities; |
15 | |
16 | require XSLoader; |
17 | XSLoader::load('HTML::Parser', $VERSION); |
18 | |
# Constructor: bless an empty hash into the invoking class and hand all
# remaining arguments to init(), whose return value is returned as-is.
sub new
{
    my $class = shift;
    return bless({}, $class)->init(@_);
}
25 | |
26 | |
# Initialize a newly blessed parser object from constructor arguments.
#
# Arguments are key/value pairs:
#   api_version => N   Selects the callback API.  Defaults to 3 when any
#                      arguments are given and to 2 when none are.
#                      Values of 4 or more make this method croak.
#   handlers => \%h    Event name => [handler, argspec] pairs; an array
#   handlers => \@h    reference holding the same pairs is also accepted.
#   <event>_h => [..]  Shortcut for registering a single event handler.
#   <option> => $val   Anything else is applied as $self->option($val).
#
# A bare event name (text, start, end, process, declaration, comment)
# used as an option key is rejected with a croak.
#
# Returns $self.
sub init
{
    my $self = shift;
    $self->_alloc_pstate;

    my %arg = @_;
    # @_ is consulted (not %arg) so that an empty argument list, and only
    # that, falls back to the version 2 API.
    my $api_version = delete $arg{api_version} || (@_ ? 3 : 2);
    if ($api_version >= 4) {
        require Carp;
        Carp::croak("API version $api_version not supported " .
                    "by HTML::Parser $VERSION");
    }

    if ($api_version < 3) {
        # Set up method callbacks compatible with HTML-Parser-2.xx
        $self->handler(text    => "text",    "self,text,is_cdata");
        $self->handler(end     => "end",     "self,tagname,text");
        $self->handler(process => "process", "self,token0,text");
        $self->handler(start   => "start",
                                  "self,tagname,attr,attrseq,text");

        # The version 2 comment() method is invoked once per token.
        $self->handler(comment =>
                       sub {
                           my($self, $tokens) = @_;
                           for (@$tokens) {
                               $self->comment($_);
                           }
                       }, "self,tokens");

        # Drop the first two and the last character of the source text
        # before passing it on to the version 2 declaration() method.
        $self->handler(declaration =>
                       sub {
                           my $self = shift;
                           $self->declaration(substr($_[0], 2, -1));
                       }, "self,text");
    }

    if (my $h = delete $arg{handlers}) {
        $h = {@$h} if ref($h) eq "ARRAY";
        # Iterate with keys() rather than each(): the hash may belong to
        # the caller, and each() would resume from wherever its iterator
        # was last left, silently skipping handlers.  keys() resets it.
        for my $event (keys %$h) {
            my $cb = $h->{$event};
            $self->handler($event => @$cb);
        }
    }

    # In the end we try to assume plain attribute or handler
    while (my($option, $val) = each %arg) {
        if ($option =~ /^(\w+)_h$/) {
            $self->handler($1 => @$val);
        }
        elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) {
            require Carp;
            Carp::croak("Bad constructor option '$option'");
        }
        else {
            $self->$option($val);
        }
    }

    return $self;
}
86 | |
87 | |
# Feed the contents of a file to the parser and then signal end of document.
#
# $file may be a filename, a file handle (glob), or a reference to one.
# A filename is opened for reading in binary mode and closed again before
# returning; a handle supplied by the caller is read but never closed.
#
# Returns undef if a filename cannot be opened ($! holds the reason);
# otherwise returns whatever $self->eof returns.
sub parse_file
{
    my($self, $file) = @_;
    my $opened;
    if (!ref($file) && ref(\$file) ne "GLOB") {
        # Assume $file is a filename.  A lexical handle is used so no
        # package-level glob has to be localized.
        # "return undef" (not a bare return) is kept on purpose so that
        # list-context callers still receive a single undef value.
        open(my $fh, "<", $file) || return undef;
        binmode($fh);  # should we? good for byte counts
        $opened = 1;
        $file = $fh;
    }
    my $chunk = '';
    while (read($file, $chunk, 512)) {
        # parse() returns false when a handler aborted parsing via eof
        $self->parse($chunk) || last;
    }
    close($file) if $opened;
    return $self->eof;
}
107 | |
108 | |
# Legacy accessor kept for backward compatibility.  It is the logical
# inverse of strict_comment(): it warns that it is deprecated, returns
# the negated strict_comment value as it was before the call and, when an
# argument is given, stores the negation of that argument.
sub netscape_buggy_comment  # legacy
{
    my($parser, @args) = @_;
    require Carp;
    Carp::carp("netscape_buggy_comment() is deprecated. " .
               "Please use the strict_comment() method instead");
    # Read the old value first; the setter call below would change it.
    my $previous = !$parser->strict_comment;
    if (@args) {
        $parser->strict_comment(!$args[0]);
    }
    return $previous;
}
119 | |
# Set up method stubs: text() does nothing, and every other version 2
# callback name is installed as an alias for that same subroutine.
sub text { }
{
    no strict 'refs';
    *{$_} = \&text for qw(start end comment declaration process);
}
127 | |
128 | 1; |
129 | |
130 | __END__ |
131 | |
132 | |
133 | =head1 NAME |
134 | |
135 | HTML::Parser - HTML parser class |
136 | |
137 | =head1 SYNOPSIS |
138 | |
139 | use HTML::Parser (); |
140 | |
141 | # Create parser object |
142 | $p = HTML::Parser->new( api_version => 3, |
143 | start_h => [\&start, "tagname, attr"], |
144 | end_h => [\&end, "tagname"], |
145 | marked_sections => 1, |
146 | ); |
147 | |
148 | # Parse document text chunk by chunk |
149 | $p->parse($chunk1); |
150 | $p->parse($chunk2); |
151 | #... |
152 | $p->eof; # signal end of document |
153 | |
154 | # Parse directly from file |
155 | $p->parse_file("foo.html"); |
156 | # or |
157 | open(my $fh, "<:utf8", "foo.html") || die; |
158 | $p->parse_file($fh); |
159 | |
160 | =head1 DESCRIPTION |
161 | |
162 | Objects of the C<HTML::Parser> class will recognize markup and |
163 | separate it from plain text (alias data content) in HTML |
164 | documents. As different kinds of markup and text are recognized, the |
165 | corresponding event handlers are invoked. |
166 | |
167 | C<HTML::Parser> is not a generic SGML parser. We have tried to |
168 | make it able to deal with the HTML that is actually "out there", and |
169 | it normally parses as closely as possible to the way the popular web |
170 | browsers do it instead of strictly following one of the many HTML |
171 | specifications from W3C. Where there is disagreement, there is often |
172 | an option that you can enable to get the official behaviour. |
173 | |
174 | The document to be parsed may be supplied in arbitrary chunks. This |
175 | makes on-the-fly parsing as documents are received from the network |
176 | possible. |
177 | |
178 | If event driven parsing does not feel right for your application, you |
179 | might want to use C<HTML::PullParser>. This is an C<HTML::Parser> |
180 | subclass that allows a more conventional program structure. |
181 | |
182 | |
183 | =head1 METHODS |
184 | |
185 | The following method is used to construct a new C<HTML::Parser> object: |
186 | |
187 | =over |
188 | |
189 | =item $p = HTML::Parser->new( %options_and_handlers ) |
190 | |
191 | This class method creates a new C<HTML::Parser> object and |
192 | returns it. Key/value argument pairs may be provided to assign event |
193 | handlers or initialize parser options. The handlers and parser |
194 | options can also be set or modified later by the method calls described below. |
195 | |
196 | If a top level key is in the form "<event>_h" (e.g., "text_h") then it |
197 | assigns a handler to that event, otherwise it initializes a parser |
198 | option. The event handler specification value must be an array |
199 | reference. Multiple handlers may also be assigned with the 'handlers |
200 | => [%handlers]' option. See examples below. |
201 | |
202 | If new() is called without any arguments, it will create a parser that |
203 | uses callback methods compatible with version 2 of C<HTML::Parser>. |
204 | See the section on "version 2 compatibility" below for details. |
205 | |
206 | The special constructor option 'api_version => 2' can be used to |
207 | initialize version 2 callbacks while still setting other options and |
208 | handlers. The 'api_version => 3' option can be used if you don't want |
209 | to set any options and don't want to fall back to v2 compatible |
210 | mode. |
211 | |
212 | Examples: |
213 | |
214 | $p = HTML::Parser->new(api_version => 3, |
215 | text_h => [ sub {...}, "dtext" ]); |
216 | |
217 | This creates a new parser object with a text event handler subroutine |
218 | that receives the original text with general entities decoded. |
219 | |
220 | $p = HTML::Parser->new(api_version => 3, |
221 | start_h => [ 'my_start', "self,tokens" ]); |
222 | |
223 | This creates a new parser object with a start event handler method |
224 | that receives the $p and the tokens array. |
225 | |
226 | $p = HTML::Parser->new(api_version => 3, |
227 | handlers => { text => [\@array, "event,text"], |
228 | comment => [\@array, "event,text"], |
229 | }); |
230 | |
231 | This creates a new parser object that stores the event type and the |
232 | original text in @array for text and comment events. |
233 | |
234 | =back |
235 | |
236 | The following methods feed the HTML document |
237 | to the C<HTML::Parser> object: |
238 | |
239 | =over |
240 | |
241 | =item $p->parse( $string ) |
242 | |
243 | Parse $string as the next chunk of the HTML document. Handlers invoked should |
244 | not attempt to modify the $string in-place until $p->parse returns. |
245 | |
246 | If an invoked event handler aborts parsing by calling $p->eof, then $p->parse() |
247 | will return a FALSE value. Otherwise the return value is a reference to the |
248 | parser object ($p). |
249 | |
250 | =item $p->parse( $code_ref ) |
251 | |
252 | If a code reference is passed as the argument to be parsed, then the |
253 | chunks to be parsed are obtained by invoking this function repeatedly. |
254 | Parsing continues until the function returns an empty (or undefined) |
255 | result. When this happens $p->eof is automatically signaled. |
256 | |
257 | Parsing will also abort if one of the event handlers calls $p->eof. |
258 | |
259 | The effect of this is the same as: |
260 | |
261 | while (1) { |
262 | my $chunk = &$code_ref(); |
263 | if (!defined($chunk) || !length($chunk)) { |
264 | $p->eof; |
265 | return $p; |
266 | } |
267 | $p->parse($chunk) || return undef; |
268 | } |
269 | |
270 | But it is more efficient as this loop runs internally in XS code. |
271 | |
272 | =item $p->parse_file( $file ) |
273 | |
274 | Parse text directly from a file. The $file argument can be a |
275 | filename, an open file handle, or a reference to an open file |
276 | handle. |
277 | |
278 | If $file contains a filename and the file can't be opened, then the |
279 | method returns an undefined value and $! tells why it failed. |
280 | Otherwise the return value is a reference to the parser object. |
281 | |
282 | If a file handle is passed as the $file argument, then the file will |
283 | normally be read until EOF, but not closed. |
284 | |
285 | If an invoked event handler aborts parsing by calling $p->eof, |
286 | then $p->parse_file() may not have read the entire file. |
287 | |
288 | On systems with multi-byte line terminators, the values passed for the |
289 | offset and length argspecs may be too low if parse_file() is called on |
290 | a file handle that is not in binary mode. |
291 | |
292 | If a filename is passed in, then parse_file() will open the file in |
293 | binary mode. |
294 | |
295 | =item $p->eof |
296 | |
297 | Signals the end of the HTML document. Calling the $p->eof method |
298 | outside a handler callback will flush any remaining buffered text |
299 | (which triggers the C<text> event if there is any remaining text). |
300 | |
301 | Calling $p->eof inside a handler will terminate parsing at that point |
302 | and cause $p->parse to return a FALSE value. This also terminates |
303 | parsing by $p->parse_file(). |
304 | |
305 | After $p->eof has been called, the parse() and parse_file() methods |
306 | can be invoked to feed new documents with the parser object. |
307 | |
308 | The return value from eof() is a reference to the parser object. |
309 | |
310 | =back |
311 | |
312 | |
313 | Most parser options are controlled by boolean attributes. |
314 | Each boolean attribute is enabled by calling the corresponding method |
315 | with a TRUE argument and disabled with a FALSE argument. The |
316 | attribute value is left unchanged if no argument is given. The return |
317 | value from each method is the old attribute value. |
318 | |
319 | Methods that can be used to get and/or set parser options are: |
320 | |
321 | =over |
322 | |
323 | =item $p->attr_encoded |
324 | |
325 | =item $p->attr_encoded( $bool ) |
326 | |
327 | By default, the C<attr> and C<@attr> argspecs will have general |
328 | entities for attribute values decoded. Enabling this attribute leaves |
329 | entities alone. |
330 | |
331 | =item $p->backquote |
332 | |
333 | =item $p->backquote( $bool ) |
334 | |
335 | By default, only ' and " are recognized as quote characters around |
336 | attribute values. MSIE also recognizes backquotes for some reason. |
337 | Enabling this attribute provides compatibility with this behaviour. |
338 | |
339 | =item $p->boolean_attribute_value( $val ) |
340 | |
341 | This method sets the value reported for boolean attributes inside HTML |
342 | start tags. By default, the name of the attribute is also used as its |
343 | value. This affects the values reported for C<tokens> and C<attr> |
344 | argspecs. |
345 | |
346 | =item $p->case_sensitive |
347 | |
348 | =item $p->case_sensitive( $bool ) |
349 | |
350 | By default, tagnames and attribute names are down-cased. Enabling this |
351 | attribute leaves them as found in the HTML source document. |
352 | |
353 | =item $p->closing_plaintext |
354 | |
355 | =item $p->closing_plaintext( $bool ) |
356 | |
357 | By default, the "plaintext" element can never be closed. Everything up to |
358 | the end of the document is parsed in CDATA mode. This historical |
359 | behaviour is what at least MSIE does. Enabling this attribute makes a |
360 | closing "</plaintext>" tag effective and the parsing process will resume |
361 | after seeing this tag. This emulates early gecko-based browsers. |
362 | |
363 | =item $p->empty_element_tags |
364 | |
365 | =item $p->empty_element_tags( $bool ) |
366 | |
367 | By default, empty element tags are not recognized as such and the "/" |
368 | before ">" is just treated like a normal name character (unless |
369 | C<strict_names> is enabled). Enabling this attribute makes |
370 | C<HTML::Parser> recognize these tags. |
371 | |
372 | Empty element tags look like start tags, but end with the character |
373 | sequence "/>" instead of ">". When recognized by C<HTML::Parser> they |
374 | cause an artificial end event in addition to the start event. The |
375 | C<text> for the artificial end event will be empty and the C<tokenpos> |
376 | array will be undefined even though the token array will have one |
377 | element containing the tag name. |
378 | |
379 | =item $p->marked_sections |
380 | |
381 | =item $p->marked_sections( $bool ) |
382 | |
383 | By default, section markings like <![CDATA[...]]> are treated like |
384 | ordinary text. When this attribute is enabled section markings are |
385 | honoured. |
386 | |
387 | There are currently no events associated with the marked section |
388 | markup, but the text can be returned as C<skipped_text>. |
389 | |
390 | =item $p->strict_comment |
391 | |
392 | =item $p->strict_comment( $bool ) |
393 | |
394 | By default, comments are terminated by the first occurrence of "-->". |
395 | This is the behaviour of most popular browsers (like Mozilla, Opera and |
396 | MSIE), but it is not correct according to the official HTML |
397 | standard. Officially, you need an even number of "--" tokens before |
398 | the closing ">" is recognized and there may not be anything but |
399 | whitespace between an even and an odd "--". |
400 | |
401 | The official behaviour is enabled by enabling this attribute. |
402 | |
403 | Enabling of 'strict_comment' also disables recognizing these forms as |
404 | comments: |
405 | |
406 | </ comment> |
407 | <! comment> |
408 | |
409 | |
410 | =item $p->strict_end |
411 | |
412 | =item $p->strict_end( $bool ) |
413 | |
414 | By default, attributes and other junk are allowed to be present on end tags in a |
415 | manner that emulates MSIE's behaviour. |
416 | |
417 | The official behaviour is enabled with this attribute. If enabled, |
418 | only whitespace is allowed between the tagname and the final ">". |
419 | |
420 | =item $p->strict_names |
421 | |
422 | =item $p->strict_names( $bool ) |
423 | |
424 | By default, almost anything is allowed in tag and attribute names. |
425 | This is the behaviour of most popular browsers and allows us to parse |
426 | some broken tags with invalid attribute values like: |
427 | |
428 | <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> |
429 | |
430 | By default, "LIST]" is parsed as a boolean attribute, not as |
431 | part of the ALT value as was clearly intended. This is also what |
432 | Mozilla sees. |
433 | |
434 | The official behaviour is enabled by enabling this attribute. If |
435 | enabled, it will cause the tag above to be reported as text |
436 | since "LIST]" is not a legal attribute name. |
437 | |
438 | =item $p->unbroken_text |
439 | |
440 | =item $p->unbroken_text( $bool ) |
441 | |
442 | By default, blocks of text are given to the text handler as soon as |
443 | possible (but the parser takes care always to break text at a |
444 | boundary between whitespace and non-whitespace so single words and |
445 | entities can always be decoded safely). This might create breaks that |
446 | make it hard to do transformations on the text. When this attribute is |
447 | enabled, blocks of text are always reported in one piece. This will |
448 | delay the text event until the following (non-text) event has been |
449 | recognized by the parser. |
450 | |
451 | Note that the C<offset> argspec will give you the offset of the first |
452 | segment of text and C<length> is the combined length of the segments. |
453 | Since there might be ignored tags in between, these numbers can't be |
454 | used to directly index in the original document file. |
455 | |
456 | =item $p->utf8_mode |
457 | |
458 | =item $p->utf8_mode( $bool ) |
459 | |
460 | Enable this option when parsing raw undecoded UTF-8. This tells the |
461 | parser that the entities expanded for strings reported by C<attr>, |
462 | C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end |
463 | up compatible with the surrounding text. |
464 | |
465 | If C<utf8_mode> is enabled then it is an error to pass strings |
466 | containing characters with code above 255 to the parse() method, and |
467 | the parse() method will croak if you try. |
468 | |
469 | Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 |
470 | encoded. The character can also be represented by the entity |
471 | "&hearts;" or "&#x2665;". If we feed the parser: |
472 | |
473 | $p->parse("\xE2\x99\xA5&hearts;"); |
474 | |
475 | then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without |
476 | C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. |
477 | The latter string is what you want. |
478 | |
479 | This option is only available with perl-5.8 or better. |
480 | |
481 | =item $p->xml_mode |
482 | |
483 | =item $p->xml_mode( $bool ) |
484 | |
485 | Enabling this attribute changes the parser to allow some XML |
486 | constructs. This enables the behaviour controlled individually by |
487 | the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and |
488 | C<xml_pic> attributes and also suppresses special treatment of |
489 | elements that are parsed as CDATA for HTML. |
490 | |
491 | =item $p->xml_pic |
492 | |
493 | =item $p->xml_pic( $bool ) |
494 | |
495 | By default, I<processing instructions> are terminated by ">". When |
496 | this attribute is enabled, processing instructions are terminated by |
497 | "?>" instead. |
498 | |
499 | =back |
500 | |
501 | As markup and text are recognized, handlers are invoked. The following |
502 | method is used to set up handlers for different events: |
503 | |
504 | =over |
505 | |
506 | =item $p->handler( event => \&subroutine, $argspec ) |
507 | |
508 | =item $p->handler( event => $method_name, $argspec ) |
509 | |
510 | =item $p->handler( event => \@accum, $argspec ) |
511 | |
512 | =item $p->handler( event => "" ); |
513 | |
514 | =item $p->handler( event => undef ); |
515 | |
516 | =item $p->handler( event ); |
517 | |
518 | This method assigns a subroutine, method, or array to handle an event. |
519 | |
520 | Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, |
521 | C<process>, C<start_document>, C<end_document> or C<default>. |
522 | |
523 | The C<\&subroutine> is a reference to a subroutine which is called to handle |
524 | the event. |
525 | |
526 | The C<$method_name> is the name of a method of $p which is called to handle |
527 | the event. |
528 | |
529 | The C<@accum> is an array that will hold the event information as |
530 | sub-arrays. |
531 | |
532 | If the second argument is "", the event is ignored. |
533 | If it is undef, the default handler is invoked for the event. |
534 | |
535 | The C<$argspec> is a string that describes the information to be reported |
536 | for the event. Any requested information that does not apply to a |
537 | specific event is passed as C<undef>. If argspec is omitted, then it |
538 | is left unchanged. |
539 | |
540 | The return value from $p->handler is the old callback routine or a |
541 | reference to the accumulator array. |
542 | |
543 | Any return values from handler callback routines/methods are always |
544 | ignored. A handler callback can request parsing to be aborted by |
545 | invoking the $p->eof method. A handler callback is not allowed to |
546 | invoke the $p->parse() or $p->parse_file() method. An exception will |
547 | be raised if it tries. |
548 | |
549 | Examples: |
550 | |
551 | $p->handler(start => "start", 'self, attr, attrseq, text' ); |
552 | |
553 | This causes the "start" method of object $p to be called for 'start' events. |
554 | The callback signature is $p->start(\%attr, \@attr_seq, $text). |
555 | |
556 | $p->handler(start => \&start, 'attr, attrseq, text' ); |
557 | |
558 | This causes subroutine start() to be called for 'start' events. |
559 | The callback signature is start(\%attr, \@attr_seq, $text). |
560 | |
561 | $p->handler(start => \@accum, '"S", attr, attrseq, text' ); |
562 | |
563 | This causes 'start' event information to be saved in @accum. |
564 | The array elements will be ['S', \%attr, \@attr_seq, $text]. |
565 | |
566 | $p->handler(start => ""); |
567 | |
568 | This causes 'start' events to be ignored. It also suppresses |
569 | invocations of any default handler for start events. It is in most |
570 | cases equivalent to $p->handler(start => sub {}), but is more |
571 | efficient. It is different from the empty-sub-handler in that |
572 | C<skipped_text> is not reset by it. |
573 | |
574 | $p->handler(start => undef); |
575 | |
576 | This causes no handler to be associated with start events. |
577 | If there is a default handler it will be invoked. |
578 | |
579 | =back |
580 | |
581 | Filters based on tags can be set up to limit the number of events |
582 | reported. The main bottleneck during parsing is often the huge number |
583 | of callbacks made from the parser. Applying filters can improve |
584 | performance significantly. |
585 | |
586 | The following methods control filters: |
587 | |
588 | =over |
589 | |
590 | =item $p->ignore_elements( @tags ) |
591 | |
592 | Both the C<start> event and the C<end> event as well as any events that |
593 | would be reported in between are suppressed. The ignored elements can |
594 | contain nested occurrences of themselves. Example: |
595 | |
596 | $p->ignore_elements(qw(script style)); |
597 | |
598 | The C<script> and C<style> tags will always nest properly since their |
599 | content is parsed in CDATA mode. For most other tags |
600 | C<ignore_elements> must be used with caution since HTML is often not |
601 | I<well formed>. |
602 | |
603 | =item $p->ignore_tags( @tags ) |
604 | |
605 | Any C<start> and C<end> events involving any of the tags given are |
606 | suppressed. To reset the filter (i.e. don't suppress any C<start> and |
607 | C<end> events), call C<ignore_tags> without an argument. |
608 | |
609 | =item $p->report_tags( @tags ) |
610 | |
611 | Any C<start> and C<end> events involving any of the tags I<not> given |
612 | are suppressed. To reset the filter (i.e. report all C<start> and |
613 | C<end> events), call C<report_tags> without an argument. |
614 | |
615 | =back |
616 | |
617 | Internally, the system has two filter lists, one for C<report_tags> |
618 | and one for C<ignore_tags>, and both filters are applied. This |
619 | effectively gives C<ignore_tags> precedence over C<report_tags>. |
620 | |
621 | Examples: |
622 | |
623 | $p->ignore_tags(qw(style)); |
624 | $p->report_tags(qw(script style)); |
625 | |
626 | results in only C<script> events being reported. |
627 | |
628 | =head2 Argspec |
629 | |
630 | Argspec is a string containing a comma-separated list that describes |
631 | the information reported by the event. The following argspec |
632 | identifier names can be used: |
633 | |
634 | =over |
635 | |
636 | =item C<attr> |
637 | |
638 | Attr causes a reference to a hash of attribute name/value pairs to be |
639 | passed. |
640 | |
641 | Boolean attributes' values are either the value set by |
642 | $p->boolean_attribute_value, or the attribute name if no value has been |
643 | set by $p->boolean_attribute_value. |
644 | |
645 | This passes undef except for C<start> events. |
646 | |
647 | Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute |
648 | names are forced to lower case. |
649 | |
650 | General entities are decoded in the attribute values and |
651 | one layer of matching quotes enclosing the attribute values is removed. |
652 | |
653 | The Unicode character set is assumed for entity decoding. With Perl |
654 | version 5.6 or earlier only the Latin-1 range is supported, and |
655 | entities for characters outside the range 0..255 are left unchanged. |
656 | |
657 | =item C<@attr> |
658 | |
659 | Basically the same as C<attr>, but keys and values are passed as |
660 | individual arguments and the original sequence of the attributes is |
661 | kept. The parameters passed will be the same as the @attr calculated |
662 | here: |
663 | |
664 | @attr = map { $_ => $attr->{$_} } @$attrseq; |
665 | |
666 | assuming $attr and $attrseq here are the hash and array passed as the |
667 | result of C<attr> and C<attrseq> argspecs. |
668 | |
669 | This passes no values for events besides C<start>. |
670 | |
671 | =item C<attrseq> |
672 | |
673 | Attrseq causes a reference to an array of attribute names to be |
674 | passed. This can be useful if you want to walk the C<attr> hash in |
675 | the original sequence. |
676 | |
677 | This passes undef except for C<start> events. |
678 | |
679 | Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute |
680 | names are forced to lower case. |
681 | |
682 | =item C<column> |
683 | |
684 | Column causes the column number of the start of the event to be passed. |
685 | The first column on a line is 0. |
686 | |
687 | =item C<dtext> |
688 | |
689 | Dtext causes the decoded text to be passed. General entities are |
690 | automatically decoded unless the event was inside a CDATA section or |
691 | was between literal start and end tags (C<script>, C<style>, |
692 | C<xmp>, C<iframe>, C<title>, C<textarea> and C<plaintext>). |
693 | |
694 | The Unicode character set is assumed for entity decoding. With Perl |
695 | version 5.6 or earlier only the Latin-1 range is supported, and |
696 | entities for characters outside the range 0..255 are left unchanged. |
697 | |
698 | This passes undef except for C<text> events. |
699 | |
700 | =item C<event> |
701 | |
702 | Event causes the event name to be passed. |
703 | |
704 | The event name is one of C<text>, C<start>, C<end>, C<declaration>, |
705 | C<comment>, C<process>, C<start_document> or C<end_document>. |
706 | |
707 | =item C<is_cdata> |
708 | |
709 | Is_cdata causes a TRUE value to be passed if the event is inside a CDATA |
710 | section or between literal start and end tags (C<script>, |
711 | C<style>, C<xmp>, C<iframe>, C<title>, C<textarea> and C<plaintext>). |
712 | |
713 | If the flag is FALSE for a text event, then you should normally |
714 | either use C<dtext> or decode the entities yourself before the text is |
715 | processed further. |
716 | |
717 | =item C<length> |
718 | |
719 | Length causes the number of bytes of the source text of the event to |
720 | be passed. |
721 | |
722 | =item C<line> |
723 | |
724 | Line causes the line number of the start of the event to be passed. |
725 | The first line in the document is 1. Line counting doesn't start |
726 | until at least one handler requests this value to be reported. |
727 | |
728 | =item C<offset> |
729 | |
730 | Offset causes the byte position in the HTML document of the start of |
731 | the event to be passed. The first byte in the document has offset 0. |
732 | |
733 | =item C<offset_end> |
734 | |
735 | Offset_end causes the byte position in the HTML document of the end of |
736 | the event to be passed. This is the same as C<offset> + C<length>. |
737 | |
738 | =item C<self> |
739 | |
740 | Self causes the current object to be passed to the handler. If the |
741 | handler is a method, this must be the first element in the argspec. |
742 | |
743 | An alternative to passing self as an argspec is to register closures |
744 | that capture $self by themselves as handlers. Unfortunately this |
745 | creates circular references which prevent the HTML::Parser object |
746 | from being garbage collected. Using the C<self> argspec avoids this |
747 | problem. |
748 | |
749 | =item C<skipped_text> |
750 | |
751 | Skipped_text returns the concatenated text of all the events that have |
752 | been skipped since the last time an event was reported. Events might |
753 | be skipped because no handler is registered for them or because some |
754 | filter applies. Skipped text also includes marked section markup, |
755 | since there are no events that can catch it. |
756 | |
757 | If an C<"">-handler is registered for an event, then the text for this |
758 | event is not included in C<skipped_text>. Skipped text both before |
759 | and after the C<"">-event is included in the next reported |
760 | C<skipped_text>. |
761 | |
762 | =item C<tag> |
763 | |
764 | Same as C<tagname>, but prefixed with "/" if it belongs to an C<end> |
765 | event and "!" for a declaration. The C<tag> does not have any prefix |
766 | for C<start> events, and is in this case identical to C<tagname>. |
767 | |
768 | =item C<tagname> |
769 | |
770 | This is the element name (or I<generic identifier> in SGML jargon) for |
771 | start and end tags. Since HTML is case insensitive, this name is |
772 | forced to lower case to ease string matching. |
773 | |
774 | Since XML is case sensitive, the tagname case is not changed when |
775 | C<xml_mode> is enabled. The same happens if the C<case_sensitive> attribute |
776 | is set. |
777 | |
778 | The declaration type of declaration elements is also passed as a tagname, |
779 | even if that is a bit strange. |
780 | In fact, in the current implementation tagname is |
781 | identical to C<token0> except that the name may be forced to lower case. |
782 | |
783 | =item C<token0> |
784 | |
785 | Token0 causes the original text of the first token string to be |
786 | passed. This should always be the same as $tokens->[0]. |
787 | |
788 | For C<declaration> events, this is the declaration type. |
789 | |
790 | For C<start> and C<end> events, this is the tag name. |
791 | |
792 | For C<process> and non-strict C<comment> events, this is everything |
793 | inside the tag. |
794 | |
795 | This passes undef if there are no tokens in the event. |
796 | |
797 | =item C<tokenpos> |
798 | |
799 | Tokenpos causes a reference to an array of token positions to be |
800 | passed. For each string that appears in C<tokens>, this array |
801 | contains two numbers. The first number is the offset of the start of |
802 | the token in the original C<text> and the second number is the length |
803 | of the token. |
804 | |
805 | Boolean attributes in a C<start> event will have (0,0) for the |
806 | attribute value offset and length. |
807 | |
808 | This passes undef if there are no tokens in the event (e.g., C<text>) |
809 | and for artificial C<end> events triggered by empty element tags. |
810 | |
811 | If you are using these offsets and lengths to modify C<text>, you |
812 | should either work from right to left, or be very careful to calculate |
813 | the changes to the offsets. |
814 | |
815 | =item C<tokens> |
816 | |
817 | Tokens causes a reference to an array of token strings to be passed. |
818 | The strings are exactly as they were found in the original text, |
819 | no decoding or case changes are applied. |
820 | |
821 | For C<declaration> events, the array contains each word, comment, and |
822 | delimited string starting with the declaration type. |
823 | |
824 | For C<comment> events, this contains each sub-comment. If |
825 | $p->strict_comment is disabled, there will be only one sub-comment. |
826 | |
827 | For C<start> events, this contains the original tag name followed by |
828 | the attribute name/value pairs. The values of boolean attributes will |
829 | be either the value set by $p->boolean_attribute_value, or the |
830 | attribute name if no value has been set by |
831 | $p->boolean_attribute_value. |
832 | |
833 | For C<end> events, this contains the original tag name (always one token). |
834 | |
835 | For C<process> events, this contains the process instructions (always one |
836 | token). |
837 | |
838 | This passes C<undef> for C<text> events. |
839 | |
840 | =item C<text> |
841 | |
842 | Text causes the source text (including markup element delimiters) to be |
843 | passed. |
844 | |
845 | =item C<undef> |
846 | |
847 | Pass an undefined value. Useful as padding where the same handler |
848 | routine is registered for multiple events. |
849 | |
850 | =item C<'...'> |
851 | |
852 | A literal string of 0 to 255 characters enclosed |
853 | in single (') or double (") quotes is passed as entered. |
854 | |
855 | =back |
856 | |
857 | The whole argspec string can be wrapped up in C<'@{...}'> to signal |
858 | that the resulting event array should be flattened. This only makes a |
859 | difference if an array reference is used as the handler target. |
860 | Consider this example: |
861 | |
862 | $p->handler(text => [], 'text'); |
863 | $p->handler(text => [], '@{text}'); |
864 | |
865 | Given two text events, C<"foo"> and C<"bar">, the first example will end |
866 | up with [["foo"], ["bar"]] and the second with ["foo", "bar"] in |
867 | the handler target array. |
868 | |
869 | |
870 | =head2 Events |
871 | |
872 | Handlers for the following events can be registered: |
873 | |
874 | =over |
875 | |
876 | =item C<comment> |
877 | |
878 | This event is triggered when a markup comment is recognized. |
879 | |
880 | Example: |
881 | |
882 | <!-- This is a comment -- -- So is this --> |
883 | |
884 | =item C<declaration> |
885 | |
886 | This event is triggered when a I<markup declaration> is recognized. |
887 | |
888 | For typical HTML documents, the only declaration you are |
889 | likely to find is <!DOCTYPE ...>. |
890 | |
891 | Example: |
892 | |
893 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" |
894 | "http://www.w3.org/TR/html40/strict.dtd"> |
895 | |
896 | DTDs inside <!DOCTYPE ...> will confuse HTML::Parser. |
897 | |
898 | =item C<default> |
899 | |
900 | This event is triggered for events that do not have a specific |
901 | handler. You can set up a handler for this event to catch stuff you |
902 | did not want to catch explicitly. |
903 | |
904 | =item C<end> |
905 | |
906 | This event is triggered when an end tag is recognized. |
907 | |
908 | Example: |
909 | |
910 | </A> |
911 | |
912 | =item C<end_document> |
913 | |
914 | This event is triggered when $p->eof is called and after any remaining |
915 | text is flushed. There is no document text associated with this event. |
916 | |
917 | =item C<process> |
918 | |
919 | This event is triggered when processing instruction markup is |
920 | recognized. |
921 | |
922 | The format and content of processing instructions are system and |
923 | application dependent. |
924 | |
925 | Examples: |
926 | |
927 | <? HTML processing instructions > |
928 | <? XML processing instructions ?> |
929 | |
930 | =item C<start> |
931 | |
932 | This event is triggered when a start tag is recognized. |
933 | |
934 | Example: |
935 | |
936 | <A HREF="http://www.perl.com/"> |
937 | |
938 | =item C<start_document> |
939 | |
940 | This event is triggered before any other events for a new document. A |
941 | handler for it can be used to initialize stuff. There is no document |
942 | text associated with this event. |
943 | |
944 | =item C<text> |
945 | |
946 | This event is triggered when plain text (characters) is recognized. |
947 | The text may contain multiple lines. A sequence of text may be broken |
948 | between several text events unless $p->unbroken_text is enabled. |
949 | |
950 | The parser will make sure that it does not break a word or a sequence |
951 | of whitespace between two text events. |
952 | |
953 | =back |
954 | |
955 | =head2 Unicode |
956 | |
957 | The C<HTML::Parser> can parse Unicode strings when running under |
958 | perl-5.8 or better. If Unicode is passed to $p->parse() then chunks |
959 | of Unicode will be reported to the handlers. The offset and length |
960 | argspecs will also report their position in terms of characters. |
961 | |
962 | It is safe to parse raw undecoded UTF-8 if you either avoid decoding |
963 | entities and make sure to not use I<argspecs> that do, or enable the |
964 | C<utf8_mode> for the parser. Parsing of undecoded UTF-8 might be |
965 | useful when parsing from a file where you need the reported offsets |
966 | and lengths to match the byte offsets in the file. |
967 | |
968 | If a filename is passed to $p->parse_file() then the file will be read |
969 | in binary mode. This will be fine if the file contains only ASCII or |
970 | Latin-1 characters. If the file contains UTF-8 encoded text then care |
971 | must be taken when decoding entities as described in the previous |
972 | paragraph, but it is better to open the file with the UTF-8 layer so that |
973 | it is decoded properly: |
974 | |
975 | open(my $fh, "<:utf8", "index.html") || die "...: $!"; |
976 | $p->parse_file($fh); |
977 | |
978 | If the file contains text encoded in a charset besides ASCII, Latin-1 |
979 | or UTF-8 then decoding will always be needed. |
980 | |
981 | =head1 VERSION 2 COMPATIBILITY |
982 | |
983 | When an C<HTML::Parser> object is constructed with no arguments, a set |
984 | of handlers is automatically provided that is compatible with the old |
985 | HTML::Parser version 2 callback methods. |
986 | |
987 | This is equivalent to the following method calls: |
988 | |
989 | $p->handler(start => "start", "self, tagname, attr, attrseq, text"); |
990 | $p->handler(end => "end", "self, tagname, text"); |
991 | $p->handler(text => "text", "self, text, is_cdata"); |
992 | $p->handler(process => "process", "self, token0, text"); |
993 | $p->handler(comment => |
994 | sub { |
995 | my($self, $tokens) = @_; |
996 | for (@$tokens) {$self->comment($_);}}, |
997 | "self, tokens"); |
998 | $p->handler(declaration => |
999 | sub { |
1000 | my $self = shift; |
1001 | $self->declaration(substr($_[0], 2, -1));}, |
1002 | "self, text"); |
1003 | |
1004 | Setting up these handlers can also be requested with the "api_version => |
1005 | 2" constructor option. |
1006 | |
1007 | =head1 SUBCLASSING |
1008 | |
1009 | The C<HTML::Parser> class is subclassable. Parser objects are plain |
1010 | hashes and C<HTML::Parser> reserves only hash keys that start with |
1011 | "_hparser". The parser state can be set up by invoking the init() |
1012 | method, which takes the same arguments as new(). |
1013 | |
1014 | =head1 EXAMPLES |
1015 | |
1016 | The first simple example shows how you might strip out comments from |
1017 | an HTML document. We achieve this by setting up a comment handler that |
1018 | does nothing and a default handler that will print out anything else: |
1019 | |
1020 | use HTML::Parser; |
1021 | HTML::Parser->new(default_h => [sub { print shift }, 'text'], |
1022 | comment_h => [""], |
1023 | )->parse_file(shift || die) || die $!; |
1024 | |
1025 | An alternative implementation is: |
1026 | |
1027 | use HTML::Parser; |
1028 | HTML::Parser->new(end_document_h => [sub { print shift }, |
1029 | 'skipped_text'], |
1030 | comment_h => [""], |
1031 | )->parse_file(shift || die) || die $!; |
1032 | |
1033 | This will in most cases be much more efficient since only a single |
1034 | callback will be made. |
1035 | |
1036 | The next example prints out the text that is inside the <title> |
1037 | element of an HTML document. Here we start by setting up a start |
1038 | handler. When it sees the title start tag it enables a text handler |
1039 | that prints any text found and an end handler that will terminate |
1040 | parsing as soon as the title end tag is seen: |
1041 | |
1042 | use HTML::Parser (); |
1043 | |
1044 | sub start_handler |
1045 | { |
1046 | return if shift ne "title"; |
1047 | my $self = shift; |
1048 | $self->handler(text => sub { print shift }, "dtext"); |
1049 | $self->handler(end => sub { shift->eof if shift eq "title"; }, |
1050 | "tagname,self"); |
1051 | } |
1052 | |
1053 | my $p = HTML::Parser->new(api_version => 3); |
1054 | $p->handler( start => \&start_handler, "tagname,self"); |
1055 | $p->parse_file(shift || die) || die $!; |
1056 | print "\n"; |
1057 | |
1058 | More examples are found in the F<eg/> directory of the C<HTML-Parser> |
1059 | distribution: the program C<hrefsub> shows how you can edit all links |
1060 | found in a document; the program C<htextsub> shows how to edit the text only; the |
1061 | program C<hstrip> shows how you can strip out certain tags/elements |
1062 | and/or attributes; and the program C<htext> shows how to obtain the |
1063 | plain text, but not any script/style content. |
1064 | |
1065 | You can browse the F<eg/> directory online from the I<[Browse]> link on |
1066 | the http://search.cpan.org/~gaas/HTML-Parser/ page. |
1067 | |
1068 | =head1 BUGS |
1069 | |
1070 | The <style> and <script> sections do not end with the first "</", but |
1071 | need the complete corresponding end tag. The standard behaviour is |
1072 | not really practical. |
1073 | |
1074 | When the I<strict_comment> option is enabled, we still recognize |
1075 | comments where there is something other than whitespace between even |
1076 | and odd "--" markers. |
1077 | |
1078 | Once $p->boolean_attribute_value has been set, there is no way to |
1079 | restore the default behaviour. |
1080 | |
1081 | There is currently no way to get both quote characters |
1082 | into the same literal argspec. |
1083 | |
1084 | Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them |
1085 | to repeat the previous start tag or close the previous start tag |
1086 | respectively. |
1087 | |
1088 | NET tags, e.g. "<code/.../" are not recognized. This is SGML |
1089 | shorthand for "<code>...</code>". |
1090 | |
1091 | Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not |
1092 | recognized. |
1093 | |
1094 | =head1 DIAGNOSTICS |
1095 | |
1096 | The following messages may be produced by HTML::Parser. The notation |
1097 | in this listing is the same as used in L<perldiag>: |
1098 | |
1099 | =over |
1100 | |
1101 | =item Not a reference to a hash |
1102 | |
1103 | (F) The object blessed into or subclassed from HTML::Parser is not a |
1104 | hash as required by the HTML::Parser methods. |
1105 | |
1106 | =item Bad signature in parser state object at %p |
1107 | |
1108 | (F) The _hparser_xs_state element does not refer to a valid state structure. |
1109 | Something must have changed the internal value |
1110 | stored in this hash element, or the memory has been overwritten. |
1111 | |
1112 | =item _hparser_xs_state element is not a reference |
1113 | |
1114 | (F) The _hparser_xs_state element has been destroyed. |
1115 | |
1116 | =item Can't find '_hparser_xs_state' element in HTML::Parser hash |
1117 | |
1118 | (F) The _hparser_xs_state element is missing from the parser hash. |
1119 | It was either deleted, or not created when the object was created. |
1120 | |
1121 | =item API version %s not supported by HTML::Parser %s |
1122 | |
1123 | (F) The constructor option 'api_version' with an argument greater than |
1124 | or equal to 4 is reserved for future extensions. |
1125 | |
1126 | =item Bad constructor option '%s' |
1127 | |
1128 | (F) An unknown constructor option key was passed to the new() or |
1129 | init() methods. |
1130 | |
1131 | =item Parse loop not allowed |
1132 | |
1133 | (F) A handler invoked the parse() or parse_file() method. |
1134 | This is not permitted. |
1135 | |
1136 | =item marked sections not supported |
1137 | |
1138 | (F) The $p->marked_sections() method was invoked in an HTML::Parser |
1139 | module that was compiled without support for marked sections. |
1140 | |
1141 | =item Unknown boolean attribute (%d) |
1142 | |
1143 | (F) Something is wrong with the internal logic that set up aliases for |
1144 | boolean attributes. |
1145 | |
1146 | =item Only code or array references allowed as handler |
1147 | |
1148 | (F) The second argument for $p->handler must be either a subroutine |
1149 | reference, the name of a subroutine or method, or a reference to an |
1150 | array. |
1151 | |
1152 | =item No handler for %s events |
1153 | |
1154 | (F) The first argument to $p->handler must be a valid event name; i.e. one |
1155 | of "start", "end", "text", "process", "declaration", "comment", "default", "start_document" or "end_document". |
1156 | |
1157 | =item Unrecognized identifier %s in argspec |
1158 | |
1159 | (F) The identifier is not a known argspec name. |
1160 | Use one of the names mentioned in the argspec section above. |
1161 | |
1162 | =item Literal string is longer than 255 chars in argspec |
1163 | |
1164 | (F) The current implementation limits the length of literals in |
1165 | an argspec to 255 characters. Make the literal shorter. |
1166 | |
1167 | =item Backslash reserved for literal string in argspec |
1168 | |
1169 | (F) The backslash character "\" is not allowed in argspec literals. |
1170 | It is reserved to permit quoting inside a literal in a later version. |
1171 | |
1172 | =item Unterminated literal string in argspec |
1173 | |
1174 | (F) The terminating quote character for a literal was not found. |
1175 | |
1176 | =item Bad argspec (%s) |
1177 | |
1178 | (F) Only identifier names, literals, spaces and commas |
1179 | are allowed in argspecs. |
1180 | |
1181 | =item Missing comma separator in argspec |
1182 | |
1183 | (F) Identifiers in an argspec must be separated with ",". |
1184 | |
1185 | =item Parsing of undecoded UTF-8 will give garbage when decoding entities |
1186 | |
1187 | (W) The first chunk parsed appears to contain undecoded UTF-8 and one |
1188 | or more argspecs that decode entities are used for the callback |
1189 | handlers. |
1190 | |
1191 | The result of decoding will be a mix of encoded and decoded characters |
1192 | for any entities that expand to characters with code above 127. This |
1193 | is not a good thing. |
1194 | |
1195 | The solution is to use Encode::decode_utf8() on the data before |
1196 | feeding it to $p->parse(). For $p->parse_file() pass a file that |
1197 | has been opened in ":utf8" mode. |
1198 | |
1199 | The parser can process raw undecoded UTF-8 sanely if the C<utf8_mode> |
1200 | is enabled or if the "attr", "@attr" or "dtext" argspecs are avoided. |
1201 | |
1202 | =item Parsing string decoded with wrong endianness |
1203 | |
1204 | (W) The first character in the document is U+FFFE. This is not a |
1205 | legal Unicode character but a byte swapped BOM. The result of parsing |
1206 | will likely be garbage. |
1207 | |
1208 | =item Parsing of undecoded UTF-32 |
1209 | |
1210 | (W) The parser found the Unicode UTF-32 BOM signature at the start |
1211 | of the document. The result of parsing will likely be garbage. |
1212 | |
1213 | =item Parsing of undecoded UTF-16 |
1214 | |
1215 | (W) The parser found the Unicode UTF-16 BOM signature at the start of |
1216 | the document. The result of parsing will likely be garbage. |
1217 | |
1218 | =back |
1219 | |
1220 | =head1 SEE ALSO |
1221 | |
1222 | L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>, |
1223 | L<HTML::LinkExtor>, L<HTML::Form> |
1224 | |
1225 | L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution) |
1226 | |
1227 | http://www.w3.org/TR/html4 |
1228 | |
1229 | More information about marked sections and processing instructions may |
1230 | be found at C<http://www.sgml.u-net.com/book/sgml-8.htm>. |
1231 | |
1232 | =head1 COPYRIGHT |
1233 | |
1234 | Copyright 1996-2008 Gisle Aas. All rights reserved. |
1235 | Copyright 1999-2000 Michael A. Chase. All rights reserved. |
1236 | |
1237 | This library is free software; you can redistribute it and/or |
1238 | modify it under the same terms as Perl itself. |
1239 | |
1240 | =cut |