Commit | Line | Data |
3fea05b9 |
1 | package HTML::Tagset; |
2 | |
3 | use strict; |
4 | |
5 | =head1 NAME |
6 | |
7 | HTML::Tagset - data tables useful in parsing HTML |
8 | |
9 | =head1 VERSION |
10 | |
11 | Version 3.20 |
12 | |
13 | =cut |
14 | |
15 | use vars qw( $VERSION ); |
16 | |
17 | $VERSION = '3.20'; |
18 | |
19 | =head1 SYNOPSIS |
20 | |
21 | use HTML::Tagset; |
22 | # Then use any of the items in the HTML::Tagset package |
23 | # as need arises |
24 | |
25 | =head1 DESCRIPTION |
26 | |
27 | This module contains several data tables useful in various kinds of |
28 | HTML parsing operations. |
29 | |
30 | Note that all tag names used are lowercase. |
31 | |
32 | In the following documentation, a "hashset" is a hash being used as a |
33 | set -- the hash conveys that its keys are there, and the actual values |
34 | associated with the keys are not significant. (But what values are |
35 | there, are always true.) |
36 | |
37 | =cut |
38 | |
39 | use vars qw( |
40 | $VERSION |
41 | %emptyElement %optionalEndTag %linkElements %boolean_attr |
42 | %isHeadElement %isBodyElement %isPhraseMarkup |
43 | %is_Possible_Strict_P_Content |
44 | %isHeadOrBodyElement |
45 | %isList %isTableElement %isFormElement |
46 | %isKnown %canTighten |
47 | @p_closure_barriers |
48 | %isCDATA_Parent |
49 | ); |
50 | |
51 | =head1 VARIABLES |
52 | |
53 | Note that none of these variables are exported. |
54 | |
55 | =head2 hashset %HTML::Tagset::emptyElement |
56 | |
57 | This hashset has as keys the tag-names (GIs) of elements that cannot |
58 | have content. (For example, "base", "br", "hr".) So |
59 | C<$HTML::Tagset::emptyElement{'hr'}> exists and is true. |
60 | C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true. |
61 | |
62 | =cut |
63 | |
64 | %emptyElement = map +( $_ => 1 ), qw( |
65 | base link meta isindex |
66 | img br hr wbr |
67 | input area param |
68 | embed bgsound spacer |
69 | basefont col frame |
70 | ~comment ~literal ~declaration ~pi |
71 | ); |
72 | # Names starting with "~" are pseudo-elements, as used by HTML::Element |
73 | # and HTML::TreeBuilder. |
74 | |
75 | =head2 hashset %HTML::Tagset::optionalEndTag |
76 | |
77 | This hashset lists tag-names for elements that can have content, but whose |
78 | end-tags are generally, "safely", omissible. Example: |
79 | C<$HTML::Tagset::optionalEndTag{'li'}> exists and is true. |
80 | |
81 | =cut |
82 | |
83 | %optionalEndTag = ( p => 1, li => 1, dt => 1, dd => 1 ); # not listed: option th tr td |
84 | |
85 | =head2 hash %HTML::Tagset::linkElements |
86 | |
87 | Keys in this hash are tagnames for elements that might contain |
88 | links, and the value for each is a reference to an array of the names |
89 | of attributes whose values can be links. |
90 | |
91 | =cut |
92 | |
93 | %linkElements = ( |
94 | # element name => names of its attributes whose values can be links |
95 | 'a' => [qw( href )], |
96 | 'applet' => [qw( archive codebase code )], |
97 | 'area' => [qw( href )], |
98 | 'base' => [qw( href )], |
99 | 'bgsound' => [qw( src )], |
100 | 'blockquote' => [qw( cite )], |
101 | 'body' => [qw( background )], |
102 | 'del' => [qw( cite )], |
103 | 'embed' => [qw( pluginspage src )], |
104 | 'form' => [qw( action )], |
105 | 'frame' => [qw( src longdesc )], |
106 | 'head' => [qw( profile )], |
107 | 'iframe' => [qw( src longdesc )], |
108 | 'ilayer' => [qw( background )], |
109 | 'img' => [qw( src lowsrc longdesc usemap )], |
110 | 'input' => [qw( src usemap )], |
111 | 'ins' => [qw( cite )], |
112 | 'isindex' => [qw( action )], |
113 | 'layer' => [qw( background src )], |
114 | 'link' => [qw( href )], |
115 | 'object' => [qw( classid codebase data archive usemap )], |
116 | 'q' => [qw( cite )], |
117 | 'script' => [qw( src for )], |
118 | 'table' => [qw( background )], |
119 | 'td' => [qw( background )], |
120 | 'th' => [qw( background )], |
121 | 'tr' => [qw( background )], |
122 | 'xmp' => [qw( href )], |
123 | ); |
124 | |
125 | =head2 hash %HTML::Tagset::boolean_attr |
126 | |
127 | This hash (not hashset) lists what attributes of what elements can be |
128 | printed without showing the value (for example, the "noshade" attribute |
129 | of "hr" elements). For elements with only one such attribute, its value |
130 | is simply that attribute name. For elements with many such attributes, |
131 | the value is a reference to a hashset containing all such attributes. |
132 | |
133 | =cut |
134 | |
135 | %boolean_attr = ( |
136 | # string = the element's one boolean attribute; hashref = hashset of several (TODO: make these all hashes) |
137 | area => 'nohref', |
138 | dir => 'compact', |
139 | dl => 'compact', |
140 | hr => 'noshade', |
141 | img => 'ismap', |
142 | input => +{ map +( $_ => 1 ), qw( checked readonly disabled ) }, |
143 | menu => 'compact', |
144 | ol => 'compact', |
145 | option => 'selected', |
146 | select => 'multiple', |
147 | td => 'nowrap', |
148 | th => 'nowrap', |
149 | ul => 'compact', |
150 | ); |
151 | |
152 | #========================================================================== |
153 | # List of all elements from Extensible HTML version 1.0 Transitional DTD: |
154 | # |
155 | # a abbr acronym address applet area b base basefont bdo big |
156 | # blockquote body br button caption center cite code col colgroup |
157 | # dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 |
158 | # head hr html i iframe img input ins isindex kbd label legend li |
159 | # link map menu meta noframes noscript object ol optgroup option p |
160 | # param pre q s samp script select small span strike strong style |
161 | # sub sup table tbody td textarea tfoot th thead title tr tt u ul |
162 | # var |
163 | # |
164 | # Varia from Mozilla source internal table of tags: |
165 | # Implemented: |
166 | # xmp listing wbr nobr frame frameset noframes ilayer |
167 | # layer nolayer spacer embed multicol |
168 | # But these are unimplemented: |
169 | # sound?? keygen?? server?? |
170 | # Also seen here and there: |
171 | # marquee?? app?? (both unimplemented) |
172 | #========================================================================== |
173 | |
174 | =head2 hashset %HTML::Tagset::isPhraseMarkup |
175 | |
176 | This hashset contains all phrasal-level elements. |
177 | |
178 | =cut |
179 | |
180 | %isPhraseMarkup = map { ($_ => 1) } ( |
181 | qw( span abbr acronym q sub sup ), |
182 | qw( cite code em kbd samp strong var dfn strike ), |
183 | qw( b i u s tt small big ), |
184 | qw( a img br ), |
185 | qw( wbr nobr blink ), |
186 | qw( font basefont bdo ), |
187 | qw( spacer embed noembed ), |
188 | ); # once also listed: center, hr, table |
189 | |
190 | |
191 | =head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content |
192 | |
193 | This hashset contains all phrasal-level elements that can be content of a |
194 | P element, for a strict model of HTML. |
195 | |
196 | =cut |
197 | |
198 | # Everything phrasal, every form element, plus object/script/map (following the HTML4.01 DTD). |
199 | %is_Possible_Strict_P_Content = ( |
200 | %isPhraseMarkup, |
201 | # %isFormElement is only assigned further down this file, so interpolating it here |
202 | # would add nothing; its members are therefore spelled out explicitly. |
203 | map {; $_ => 1} qw( input select option optgroup textarea button label object script map ) |
204 | ); |
205 | |
206 | #from html4 strict: |
207 | #<!ENTITY % fontstyle "TT | I | B | BIG | SMALL"> |
208 | # |
209 | #<!ENTITY % phrase "EM | STRONG | DFN | CODE | |
210 | # SAMP | KBD | VAR | CITE | ABBR | ACRONYM" > |
211 | # |
212 | #<!ENTITY % special |
213 | # "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO"> |
214 | # |
215 | #<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON"> |
216 | # |
217 | #<!-- %inline; covers inline or "text-level" elements --> |
218 | #<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;"> |
219 | |
220 | =head2 hashset %HTML::Tagset::isHeadElement |
221 | |
222 | This hashset contains all elements that should be |
223 | present only in the 'head' element of an HTML document. |
224 | |
225 | =cut |
226 | |
227 | %isHeadElement = ( title => 1, base => 1, link => 1, meta => 1, isindex => 1, |
228 | script => 1, style => 1, object => 1, bgsound => 1 ); |
229 | |
230 | =head2 hashset %HTML::Tagset::isList |
231 | |
232 | This hashset contains all elements that can contain "li" elements. |
233 | |
234 | =cut |
235 | |
236 | %isList = ( ul => 1, ol => 1, dir => 1, menu => 1 ); |
237 | |
238 | =head2 hashset %HTML::Tagset::isTableElement |
239 | |
240 | This hashset contains all elements that are to be found only in/under |
241 | a "table" element. |
242 | |
243 | =cut |
244 | |
245 | %isTableElement = map +( $_ => 1 ), |
246 | qw( caption colgroup col thead tfoot tbody tr th td ); |
247 | |
248 | =head2 hashset %HTML::Tagset::isFormElement |
249 | |
250 | This hashset contains all elements that are to be found only in/under |
251 | a "form" element. |
252 | |
253 | =cut |
254 | |
255 | %isFormElement = ( input => 1, select => 1, option => 1, optgroup => 1, |
256 | textarea => 1, button => 1, label => 1 ); |
257 | |
258 | =head2 hashset %HTML::Tagset::isBodyElement |
259 | |
260 | This hashset contains all elements that are to be found only in/under |
261 | the "body" element of an HTML document. |
262 | |
263 | =cut |
264 | |
265 | %isBodyElement = map +( $_ => 1 ), |
266 | # tags named directly ("center" only needs to be listed once) |
267 | qw( |
268 | h1 h2 h3 h4 h5 h6 |
269 | p div pre plaintext address blockquote |
270 | xmp listing center |
271 | multicol |
272 | iframe ilayer nolayer |
273 | bgsound |
274 | |
275 | hr |
276 | ol ul dir menu li |
277 | dl dt dd |
278 | ins del |
279 | |
280 | fieldset legend |
281 | |
282 | map area |
283 | applet param object |
284 | isindex script noscript |
285 | table |
286 | form |
287 | ), |
288 | # plus every member of the form, phrasal and table hashsets |
289 | keys %isFormElement, |
290 | keys %isPhraseMarkup, |
291 | keys %isTableElement, |
292 | ; |
293 | |
294 | |
295 | =head2 hashset %HTML::Tagset::isHeadOrBodyElement |
296 | |
297 | This hashset includes all elements that I notice can fall either in |
298 | the head or in the body. |
299 | |
300 | =cut |
301 | |
302 | %isHeadOrBodyElement = map { ($_ => 1) } |
303 | qw( area bgsound isindex map noscript object param script style ); |
304 | # So a 'script' (say) turning up in either the 'head' or the 'body' is nothing to freak out about. |
305 | |
306 | |
307 | =head2 hashset %HTML::Tagset::isKnown |
308 | |
309 | This hashset lists all known HTML elements. |
310 | |
311 | =cut |
312 | |
313 | %isKnown = ( |
314 | %isHeadElement, %isBodyElement, |
315 | # the structural and frame tags, then the "~" pseudo-elements |
316 | # NOTE(review): %emptyElement says ~declaration where this says ~directive -- confirm. |
317 | (map { ($_ => 1) } qw( head body html frame frameset noframes |
318 | ~comment ~pi ~directive ~literal )), |
319 | ); # intended to be all known tags ever ever |
320 | |
321 | |
322 | =head2 hashset %HTML::Tagset::canTighten |
323 | |
324 | This hashset lists elements that might have ignorable whitespace as |
325 | children or siblings. |
326 | |
327 | =cut |
328 | |
329 | # Begin with every known element ... |
330 | %canTighten = %isKnown; |
331 | # ... then drop the ones that are not subject to tightening. |
332 | delete $canTighten{$_} for |
333 | keys(%isPhraseMarkup), qw( input select ), |
334 | qw( xmp listing plaintext pre ); # untightenable, and in a really special way |
335 | # 'hr' and 'br' are exceptional 'phrasal' things that ARE subject to tightening |
336 | # ('br' is in %isPhraseMarkup, so it was deleted just above); put them back. |
337 | $canTighten{$_} = 1 for qw( hr br ); |
338 | |
339 | # The one case where I can think of my tightening rules failing is: |
340 | # <p>foo bar<center> <em>baz quux</em> ... |
341 | # ^-- that would get deleted. |
342 | # But that's pretty gruesome code anyhow. You gets what you pays for. |
343 | |
344 | #========================================================================== |
345 | |
346 | =head2 array @HTML::Tagset::p_closure_barriers |
347 | |
348 | This array has a meaning that I have only seen a need for in |
349 | C<HTML::TreeBuilder>, but I include it here on the off chance that someone |
350 | might find it of use: |
351 | |
352 | When we see a "E<lt>pE<gt>" token, we go looking up the lineage for a p |
353 | element we might have to minimize. At first sight, we might say that |
354 | if there's a p anywhere in the lineage of this new p, it should be |
355 | closed. But that's wrong. Consider this document: |
356 | |
357 | <html> |
358 | <head> |
359 | <title>foo</title> |
360 | </head> |
361 | <body> |
362 | <p>foo |
363 | <table> |
364 | <tr> |
365 | <td> |
366 | foo |
367 | <p>bar |
368 | </td> |
369 | </tr> |
370 | </table> |
371 | </p> |
372 | </body> |
373 | </html> |
374 | |
375 | The second p is quite legally inside a much higher p. |
376 | |
377 | My formalization of the reason why this is legal, but this: |
378 | |
379 | <p>foo<p>bar</p></p> |
380 | |
381 | isn't, is that something about the table constitutes a "barrier" to |
382 | the application of the rule about what p must minimize. |
383 | |
384 | So C<@HTML::Tagset::p_closure_barriers> is the list of all such |
385 | barrier-tags. |
386 | |
387 | =cut |
388 | |
389 | @p_closure_barriers = ( |
390 | qw( li blockquote ), |
391 | qw( ul ol menu dir ), |
392 | qw( dl dt dd ), |
393 | qw( td th tr table caption ), |
394 | 'div', |
395 | ); |
396 | |
397 | # In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this |
398 | # monkey business of barriers to minimization! |
399 | |
400 | =head2 hashset %HTML::Tagset::isCDATA_Parent |
401 | |
402 | This hashset includes all elements whose content is CDATA. |
403 | |
404 | =cut |
405 | |
406 | %isCDATA_Parent = ( script => 1, style => 1, xmp => 1, |
407 | listing => 1, plaintext => 1 ); |
408 | |
409 | # TODO: there's nothing else that takes CDATA children, right? |
410 | |
411 | # As the HTML3 DTD (Raggett 1995-04-24) noted: |
412 | # The XMP, LISTING and PLAINTEXT tags are incompatible with SGML |
413 | # and derive from very early versions of HTML. They require non- |
414 | # standard parsers and will cause problems for processing |
415 | # documents with standard SGML tools. |
416 | |
417 | |
418 | =head1 CAVEATS |
419 | |
420 | You may find it useful to alter the behavior of modules (like |
421 | C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s |
422 | data tables by altering the data tables themselves. You are welcome |
423 | to try, but be careful; and be aware that different modules may or may |
424 | not react differently to the data tables being changed. |
425 | |
426 | Note that it may be inappropriate to use these tables for I<producing> |
427 | HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames |
428 | for all elements that can appear either in the head or in the body, |
429 | such as "script". That doesn't mean that I am saying your code that |
430 | produces HTML should feel free to put script elements in either place! |
431 | If you are producing programs that spit out HTML, you should be |
432 | I<intimately> familiar with the DTDs for HTML or XHTML (available at |
433 | C<http://www.w3.org/>), and you should slavishly obey them, not |
434 | the data tables in this document. |
435 | |
436 | =head1 SEE ALSO |
437 | |
438 | L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor> |
439 | |
440 | =head1 COPYRIGHT & LICENSE |
441 | |
442 | Copyright 1995-2000 Gisle Aas. |
443 | |
444 | Copyright 2000-2005 Sean M. Burke. |
445 | |
446 | Copyright 2005-2008 Andy Lester. |
447 | |
448 | This program is free software; you can redistribute it and/or modify it |
449 | under the same terms as Perl itself. |
450 | |
451 | =head1 ACKNOWLEDGEMENTS |
452 | |
453 | Most of the code/data in this module was adapted from code written |
454 | by Gisle Aas for C<HTML::Element>, C<HTML::TreeBuilder>, and |
455 | C<HTML::LinkExtor>. Then it was maintained by Sean M. Burke. |
456 | |
457 | =head1 AUTHOR |
458 | |
459 | Current maintainer: Andy Lester, C<< <andy at petdance.com> >> |
460 | |
461 | =head1 BUGS |
462 | |
463 | Please report any bugs or feature requests to |
464 | C<bug-html-tagset at rt.cpan.org>, or through the web interface at |
465 | L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Tagset>. I will |
466 | be notified, and then you'll automatically be notified of progress on |
467 | your bug as I make changes. |
468 | |
469 | =cut |
470 | |
471 | 1; |