Add built local::lib
[catagits/Gitalist.git] / local-lib5 / lib / perl5 / HTML / Tagset.pm
CommitLineData
3fea05b9 1package HTML::Tagset;
2
3use strict;
4
5=head1 NAME
6
7HTML::Tagset - data tables useful in parsing HTML
8
9=head1 VERSION
10
11Version 3.20
12
13=cut
14
15use vars qw( $VERSION );
16
17$VERSION = '3.20';
18
19=head1 SYNOPSIS
20
21 use HTML::Tagset;
22 # Then use any of the items in the HTML::Tagset package
23 # as need arises
24
25=head1 DESCRIPTION
26
27This module contains several data tables useful in various kinds of
28HTML parsing operations.
29
30Note that all tag names used are lowercase.
31
32In the following documentation, a "hashset" is a hash being used as a
33set -- the hash conveys that its keys are there, and the actual values
34associated with the keys are not significant. (But what values are
35there, are always true.)
36
37=cut
38
39use vars qw(
40 $VERSION
41 %emptyElement %optionalEndTag %linkElements %boolean_attr
42 %isHeadElement %isBodyElement %isPhraseMarkup
43 %is_Possible_Strict_P_Content
44 %isHeadOrBodyElement
45 %isList %isTableElement %isFormElement
46 %isKnown %canTighten
47 @p_closure_barriers
48 %isCDATA_Parent
49);
50
51=head1 VARIABLES
52
53Note that none of these variables are exported.
54
55=head2 hashset %HTML::Tagset::emptyElement
56
This hashset has as keys the tag-names (GIs) of elements that cannot
have content. (For example, "base", "br", "hr".) So
59C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
60C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
61
62=cut
63
# Hashset of elements that cannot have content.
# The "~"-initial names are for pseudo-elements used by HTML::Entities
# and TreeBuilder.
%emptyElement = map +( $_ => 1 ), (
    qw( base link meta isindex ),
    qw( img br hr wbr ),
    qw( input area param ),
    qw( embed bgsound spacer ),
    qw( basefont col frame ),
    qw( ~comment ~literal ~declaration ~pi ),
);
74
75=head2 hashset %HTML::Tagset::optionalEndTag
76
This hashset lists tag-names for elements that can have content, but whose
end-tags are generally, "safely", omissible. Example:
C<$HTML::Tagset::optionalEndTag{'li'}> exists and is true.
80
81=cut
82
# Hashset of elements whose end-tag may be omitted.
# "option", "th", "tr" and "td" were left commented out of this list.
%optionalEndTag = (
    'p'  => 1,
    'li' => 1,
    'dt' => 1,
    'dd' => 1,
);
84
85=head2 hash %HTML::Tagset::linkElements
86
Keys in this hash are tagnames for elements that might contain
links, and the value for each is a reference to an array of the names
of attributes whose values can be links.
90
91=cut
92
# Maps an element name to a reference to an array naming the attributes
# of that element whose values can be links.  Every element gets its own
# array, so changing one entry never affects another.
%linkElements = (
    'a'          => [ qw( href ) ],
    'applet'     => [ qw( archive codebase code ) ],
    'area'       => [ qw( href ) ],
    'base'       => [ qw( href ) ],
    'bgsound'    => [ qw( src ) ],
    'blockquote' => [ qw( cite ) ],
    'body'       => [ qw( background ) ],
    'del'        => [ qw( cite ) ],
    'embed'      => [ qw( pluginspage src ) ],
    'form'       => [ qw( action ) ],
    'frame'      => [ qw( src longdesc ) ],
    'head'       => [ qw( profile ) ],
    'iframe'     => [ qw( src longdesc ) ],
    'ilayer'     => [ qw( background ) ],
    'img'        => [ qw( src lowsrc longdesc usemap ) ],
    'input'      => [ qw( src usemap ) ],
    'ins'        => [ qw( cite ) ],
    'isindex'    => [ qw( action ) ],
    'layer'      => [ qw( background src ) ],
    'link'       => [ qw( href ) ],
    'object'     => [ qw( classid codebase data archive usemap ) ],
    'q'          => [ qw( cite ) ],
    'script'     => [ qw( src for ) ],
    'table'      => [ qw( background ) ],
    'td'         => [ qw( background ) ],
    'th'         => [ qw( background ) ],
    'tr'         => [ qw( background ) ],
    'xmp'        => [ qw( href ) ],
);
124
125=head2 hash %HTML::Tagset::boolean_attr
126
127This hash (not hashset) lists what attributes of what elements can be
128printed without showing the value (for example, the "noshade" attribute
129of "hr" elements). For elements with only one such attribute, its value
130is simply that attribute name. For elements with many such attributes,
131the value is a reference to a hashset containing all such attributes.
132
133=cut
134
# Maps an element name to the attribute(s) of that element that can be
# printed without a value.
# TODO: make these all hashes
%boolean_attr = (
    # One such attribute: the value is simply the attribute name.
    'area'   => 'nohref',
    'hr'     => 'noshade',
    'img'    => 'ismap',
    'option' => 'selected',
    'select' => 'multiple',
    ( map {; $_ => 'compact' } qw( dir dl menu ol ul ) ),
    ( map {; $_ => 'nowrap'  } qw( td th ) ),

    # Several such attributes: the value is a reference to a hashset.
    'input'  => +{ map {; $_ => 1 } qw( checked readonly disabled ) },
);
151
152#==========================================================================
153# List of all elements from Extensible HTML version 1.0 Transitional DTD:
154#
155# a abbr acronym address applet area b base basefont bdo big
156# blockquote body br button caption center cite code col colgroup
157# dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
158# head hr html i iframe img input ins isindex kbd label legend li
159# link map menu meta noframes noscript object ol optgroup option p
160# param pre q s samp script select small span strike strong style
161# sub sup table tbody td textarea tfoot th thead title tr tt u ul
162# var
163#
164# Varia from Mozilla source internal table of tags:
165# Implemented:
166# xmp listing wbr nobr frame frameset noframes ilayer
167# layer nolayer spacer embed multicol
168# But these are unimplemented:
169# sound?? keygen?? server??
170# Also seen here and there:
171# marquee?? app?? (both unimplemented)
172#==========================================================================
173
174=head2 hashset %HTML::Tagset::isPhraseMarkup
175
176This hashset contains all phrasal-level elements.
177
178=cut
179
# Hashset of phrasal-level elements.
# had: center, hr, table
%isPhraseMarkup = map +( $_ => 1 ), (
    qw( span abbr acronym q sub sup ),
    qw( cite code em kbd samp strong var dfn strike ),
    qw( b i u s tt small big ),
    qw( a img br ),
    qw( wbr nobr blink ),
    qw( font basefont bdo ),
    qw( spacer embed noembed ),
);
189
190
191=head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content
192
This hashset contains all phrasal-level elements that can be content of a
P element, for a strict model of HTML.
195
196=cut
197
# Hashset of elements that may appear inside a "p" element under a strict
# model of HTML: everything phrasal, the form elements, and a few extras.
#
# %isFormElement is only assigned further down in this file, so spreading
# it here would contribute nothing at load time and the form elements would
# be silently absent from this set.  Its members are therefore listed
# explicitly; keep this list in sync with %isFormElement.
%is_Possible_Strict_P_Content = (
    %isPhraseMarkup,
    map {; $_ => 1 } qw(
        input select option optgroup textarea button label
        object script map
    )
    # I've no idea why there's these latter exceptions (object, script, map).
    # I'm just following the HTML4.01 DTD.
);
205
206#from html4 strict:
207#<!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
208#
209#<!ENTITY % phrase "EM | STRONG | DFN | CODE |
210# SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
211#
212#<!ENTITY % special
213# "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
214#
215#<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
216#
217#<!-- %inline; covers inline or "text-level" elements -->
218#<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
219
220=head2 hashset %HTML::Tagset::isHeadElement
221
This hashset contains all elements that should be
present only in the 'head' element of an HTML document.
224
225=cut
226
# Hashset of elements treated as belonging in the document's "head".
%isHeadElement = map +( $_ => 1 ), qw(
    title base link meta isindex
    script style object bgsound
);
229
230=head2 hashset %HTML::Tagset::isList
231
232This hashset contains all elements that can contain "li" elements.
233
234=cut
235
# Hashset of elements that can contain "li" elements.
%isList = (
    'ul'   => 1,
    'ol'   => 1,
    'dir'  => 1,
    'menu' => 1,
);
237
238=head2 hashset %HTML::Tagset::isTableElement
239
240This hashset contains all elements that are to be found only in/under
241a "table" element.
242
243=cut
244
# Hashset of elements found only in/under a "table" element.
%isTableElement = map +( $_ => 1 ), (
    qw( tr td th ),
    qw( thead tbody tfoot ),
    qw( caption col colgroup ),
);
247
248=head2 hashset %HTML::Tagset::isFormElement
249
250This hashset contains all elements that are to be found only in/under
251a "form" element.
252
253=cut
254
# Hashset of elements found only in/under a "form" element.
%isFormElement = map +( $_ => 1 ), qw(
    input select option optgroup
    textarea button label
);
257
=head2 hashset %HTML::Tagset::isBodyElement
259
260This hashset contains all elements that are to be found only in/under
261the "body" element of an HTML document.
262
263=cut
264
# Hashset of elements found only in/under the "body" element: the names
# listed here plus every form, phrasal and table element.
%isBodyElement = map +( $_ => 1 ), (
    qw( h1 h2 h3 h4 h5 h6 ),
    qw( p div pre plaintext address blockquote ),
    qw( xmp listing ),
    qw( center ),
    qw( multicol ),
    qw( iframe ilayer nolayer ),
    qw( bgsound ),
    qw( hr ),
    qw( ol ul dir menu li ),
    qw( dl dt dd ),
    qw( ins del ),
    qw( fieldset legend ),
    qw( map area ),
    qw( applet param object ),
    qw( isindex script noscript ),
    qw( table ),
    qw( form ),
    keys(%isFormElement),
    keys(%isPhraseMarkup),    # And everything phrasal
    keys(%isTableElement),
);
293
294
295=head2 hashset %HTML::Tagset::isHeadOrBodyElement
296
297This hashset includes all elements that I notice can fall either in
298the head or in the body.
299
300=cut
301
# Hashset of elements that can fall either in the head or in the body,
# i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
%isHeadOrBodyElement = map +( $_ => 1 ), (
    qw( script isindex style object ),
    qw( map area param noscript bgsound ),
);
305
306
307=head2 hashset %HTML::Tagset::isKnown
308
309This hashset lists all known HTML elements.
310
311=cut
312
# Hashset of all known elements: every head element, every body element,
# the document-structure and frame elements, and the "~" pseudo-elements.
# That should be all known tags ever ever.
%isKnown = ( %isHeadElement, %isBodyElement );
$isKnown{$_} = 1 for (
    qw( head body html ),
    qw( frame frameset noframes ),
    qw( ~comment ~pi ~directive ~literal ),
);
320
321
322=head2 hashset %HTML::Tagset::canTighten
323
324This hashset lists elements that might have ignorable whitespace as
325children or siblings.
326
327=cut
328
# Start from every known element, then take out the ones that must not be
# tightened.
%canTighten = %isKnown;
delete $canTighten{$_} for (
    keys(%isPhraseMarkup),
    qw( input select ),
    # xmp, listing, plaintext, and pre are untightenable, and
    # in a really special way.
    qw( xmp listing plaintext pre ),
);
# exceptional 'phrasal' things that ARE subject to tightening.
$canTighten{$_} = 1 for qw( hr br );
338
339# The one case where I can think of my tightening rules failing is:
340# <p>foo bar<center> <em>baz quux</em> ...
341# ^-- that would get deleted.
342# But that's pretty gruesome code anyhow. You gets what you pays for.
343
344#==========================================================================
345
346=head2 array @HTML::Tagset::p_closure_barriers
347
348This array has a meaning that I have only seen a need for in
349C<HTML::TreeBuilder>, but I include it here on the off chance that someone
350might find it of use:
351
When we see a "E<lt>pE<gt>" token, we go looking up the lineage for a p
353element we might have to minimize. At first sight, we might say that
354if there's a p anywhere in the lineage of this new p, it should be
355closed. But that's wrong. Consider this document:
356
357 <html>
358 <head>
359 <title>foo</title>
360 </head>
361 <body>
362 <p>foo
363 <table>
364 <tr>
365 <td>
366 foo
367 <p>bar
368 </td>
369 </tr>
370 </table>
371 </p>
372 </body>
373 </html>
374
375The second p is quite legally inside a much higher p.
376
377My formalization of the reason why this is legal, but this:
378
379 <p>foo<p>bar</p></p>
380
381isn't, is that something about the table constitutes a "barrier" to
382the application of the rule about what p must minimize.
383
384So C<@HTML::Tagset::p_closure_barriers> is the list of all such
385barrier-tags.
386
387=cut
388
# Tags that act as a barrier to the rule about which open "p" a new "p"
# must minimize.
@p_closure_barriers = (
    qw( li blockquote ),
    qw( ul ol menu dir ),
    qw( dl dt dd ),
    qw( td th tr table caption ),
    qw( div ),
);
396
397# In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this
398# monkey business of barriers to minimization!
399
=head2 hashset %HTML::Tagset::isCDATA_Parent
401
402This hashset includes all elements whose content is CDATA.
403
404=cut
405
# Hashset of elements whose content is CDATA.
%isCDATA_Parent = (
    'script'    => 1,
    'style'     => 1,
    'xmp'       => 1,
    'listing'   => 1,
    'plaintext' => 1,
);
408
409# TODO: there's nothing else that takes CDATA children, right?
410
411# As the HTML3 DTD (Raggett 1995-04-24) noted:
412# The XMP, LISTING and PLAINTEXT tags are incompatible with SGML
413# and derive from very early versions of HTML. They require non-
414# standard parsers and will cause problems for processing
415# documents with standard SGML tools.
416
417
418=head1 CAVEATS
419
420You may find it useful to alter the behavior of modules (like
421C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s
422data tables by altering the data tables themselves. You are welcome
to try, but be careful; and be aware that different modules may or may
not react differently to the data tables being changed.
425
426Note that it may be inappropriate to use these tables for I<producing>
427HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames
428for all elements that can appear either in the head or in the body,
429such as "script". That doesn't mean that I am saying your code that
430produces HTML should feel free to put script elements in either place!
431If you are producing programs that spit out HTML, you should be
432I<intimately> familiar with the DTDs for HTML or XHTML (available at
433C<http://www.w3.org/>), and you should slavishly obey them, not
434the data tables in this document.
435
436=head1 SEE ALSO
437
438L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor>
439
440=head1 COPYRIGHT & LICENSE
441
442Copyright 1995-2000 Gisle Aas.
443
444Copyright 2000-2005 Sean M. Burke.
445
446Copyright 2005-2008 Andy Lester.
447
448This program is free software; you can redistribute it and/or modify it
449under the same terms as Perl itself.
450
451=head1 ACKNOWLEDGEMENTS
452
453Most of the code/data in this module was adapted from code written
454by Gisle Aas for C<HTML::Element>, C<HTML::TreeBuilder>, and
455C<HTML::LinkExtor>. Then it was maintained by Sean M. Burke.
456
457=head1 AUTHOR
458
459Current maintainer: Andy Lester, C<< <andy at petdance.com> >>
460
461=head1 BUGS
462
463Please report any bugs or feature requests to
464C<bug-html-tagset at rt.cpan.org>, or through the web interface at
465L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Tagset>. I will
466be notified, and then you'll automatically be notified of progress on
467your bug as I make changes.
468
469=cut
470
4711;