parse("ram=bar>");
+ $ok;
+}->join();
+
+is($ok,2);
+
diff --git a/ext/HTML/Parser/t/tokeparser.t b/ext/HTML/Parser/t/tokeparser.t
new file mode 100644
index 0000000..2084201
--- /dev/null
+++ b/ext/HTML/Parser/t/tokeparser.t
@@ -0,0 +1,164 @@
+use Test::More tests => 17;
+
+use strict;
+use HTML::TokeParser;
+
+# First we create an HTML document to test
+
+my $file = "ttest$$.htm";
+die "$file already exists" if -e $file;
+
+open(F, ">$file") or die "Can't create $file: $!";
+print F <<'EOT'; close(F);
+
+
+
+ This is the <title>
+
+
+
+
+
+
+
+ This is the title again
+
+
+ And this is a link to the Institute
+
+
process instruction >
+
+
+
+
+EOT
+
+END { unlink($file) || warn "Can't unlink $file: $!"; }
+
+
+my $p;
+
+
+$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
+ok($p->unbroken_text);
+if ($p->get_tag("foo", "title")) {
+ my $title = $p->get_trimmed_text;
+ #diag "Title: $title";
+ is($title, "This is the ");
+}
+undef($p);
+
+# Test with reference to glob
+open(F, $file) || die "Can't open $file: $!";
+$p = HTML::TokeParser->new(\*F);
+my $scount = 0;
+my $ecount = 0;
+my $tcount = 0;
+my $pcount = 0;
+while (my $token = $p->get_token) {
+ $scount++ if $token->[0] eq "S";
+ $ecount++ if $token->[0] eq "E";
+ $pcount++ if $token->[0] eq "PI";
+}
+undef($p);
+close F;
+
+# Test with glob
+open(F, $file) || die "Can't open $file: $!";
+$p = HTML::TokeParser->new(*F);
+$tcount++ while $p->get_tag;
+undef($p);
+close F;
+
+# Test with plain file name
+$p = HTML::TokeParser->new($file) || die;
+$tcount++ while $p->get_tag;
+undef($p);
+
+#diag "Number of tokens found: $tcount/2 = $scount + $ecount";
+is($tcount, 34);
+is($scount, 10);
+is($ecount, 7);
+is($pcount, 1);
+is($tcount/2, $scount + $ecount);
+
+ok(!HTML::TokeParser->new("/noT/thEre/$$"));
+
+
+$p = HTML::TokeParser->new($file) || die;
+$p->get_tag("a");
+my $atext = $p->get_text;
+undef($p);
+
+is($atext, "Perl\240Institute");
+
+# test parsing of embeded document
+$p = HTML::TokeParser->new(\<Title
+
+Heading
+
+HTML
+
+ok($p->get_tag("h1"));
+is($p->get_trimmed_text, "Heading");
+undef($p);
+
+# test parsing of large embedded documents
+my $doc = "foo is bar\n\n\n" x 2022;
+
+#use Time::HiRes qw(time);
+my $start = time;
+$p = HTML::TokeParser->new(\$doc);
+#diag "Construction time: ", time - $start;
+
+my $count;
+while (my $t = $p->get_token) {
+ $count++ if $t->[0] eq "S";
+}
+#diag "Parse time: ", time - $start;
+
+is($count, 2022);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+This is a heading
+This is some
text.
+
+This is some more text.
+
+This is even some more.
+EOT
+
+$p->get_tag("/h1");
+
+my $t = $p->get_trimmed_text("br", "p");
+is($t, "This is some text.");
+
+$p->get_tag;
+
+$t = $p->get_trimmed_text("br", "p");
+is($t,"This is some more text.");
+
+undef($p);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+
This is a bold heading
+This is some italic text.
This is some more text.
+
+This is even some more.
+EOT
+
+$p->get_tag("h1");
+
+$t = $p->get_phrase;
+is($t, "This is a bold heading");
+
+$t = $p->get_phrase;
+is($t, "");
+
+$p->get_tag;
+
+$t = $p->get_phrase;
+is($t, "This is some italic text. This is some more text.");
+
+undef($p);
diff --git a/ext/HTML/Parser/t/uentities.t b/ext/HTML/Parser/t/uentities.t
new file mode 100644
index 0000000..b9decc5
--- /dev/null
+++ b/ext/HTML/Parser/t/uentities.t
@@ -0,0 +1,67 @@
+# Test Unicode entities
+
+use HTML::Entities;
+
+use Test::More tests => 27;
+
+SKIP: {
+skip "This perl does not support Unicode or Unicode entities not selected",
+ 27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT;
+
+is(decode_entities("&euro"), "&euro");
+is(decode_entities("€"), "\x{20AC}");
+
+is(decode_entities("å"), "å");
+is(decode_entities("å"), "å");
+
+is(decode_entities(""), chr(500000));
+
+is(decode_entities(""), "\x{10FFFD}");
+
+is(decode_entities(""), "\x{FFFC}");
+
+
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), chr(0xFFFD));
+is(decode_entities(""), chr(0xFFFD));
+
+is(decode_entities(""), "\0");
+is(decode_entities(""), "\0");
+is(decode_entities(""), "\0");
+is(decode_entities(""), "\0");
+
+is(decode_entities("ååå"), "ååå\x{FFF}");
+
+# This might fail when we get more than 64 bit UVs
+is(decode_entities(""), "");
+is(decode_entities(""), "");
+
+my $err;
+for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) {
+ my $a = join("", map chr, $_->[0] .. $_->[1]);
+
+ my $e = encode_entities($a);
+ my $d = decode_entities($e);
+
+ unless ($d eq $a) {
+ diag "Wrong decoding in range $_->[0] .. $_->[1]";
+ # use Devel::Peek; Dump($a); Dump($d);
+ $err++;
+ }
+}
+ok(!$err);
+
+
+is(decode_entities(""), chr(0x100085));
+
+is(decode_entities(""), chr(0x100085));
+
+is(decode_entities(""), chr(0xFFFD));
+
+is(decode_entities("\260’\260"), "\x{b0}\x{2019}\x{b0}");
+}
diff --git a/ext/HTML/Parser/t/unbroken-text.t b/ext/HTML/Parser/t/unbroken-text.t
new file mode 100644
index 0000000..7de85a9
--- /dev/null
+++ b/ext/HTML/Parser/t/unbroken-text.t
@@ -0,0 +1,60 @@
+use strict;
+use HTML::Parser;
+
+use Test::More tests => 3;
+
+my $text = "";
+sub text
+{
+ my $cdata = shift() ? "CDATA" : "TEXT";
+ my($offset, $line, $col, $t) = @_;
+ $text .= "[$cdata:$offset:$line.$col:$t]";
+}
+
+sub tag
+{
+ $text .= shift;
+}
+
+my $p = HTML::Parser->new(unbroken_text => 1,
+ text_h => [\&text, "is_cdata,offset,line,column,text"],
+ start_h => [\&tag, "text"],
+ end_h => [\&tag, "text"],
+ );
+
+$p->parse("foo ");
+$p->parse("bar ");
+$p->parse("");
+$p->parse("bar\n");
+$p->parse("");
+$p->parse("
xmp");
+$p->parse("atend");
+
+#diag $text;
+is($text, "[TEXT:0:1.0:foo bar ][TEXT:13:1.13:bar\n][CDATA:28:2.11:xmp]");
+
+$text = "";
+$p->eof;
+
+#diag $text;
+is($text, "[TEXT:37:2.20:atend]");
+
+
+$p = HTML::Parser->new(unbroken_text => 1,
+ text_h => [\&text, "is_cdata,offset,line,column,text"],
+ );
+
+$text = "";
+$p->parse("foo");
+$p->parse("parse(">bar\n");
+$p->parse("fooparse("p>xmp");
+$p->parse("parse(">bar");
+$p->eof;
+
+#diag $text;
+is($text, "[TEXT:0:1.0:foobar\nfoo][CDATA:20:2.8:xmp][TEXT:29:2.17:bar]");
+
+
diff --git a/ext/HTML/Parser/t/unicode-bom.t b/ext/HTML/Parser/t/unicode-bom.t
new file mode 100644
index 0000000..34e066f
--- /dev/null
+++ b/ext/HTML/Parser/t/unicode-bom.t
@@ -0,0 +1,59 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 2;
+use HTML::Parser;
+
+SKIP: {
+skip "This perl does not support Unicode", 2 if $] < 5.008;
+
+my @parsed;
+my $p = HTML::Parser->new(
+ api_version => 3,
+ start_h => [\@parsed, 'tag, attr'],
+);
+
+my @warn;
+$SIG{__WARN__} = sub {
+ push(@warn, $_[0]);
+};
+
+$p->parse("\xEF\xBB\xBFHi there");
+$p->eof;
+
+#use Encode;
+$p->parse("\xEF\xBB\xBFHi there" . chr(0x263A));
+$p->eof;
+
+$p->parse("\xFF\xFEHi there");
+$p->eof;
+
+$p->parse("\xFE\xFFHi there");
+$p->eof;
+
+$p->parse("\0\0\xFF\xFEHi there");
+$p->eof;
+
+$p->parse("\xFE\xFF\0\0Hi there");
+$p->eof;
+
+is(join("", @warn), <new(
+ api_version => 3,
+ start_h => [\@parsed, 'tag'],
+);
+
+$p->parse("\xEF\xBB\xBFHi there");
+$p->eof;
+ok(!@warn);
+}
diff --git a/ext/HTML/Parser/t/unicode.t b/ext/HTML/Parser/t/unicode.t
new file mode 100644
index 0000000..82902de
--- /dev/null
+++ b/ext/HTML/Parser/t/unicode.t
@@ -0,0 +1,183 @@
+#!perl -w
+
+use strict;
+use HTML::Parser;
+use Test::More tests => 103;
+
+SKIP: {
+skip "This perl does not support Unicode", 103 if $] < 5.008;
+
+my @warn;
+$SIG{__WARN__} = sub {
+ push(@warn, $_[0]);
+};
+
+my @parsed;
+my $p = HTML::Parser->new(
+ api_version => 3,
+ default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'],
+);
+
+my $doc = "\x{263A}Smile ☺
\x{0420}";
+is(length($doc), 46);
+
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "");
+SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") };
+is($parsed[1][3], 0);
+is($parsed[1][4], 7);
+
+is($parsed[2][0], "text");
+is(ord($parsed[2][1]), 0x263A);
+is($parsed[2][2], chr(0x263A));
+is($parsed[2][3], 7);
+is($parsed[2][4], 1);
+is($parsed[2][5], 8);
+is($parsed[2][6], 7);
+
+is($parsed[3][0], "end");
+is($parsed[3][1], "");
+is($parsed[3][3], 8);
+is($parsed[3][6], 8);
+
+is($parsed[4][0], "start");
+is($parsed[4][1], "");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0");
+is($parsed[4][8]{id}, "\x{2600}");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile ☺");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[7][0], "text");
+is($parsed[7][1], "\x{0420}");
+is($parsed[7][2], "\x{0420}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+is(@warn, 0);
+
+# Try to parse it as an UTF8 encoded string
+utf8::encode($doc);
+is(length($doc), 51);
+
+@parsed = ();
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "");
+SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") };
+is($parsed[1][3], 0);
+is($parsed[1][4], 7);
+
+is($parsed[2][0], "text");
+is(ord($parsed[2][1]), 226);
+is($parsed[2][1], "\xE2\x98\xBA");
+is($parsed[2][2], "\xE2\x98\xBA");
+is($parsed[2][3], 7);
+is($parsed[2][4], 3);
+is($parsed[2][5], 10);
+is($parsed[2][6], 7);
+
+is($parsed[3][0], "end");
+is($parsed[3][1], "");
+is($parsed[3][3], 10);
+is($parsed[3][6], 10);
+
+is($parsed[4][0], "start");
+is($parsed[4][1], "");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0");
+is($parsed[4][8]{id}, "\xE2\x98\x80");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile ☺");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+my $file = "test-$$.html";
+open(my $fh, ">:utf8", $file) || die;
+print $fh <\x{263A} Love!
+♥ Love \x{2665}
+EOT
+close($fh) || die;
+
+@warn = ();
+@parsed = ();
+$p->parse_file($file);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "♥ Love \xE2\x99\xA5");
+is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage
+is($parsed[10][3], -s $file);
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw:utf8", $file) || die;
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\x{2665}");
+is($parsed[7][0], "text");
+is($parsed[7][1], "♥ Love \x{2665}");
+is($parsed[7][2], "\x{2665} Love \x{2665}");
+is($parsed[10][3], (-s $file) - 2 * 4);
+is(@warn, 0);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw", $file) || die;
+$p->utf8_mode(1);
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "♥ Love \xE2\x99\xA5");
+is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5");
+is($parsed[10][3], -s $file);
+is(@warn, 0);
+
+unlink($file);
+
+@parsed = ();
+$p->parse(q(foo))->eof;
+is(@parsed, "5");
+is($parsed[1][0], "start");
+is($parsed[1][8]{href}, "a=1&lang=2\xd7=3");
+
+ok(!HTML::Entities::_probably_utf8_chunk(""));
+ok(!HTML::Entities::_probably_utf8_chunk("f"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99"));
+}
diff --git a/ext/HTML/Parser/t/xml-mode.t b/ext/HTML/Parser/t/xml-mode.t
new file mode 100644
index 0000000..cdfc5b0
--- /dev/null
+++ b/ext/HTML/Parser/t/xml-mode.t
@@ -0,0 +1,112 @@
+use strict;
+use Test::More tests => 8;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(xml_mode => 1,
+ );
+
+my $text = "";
+$p->handler(start =>
+ sub {
+ my($tag, $attr) = @_;
+ $text .= "S[$tag";
+ for my $k (sort keys %$attr) {
+ my $v = $attr->{$k};
+ $text .= " $k=$v";
+ }
+ $text .= "]";
+ }, "tagname,attr");
+$p->handler(end =>
+ sub {
+ $text .= "E[" . shift() . "]";
+ }, "tagname");
+$p->handler(process =>
+ sub {
+ $text .= "PI[" . shift() . "]";
+ }, "token0");
+$p->handler(text =>
+ sub {
+ $text .= shift;
+ }, "text");
+
+my $xml = <<'EOT';
+
+
+
+My first architectual document
+Geir Ove Gronmo, grove@infotek.no
+This is the first paragraph in this document
+This is the second paragraph
+
+
+
+EOT
+
+$p->parse($xml)->eof;
+
+is($text, <<'EOT');
+PI[xml version="1.0"]
+PI[IS10744:arch name="html"]
+S[DOC]
+S[title html=h1]My first architectual documentE[title]
+S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
+S[para]This is the first paragraph in this documentE[para]
+S[para html=p]This is the second paragraphE[para]
+S[para]E[para]
+S[xmp]S[foo]E[foo]E[xmp]
+E[DOC]
+EOT
+
+$text = "";
+$p->xml_mode(0);
+$p->parse($xml)->eof;
+
+is($text, <<'EOT');
+PI[xml version="1.0"?]
+PI[IS10744:arch name="html"?]
+S[doc]
+S[title html=h1]My first architectual documentE[title]
+S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
+S[para]This is the first paragraph in this documentE[para]
+S[para html=p]This is the second paragraphE[para]
+S[para/]
+S[xmp]E[xmp]
+E[doc]
+EOT
+
+# Test that we get an empty tag back
+$p = HTML::Parser->new(api_version => 3,
+ xml_mode => 1);
+
+$p->handler("end" =>
+ sub {
+ my($tagname, $text) = @_;
+ is($tagname, "Xyzzy");
+ ok(!length($text));
+ }, "tagname,text");
+$p->parse("and some more")->eof;
+
+# Test that we get an empty tag back
+$p = HTML::Parser->new(api_version => 3,
+ empty_element_tags => 1);
+
+$p->handler("end" =>
+ sub {
+ my($tagname, $text) = @_;
+ is($tagname, "xyzzy");
+ ok(!length($text));
+ }, "tagname,text");
+$p->parse("and some more")->eof;
+
+$p = HTML::Parser->new(
+ api_version => 3,
+ xml_pic => 1,
+);
+
+$p->handler(
+ "process" => sub {
+ my($text, $t0) = @_;
+ is($text, " bar?>");
+ is($t0, "foo > bar");
+ }, "text, token0");
+$p->parse(" bar?> and then")->eof;
diff --git a/ext/HTML/Parser/tokenpos.h b/ext/HTML/Parser/tokenpos.h
new file mode 100644
index 0000000..aa971bf
--- /dev/null
+++ b/ext/HTML/Parser/tokenpos.h
@@ -0,0 +1,49 @@
+struct token_pos
+{
+ char *beg;
+ char *end;
+};
+typedef struct token_pos token_pos_t;
+
+#define dTOKENS(init_lim) \
+ token_pos_t token_buf[init_lim]; \
+ int token_lim = init_lim; \
+ token_pos_t *tokens = token_buf; \
+ int num_tokens = 0
+
+#define PUSH_TOKEN(p_beg, p_end) \
+ STMT_START { \
+ ++num_tokens; \
+ if (num_tokens == token_lim) \
+ tokens_grow(&tokens, &token_lim, (bool)(tokens != token_buf)); \
+ tokens[num_tokens-1].beg = p_beg; \
+ tokens[num_tokens-1].end = p_end; \
+ } STMT_END
+
+#define FREE_TOKENS \
+ STMT_START { \
+ if (tokens != token_buf) \
+ Safefree(tokens); \
+ } STMT_END
+
+static void
+tokens_grow(token_pos_t **token_ptr, int *token_lim_ptr, bool tokens_on_heap)
+{
+ int new_lim = *token_lim_ptr;
+ if (new_lim < 4)
+ new_lim = 4;
+ new_lim *= 2;
+
+ if (tokens_on_heap) {
+ Renew(*token_ptr, new_lim, token_pos_t);
+ }
+ else {
+ token_pos_t *new_tokens;
+ int i;
+ New(57, new_tokens, new_lim, token_pos_t);
+ for (i = 0; i < *token_lim_ptr; i++)
+ new_tokens[i] = (*token_ptr)[i];
+ *token_ptr = new_tokens;
+ }
+ *token_lim_ptr = new_lim;
+}
diff --git a/ext/HTML/Parser/typemap b/ext/HTML/Parser/typemap
new file mode 100644
index 0000000..a323854
--- /dev/null
+++ b/ext/HTML/Parser/typemap
@@ -0,0 +1,5 @@
+PSTATE* T_PSTATE
+
+INPUT
+T_PSTATE
+ $var = get_pstate_hv(aTHX_ $arg)
diff --git a/ext/HTML/Parser/util.c b/ext/HTML/Parser/util.c
new file mode 100644
index 0000000..7e626bf
--- /dev/null
+++ b/ext/HTML/Parser/util.c
@@ -0,0 +1,312 @@
+/* $Id: util.c,v 2.30 2006/03/22 09:15:17 gisle Exp $
+ *
+ * Copyright 1999-2006, Gisle Aas.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the same terms as Perl itself.
+ */
+
+#ifndef EXTERN
+#define EXTERN extern
+#endif
+
+
+EXTERN SV*
+sv_lower(pTHX_ SV* sv)
+{
+ STRLEN len;
+ char *s = SvPV_force(sv, len);
+ for (; len--; s++)
+ *s = toLOWER(*s);
+ return sv;
+}
+
+EXTERN int
+strnEQx(const char* s1, const char* s2, STRLEN n, int ignore_case)
+{
+ while (n--) {
+ if (ignore_case) {
+ if (toLOWER(*s1) != toLOWER(*s2))
+ return 0;
+ }
+ else {
+ if (*s1 != *s2)
+ return 0;
+ }
+ s1++;
+ s2++;
+ }
+ return 1;
+}
+
+static void
+grow_gap(pTHX_ SV* sv, STRLEN grow, char** t, char** s, char** e)
+{
+ /*
+ SvPVX ---> AAAAAA...BBBBBB
+ ^ ^ ^
+ t s e
+ */
+ STRLEN t_offset = *t - SvPVX(sv);
+ STRLEN s_offset = *s - SvPVX(sv);
+ STRLEN e_offset = *e - SvPVX(sv);
+
+ SvGROW(sv, e_offset + grow + 1);
+
+ *t = SvPVX(sv) + t_offset;
+ *s = SvPVX(sv) + s_offset;
+ *e = SvPVX(sv) + e_offset;
+
+ Move(*s, *s+grow, *e - *s, char);
+ *s += grow;
+ *e += grow;
+}
+
+EXTERN SV*
+decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix)
+{
+ STRLEN len;
+ char *s = SvPV_force(sv, len);
+ char *t = s;
+ char *end = s + len;
+ char *ent_start;
+
+ char *repl;
+ STRLEN repl_len;
+#ifdef UNICODE_HTML_PARSER
+ char buf[UTF8_MAXLEN];
+ int repl_utf8;
+ int high_surrogate = 0;
+#else
+ char buf[1];
+#endif
+
+#if defined(__GNUC__) && defined(UNICODE_HTML_PARSER)
+ /* gcc -Wall reports this variable as possibly used uninitialized */
+ repl_utf8 = 0;
+#endif
+
+ while (s < end) {
+ assert(t <= s);
+
+ if ((*t++ = *s++) != '&')
+ continue;
+
+ ent_start = s;
+ repl = 0;
+
+ if (*s == '#') {
+ UV num = 0;
+ UV prev = 0;
+ int ok = 0;
+ s++;
+ if (*s == 'x' || *s == 'X') {
+ s++;
+ while (*s) {
+ char *tmp = strchr(PL_hexdigit, *s);
+ if (!tmp)
+ break;
+ num = num << 4 | ((tmp - PL_hexdigit) & 15);
+ if (prev && num <= prev) {
+ /* overflow */
+ ok = 0;
+ break;
+ }
+ prev = num;
+ s++;
+ ok = 1;
+ }
+ }
+ else {
+ while (isDIGIT(*s)) {
+ num = num * 10 + (*s - '0');
+ if (prev && num < prev) {
+ /* overflow */
+ ok = 0;
+ break;
+ }
+ prev = num;
+ s++;
+ ok = 1;
+ }
+ }
+ if (ok) {
+#ifdef UNICODE_HTML_PARSER
+ if (!SvUTF8(sv) && num <= 255) {
+ buf[0] = (char) num;
+ repl = buf;
+ repl_len = 1;
+ repl_utf8 = 0;
+ }
+ else {
+ char *tmp;
+ if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */
+ if (high_surrogate != 0) {
+ t -= 3; /* Back up past 0xFFFD */
+ num = ((high_surrogate - 0xD800) << 10) +
+ (num - 0xDC00) + 0x10000;
+ high_surrogate = 0;
+ } else {
+ num = 0xFFFD;
+ }
+ }
+ else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */
+ high_surrogate = num;
+ num = 0xFFFD;
+ }
+ else {
+ high_surrogate = 0;
+ /* otherwise invalid? */
+ if ((num >= 0xFDD0 && num <= 0xFDEF) ||
+ ((num & 0xFFFE) == 0xFFFE) ||
+ num > 0x10FFFF)
+ {
+ num = 0xFFFD;
+ }
+ }
+
+ tmp = (char*)uvuni_to_utf8((U8*)buf, num);
+ repl = buf;
+ repl_len = tmp - buf;
+ repl_utf8 = 1;
+ }
+#else
+ if (num <= 255) {
+ buf[0] = (char) num & 0xFF;
+ repl = buf;
+ repl_len = 1;
+ }
+#endif
+ }
+ }
+ else {
+ char *ent_name = s;
+ while (isALNUM(*s))
+ s++;
+ if (ent_name != s && entity2char) {
+ SV** svp;
+ if ( (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) ||
+ (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0)))
+ )
+ {
+ repl = SvPV(*svp, repl_len);
+#ifdef UNICODE_HTML_PARSER
+ repl_utf8 = SvUTF8(*svp);
+#endif
+ }
+ else if (expand_prefix) {
+ char *ss = s - 1;
+ while (ss > ent_name) {
+ svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0);
+ if (svp) {
+ repl = SvPV(*svp, repl_len);
+#ifdef UNICODE_HTML_PARSER
+ repl_utf8 = SvUTF8(*svp);
+#endif
+ s = ss;
+ break;
+ }
+ ss--;
+ }
+ }
+ }
+#ifdef UNICODE_HTML_PARSER
+ high_surrogate = 0;
+#endif
+ }
+
+ if (repl) {
+ char *repl_allocated = 0;
+ if (*s == ';')
+ s++;
+ t--; /* '&' already copied, undo it */
+
+#ifdef UNICODE_HTML_PARSER
+ if (*s != '&') {
+ high_surrogate = 0;
+ }
+
+ if (!SvUTF8(sv) && repl_utf8) {
+ /* need to upgrade sv before we continue */
+ STRLEN before_gap_len = t - SvPVX(sv);
+ char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len);
+ STRLEN after_gap_len = end - s;
+ char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len);
+
+ sv_setpvn(sv, before_gap, before_gap_len);
+ sv_catpvn(sv, after_gap, after_gap_len);
+ SvUTF8_on(sv);
+
+ Safefree(before_gap);
+ Safefree(after_gap);
+
+ s = t = SvPVX(sv) + before_gap_len;
+ end = SvPVX(sv) + before_gap_len + after_gap_len;
+ }
+ else if (SvUTF8(sv) && !repl_utf8) {
+ repl = (char*)bytes_to_utf8((U8*)repl, &repl_len);
+ repl_allocated = repl;
+ }
+#endif
+
+ if (t + repl_len > s) {
+ /* need to grow the string */
+ grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end);
+ }
+
+ /* copy replacement string into string */
+ while (repl_len--)
+ *t++ = *repl++;
+
+ if (repl_allocated)
+ Safefree(repl_allocated);
+ }
+ else {
+ while (ent_start < s)
+ *t++ = *ent_start++;
+ }
+ }
+
+ *t = '\0';
+ SvCUR_set(sv, t - SvPVX(sv));
+
+ return sv;
+}
+
+#ifdef UNICODE_HTML_PARSER
+static bool
+has_hibit(char *s, char *e)
+{
+ while (s < e) {
+ U8 ch = *s++;
+ if (!UTF8_IS_INVARIANT(ch)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+
+EXTERN bool
+probably_utf8_chunk(pTHX_ char *s, STRLEN len)
+{
+ char *e = s + len;
+ STRLEN clen;
+
+ /* ignore partial utf8 char at end of buffer */
+ while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1)))
+ e--;
+ if (s < e && UTF8_IS_START((U8)*(e - 1)))
+ e--;
+ clen = len - (e - s);
+ if (clen && UTF8SKIP(e) == clen) {
+ /* all promised continuation bytes are present */
+ e = s + len;
+ }
+
+ if (!has_hibit(s, e))
+ return 0;
+
+ return is_utf8_string((U8*)s, e - s);
+}
+#endif
diff --git a/lib/HTML/Tagset.pm b/lib/HTML/Tagset.pm
new file mode 100644
index 0000000..754137f
--- /dev/null
+++ b/lib/HTML/Tagset.pm
@@ -0,0 +1,471 @@
+package HTML::Tagset;
+
+use strict;
+
+=head1 NAME
+
+HTML::Tagset - data tables useful in parsing HTML
+
+=head1 VERSION
+
+Version 3.20
+
+=cut
+
+use vars qw( $VERSION );
+
+$VERSION = '3.20';
+
+=head1 SYNOPSIS
+
+ use HTML::Tagset;
+ # Then use any of the items in the HTML::Tagset package
+ # as need arises
+
+=head1 DESCRIPTION
+
+This module contains several data tables useful in various kinds of
+HTML parsing operations.
+
+Note that all tag names used are lowercase.
+
+In the following documentation, a "hashset" is a hash being used as a
+set -- the hash conveys that its keys are there, and the actual values
+associated with the keys are not significant. (But what values are
+there, are always true.)
+
+=cut
+
+use vars qw(
+ $VERSION
+ %emptyElement %optionalEndTag %linkElements %boolean_attr
+ %isHeadElement %isBodyElement %isPhraseMarkup
+ %is_Possible_Strict_P_Content
+ %isHeadOrBodyElement
+ %isList %isTableElement %isFormElement
+ %isKnown %canTighten
+ @p_closure_barriers
+ %isCDATA_Parent
+);
+
+=head1 VARIABLES
+
+Note that none of these variables are exported.
+
+=head2 hashset %HTML::Tagset::emptyElement
+
+This hashset has as values the tag-names (GIs) of elements that cannot
+have content. (For example, "base", "br", "hr".) So
+C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
+C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
+
+=cut
+
+%emptyElement = map {; $_ => 1 } qw(base link meta isindex
+ img br hr wbr
+ input area param
+ embed bgsound spacer
+ basefont col frame
+ ~comment ~literal
+ ~declaration ~pi
+ );
+# The "~"-initial names are for pseudo-elements used by HTML::Entities
+# and TreeBuilder
+
+=head2 hashset %HTML::Tagset::optionalEndTag
+
+This hashset lists tag-names for elements that can have content, but whose
+end-tags are generally, "safely", omissible. Example:
+C<$HTML::Tagset::emptyElement{'li'}> exists and is true.
+
+=cut
+
+%optionalEndTag = map {; $_ => 1 } qw(p li dt dd); # option th tr td);
+
+=head2 hash %HTML::Tagset::linkElements
+
+Values in this hash are tagnames for elements that might contain
+links, and the value for each is a reference to an array of the names
+of attributes whose values can be links.
+
+=cut
+
+%linkElements =
+(
+ 'a' => ['href'],
+ 'applet' => ['archive', 'codebase', 'code'],
+ 'area' => ['href'],
+ 'base' => ['href'],
+ 'bgsound' => ['src'],
+ 'blockquote' => ['cite'],
+ 'body' => ['background'],
+ 'del' => ['cite'],
+ 'embed' => ['pluginspage', 'src'],
+ 'form' => ['action'],
+ 'frame' => ['src', 'longdesc'],
+ 'iframe' => ['src', 'longdesc'],
+ 'ilayer' => ['background'],
+ 'img' => ['src', 'lowsrc', 'longdesc', 'usemap'],
+ 'input' => ['src', 'usemap'],
+ 'ins' => ['cite'],
+ 'isindex' => ['action'],
+ 'head' => ['profile'],
+ 'layer' => ['background', 'src'],
+ 'link' => ['href'],
+ 'object' => ['classid', 'codebase', 'data', 'archive', 'usemap'],
+ 'q' => ['cite'],
+ 'script' => ['src', 'for'],
+ 'table' => ['background'],
+ 'td' => ['background'],
+ 'th' => ['background'],
+ 'tr' => ['background'],
+ 'xmp' => ['href'],
+);
+
+=head2 hash %HTML::Tagset::boolean_attr
+
+This hash (not hashset) lists what attributes of what elements can be
+printed without showing the value (for example, the "noshade" attribute
+of "hr" elements). For elements with only one such attribute, its value
+is simply that attribute name. For elements with many such attributes,
+the value is a reference to a hashset containing all such attributes.
+
+=cut
+
+%boolean_attr = (
+# TODO: make these all hashes
+ 'area' => 'nohref',
+ 'dir' => 'compact',
+ 'dl' => 'compact',
+ 'hr' => 'noshade',
+ 'img' => 'ismap',
+ 'input' => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 },
+ 'menu' => 'compact',
+ 'ol' => 'compact',
+ 'option' => 'selected',
+ 'select' => 'multiple',
+ 'td' => 'nowrap',
+ 'th' => 'nowrap',
+ 'ul' => 'compact',
+);
+
+#==========================================================================
+# List of all elements from Extensible HTML version 1.0 Transitional DTD:
+#
+# a abbr acronym address applet area b base basefont bdo big
+# blockquote body br button caption center cite code col colgroup
+# dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
+# head hr html i iframe img input ins isindex kbd label legend li
+# link map menu meta noframes noscript object ol optgroup option p
+# param pre q s samp script select small span strike strong style
+# sub sup table tbody td textarea tfoot th thead title tr tt u ul
+# var
+#
+# Varia from Mozilla source internal table of tags:
+# Implemented:
+# xmp listing wbr nobr frame frameset noframes ilayer
+# layer nolayer spacer embed multicol
+# But these are unimplemented:
+# sound?? keygen?? server??
+# Also seen here and there:
+# marquee?? app?? (both unimplemented)
+#==========================================================================
+
+=head2 hashset %HTML::Tagset::isPhraseMarkup
+
+This hashset contains all phrasal-level elements.
+
+=cut
+
+%isPhraseMarkup = map {; $_ => 1 } qw(
+ span abbr acronym q sub sup
+ cite code em kbd samp strong var dfn strike
+ b i u s tt small big
+ a img br
+ wbr nobr blink
+ font basefont bdo
+ spacer embed noembed
+); # had: center, hr, table
+
+
+=head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content
+
+This hashset contains all phrasal-level elements that be content of a
+P element, for a strict model of HTML.
+
+=cut
+
+%is_Possible_Strict_P_Content = (
+ %isPhraseMarkup,
+ %isFormElement,
+ map {; $_ => 1} qw( object script map )
+ # I've no idea why there's these latter exceptions.
+ # I'm just following the HTML4.01 DTD.
+);
+
+#from html4 strict:
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+
+=head2 hashset %HTML::Tagset::isHeadElement
+
+This hashset contains all elements that elements that should be
+present only in the 'head' element of an HTML document.
+
+=cut
+
+%isHeadElement = map {; $_ => 1 }
+ qw(title base link meta isindex script style object bgsound);
+
+=head2 hashset %HTML::Tagset::isList
+
+This hashset contains all elements that can contain "li" elements.
+
+=cut
+
+%isList = map {; $_ => 1 } qw(ul ol dir menu);
+
+=head2 hashset %HTML::Tagset::isTableElement
+
+This hashset contains all elements that are to be found only in/under
+a "table" element.
+
+=cut
+
+%isTableElement = map {; $_ => 1 }
+ qw(tr td th thead tbody tfoot caption col colgroup);
+
+=head2 hashset %HTML::Tagset::isFormElement
+
+This hashset contains all elements that are to be found only in/under
+a "form" element.
+
+=cut
+
+%isFormElement = map {; $_ => 1 }
+ qw(input select option optgroup textarea button label);
+
+=head2 hashset %HTML::Tagset::isBodyMarkup
+
+This hashset contains all elements that are to be found only in/under
+the "body" element of an HTML document.
+
+=cut
+
+%isBodyElement = map {; $_ => 1 } qw(
+ h1 h2 h3 h4 h5 h6
+ p div pre plaintext address blockquote
+ xmp listing
+ center
+
+ multicol
+ iframe ilayer nolayer
+ bgsound
+
+ hr
+ ol ul dir menu li
+ dl dt dd
+ ins del
+
+ fieldset legend
+
+ map area
+ applet param object
+ isindex script noscript
+ table
+ center
+ form
+ ),
+ keys %isFormElement,
+ keys %isPhraseMarkup, # And everything phrasal
+ keys %isTableElement,
+;
+
+
+=head2 hashset %HTML::Tagset::isHeadOrBodyElement
+
+This hashset includes all elements that I notice can fall either in
+the head or in the body.
+
+=cut
+
+%isHeadOrBodyElement = map {; $_ => 1 }
+ qw(script isindex style object map area param noscript bgsound);
+ # i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
+
+
+=head2 hashset %HTML::Tagset::isKnown
+
+This hashset lists all known HTML elements.
+
+=cut
+
+%isKnown = (%isHeadElement, %isBodyElement,
+ map{; $_=>1 }
+ qw( head body html
+ frame frameset noframes
+ ~comment ~pi ~directive ~literal
+));
+ # that should be all known tags ever ever
+
+
+=head2 hashset %HTML::Tagset::canTighten
+
+This hashset lists elements that might have ignorable whitespace as
+children or siblings.
+
+=cut
+
+%canTighten = %isKnown;
+delete @canTighten{
+ keys(%isPhraseMarkup), 'input', 'select',
+ 'xmp', 'listing', 'plaintext', 'pre',
+};
+ # xmp, listing, plaintext, and pre are untightenable, and
+ # in a really special way.
+@canTighten{'hr','br'} = (1,1);
+ # exceptional 'phrasal' things that ARE subject to tightening.
+
+# The one case where I can think of my tightening rules failing is:
+# foo bar
baz quux ...
+# ^-- that would get deleted.
+# But that's pretty gruesome code anyhow. You gets what you pays for.
+
+#==========================================================================
+
+=head2 array @HTML::Tagset::p_closure_barriers
+
+This array has a meaning that I have only seen a need for in
+C, but I include it here on the off chance that someone
+might find it of use:
+
+When we see a "EpE" token, we go lookup up the lineage for a p
+element we might have to minimize. At first sight, we might say that
+if there's a p anywhere in the lineage of this new p, it should be
+closed. But that's wrong. Consider this document:
+
+
+
+ foo
+
+
+ foo
+
+