From: Steve Peters This is more content...
+This is even some more.
+EOT
+
+$p->get_tag("/h1");
+
+my $t = $p->get_trimmed_text("br", "p");
+is($t, "This is some text.");
+
+$p->get_tag;
+
+$t = $p->get_trimmed_text("br", "p");
+is($t,"This is some more text.");
+
+undef($p);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+
+This is even some more.
+EOT
+
+$p->get_tag("h1");
+
+$t = $p->get_phrase;
+is($t, "This is a bold heading");
+
+$t = $p->get_phrase;
+is($t, "");
+
+$p->get_tag;
+
+$t = $p->get_phrase;
+is($t, "This is some italic text. This is some more text.");
+
+undef($p);
diff --git a/ext/HTML/Parser/t/uentities.t b/ext/HTML/Parser/t/uentities.t
new file mode 100644
index 0000000..b9decc5
--- /dev/null
+++ b/ext/HTML/Parser/t/uentities.t
@@ -0,0 +1,67 @@
+# Test Unicode entities
+
+use HTML::Entities;
+
+use Test::More tests => 27;
+
+SKIP: {
+skip "This perl does not support Unicode or Unicode entities not selected",
+ 27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT;
+
+is(decode_entities("&euro"), "&euro");
+is(decode_entities("€"), "\x{20AC}");
+
+is(decode_entities("å"), "å");
+is(decode_entities("å"), "å");
+
+is(decode_entities(""), chr(500000));
+
+is(decode_entities(""), "\x{10FFFD}");
+
+is(decode_entities(""), "\x{FFFC}");
+
+
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), chr(0xFFFD));
+is(decode_entities(""), chr(0xFFFD));
+
+is(decode_entities(""), "\0");
+is(decode_entities(""), "\0");
+is(decode_entities(""), "\0");
+is(decode_entities(""), "\0");
+
+is(decode_entities("ååå"), "ååå\x{FFF}");
+
+# This might fail when we get more than 64 bit UVs
+is(decode_entities(""), "");
+is(decode_entities(""), "");
+
+my $err;
+for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) {
+ my $a = join("", map chr, $_->[0] .. $_->[1]);
+
+ my $e = encode_entities($a);
+ my $d = decode_entities($e);
+
+ unless ($d eq $a) {
+ diag "Wrong decoding in range $_->[0] .. $_->[1]";
+ # use Devel::Peek; Dump($a); Dump($d);
+ $err++;
+ }
+}
+ok(!$err);
+
+
+is(decode_entities(""), chr(0x100085));
+
+is(decode_entities(""), chr(0x100085));
+
+is(decode_entities(""), chr(0xFFFD));
+
+is(decode_entities("\260’\260"), "\x{b0}\x{2019}\x{b0}");
+}
diff --git a/ext/HTML/Parser/t/unbroken-text.t b/ext/HTML/Parser/t/unbroken-text.t
new file mode 100644
index 0000000..7de85a9
--- /dev/null
+++ b/ext/HTML/Parser/t/unbroken-text.t
@@ -0,0 +1,60 @@
+use strict;
+use HTML::Parser;
+
+use Test::More tests => 3;
+
+my $text = "";
+sub text
+{
+ my $cdata = shift() ? "CDATA" : "TEXT";
+ my($offset, $line, $col, $t) = @_;
+ $text .= "[$cdata:$offset:$line.$col:$t]";
+}
+
+sub tag
+{
+ $text .= shift;
+}
+
+my $p = HTML::Parser->new(unbroken_text => 1,
+ text_h => [\&text, "is_cdata,offset,line,column,text"],
+ start_h => [\&tag, "text"],
+ end_h => [\&tag, "text"],
+ );
+
+$p->parse("foo ");
+$p->parse("bar ");
+$p->parse(" foo bar foo
+ bar
+
+
+By default, "LIST]" is parsed as a boolean attribute, not as
+part of the ALT value as was clearly intended. This is also what
+Mozilla sees.
+
+The official behaviour is enabled by enabling this attribute. If
+enabled, it will cause the tag above to be reported as text
+since "LIST]" is not a legal attribute name.
+
+=item $p->unbroken_text
+
+=item $p->unbroken_text( $bool )
+
+By default, blocks of text are given to the text handler as soon as
+possible (but the parser takes care always to break text at a
+boundary between whitespace and non-whitespace so single words and
+entities can always be decoded safely). This might create breaks that
+make it hard to do transformations on the text. When this attribute is
+enabled, blocks of text are always reported in one piece. This will
+delay the text event until the following (non-text) event has been
+recognized by the parser.
+
+Note that the C
Next example
+
+This is very nice example.
+
+EOT
+
+
+#------------------------------------------------------
+
+my @tags;
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(a em));
+$p->ignore_tags(qw(em));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<Next example
+
+This is yet another very nice example.
+
+EOT
+is(join('|', @tags), 'a', 'report_tags followed by ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1));
+$p->report_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1|h2', 'reset report_tags filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->ignore_tags(qw(h2));
+$p->report_tags(qw(h1 h2));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->ignore_tags(qw(h2));
+$p->report_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->report_tags(qw(h3));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<Next example
+Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h3', 'report_tags replaces filter');
+
+
+#------------------------------------------------------
+
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h1 h2));
+$p->ignore_tags(qw(h3));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<Next example
+Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h2));
+$p->ignore_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h2));
+$p->report_tags(qw(h1 h2));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1', 'ignore_tags before report_tags');
+#------------------------------------------------------
+
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_elements("script");
+my $res="";
+$p->handler(default=> sub {$res.=$_[0];}, 'text');
+$p->parse(<<'EOT')->eof;
+A C D F
+EOT
+is($res,"A C D F\n","ignore without "
+ ignore this
+
+
+
+.
+HTML
+
+@p = $p->links;
+
+# There should be 4 links in the document
+is(@p, 4);
+
+for (@p) {
+ ($t, %attr) = @$_ if $_->[0] eq 'img';
+}
+
+is($t, 'img');
+
+is(delete $attr{src}, "http://www.sn.no/foo/img.jpg");
+
+is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif");
+
+ok(!scalar(keys %attr)); # there should be no more attributes
+}
diff --git a/ext/HTML/Parser/t/linkextor-rel.t b/ext/HTML/Parser/t/linkextor-rel.t
new file mode 100644
index 0000000..1190a96
--- /dev/null
+++ b/ext/HTML/Parser/t/linkextor-rel.t
@@ -0,0 +1,36 @@
+use Test::More tests => 4;
+
+require HTML::LinkExtor;
+
+$HTML = <
+
.
+HTML
+
+
+# Try the callback interface
+$links = "";
+$p = HTML::LinkExtor->new(
+ sub {
+ my($tag, %links) = @_;
+ #diag "$tag @{[%links]}";
+ $links .= "$tag @{[%links]}\n";
+ });
+
+$p->parse($HTML); $p->eof;
+
+ok($links =~ m|^base href http://www\.sn\.no/$|m);
+ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m);
+ok($links =~ m|^a href link\.html$|m);
+
+# Used to be problems when using the links method on a document with
+# no links it it. This is a test to prove that it works.
+$p = new HTML::LinkExtor;
+$p->parse("this is a document with no links"); $p->eof;
+@a = $p->links;
+is(@a, 0);
diff --git a/ext/HTML/Parser/t/magic.t b/ext/HTML/Parser/t/magic.t
new file mode 100644
index 0000000..366f275
--- /dev/null
+++ b/ext/HTML/Parser/t/magic.t
@@ -0,0 +1,41 @@
+# Check that the magic signature at the top of struct p_state works and that we
+# catch modifications to _hparser_xs_state gracefully
+
+use Test::More tests => 5;
+
+use HTML::Parser;
+
+$p = HTML::Parser->new(api_version => 3);
+
+$p->xml_mode(1);
+
+# We should not be able to simply modify this stuff
+eval {
+ ${$p->{_hparser_xs_state}} += 4;
+};
+like($@, qr/^Modification of a read-only value attempted/);
+
+
+my $x = delete $p->{_hparser_xs_state};
+
+eval {
+ $p->xml_mode(1);
+};
+like($@, qr/^Can't find '_hparser_xs_state'/);
+
+$p->{_hparser_xs_state} = \($$x + 16);
+
+eval {
+ $p->xml_mode(1);
+};
+like($@, $] >= 5.008 ? qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/);
+
+$p->{_hparser_xs_state} = 33;
+eval {
+ $p->xml_mode(1);
+};
+like($@, qr/^_hparser_xs_state element is not a reference/);
+
+$p->{_hparser_xs_state} = $x;
+
+ok($p->xml_mode(0));
diff --git a/ext/HTML/Parser/t/marked-sect.t b/ext/HTML/Parser/t/marked-sect.t
new file mode 100644
index 0000000..6a63478
--- /dev/null
+++ b/ext/HTML/Parser/t/marked-sect.t
@@ -0,0 +1,121 @@
+#!/usr/bin/perl -w
+
+use strict;
+my $tag;
+my $text;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(start_h => [sub { $tag = shift }, "tagname"],
+ text_h => [sub { $text .= shift }, "dtext"],
+ );
+
+
+use Test::More tests => 14;
+
+SKIP: {
+eval {
+ $p->marked_sections(1);
+};
+skip $@, 14 if $@;
+
+$p->parse("");
+is($text, "foo");
+
+$p->parse("");
+is($text, "foobar");
+
+$p->parse("]]>\n
");
+is($text, "foobarfoo\n");
+
+$text = "";
+$p->parse("parse(",bar>]]>
");
+is($text, "<foo]]>");
+
+$text = "";
+$p->parse("]]>]]>å
");
+is($text, "ååå");
+is($tag, "br");
+
+$text = "";
+$p->parse("]]>
");
+is($text, "");
+
+$text = "";
+$p->parse("]]>
");
+is($text, "fooå");
+
+$text = "";
+$p->parse("]]>
");
+is($text, "fooå");
+
+$text = "";
+$p->parse("]]>
");
+is($text, "fooå");
+
+$text = "";
+$p->parse("]]>
");
+is($text, "fooå");
+
+# offsets/line/column numbers
+$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"],
+ marked_sections => 1,
+ );
+$p->parse(<<'EOT')->eof;
+Test
+EOT
+
+my @x;
+sub x {
+ my($line, $col, $offset, $event, $text) = @_;
+ $text =~ s/\n/\\n/g;
+ $text =~ s/ /./g;
+ push(@x, "$line.$col:$offset $event \"$text\"\n");
+}
+
+#diag @x;
+is(join("", @x), <<'EOT');
+1.0:0 start_document ""
+1.0:0 start ""
+9.6:79 text "Test"
+9.10:83 end "
"
+9.15:88 text "\n"
+10.0:89 end_document ""
+EOT
+
+my $doc = ">|\n|<
Heading
+
+
+This is a text with a link.
+EOT
+
+my $p = HTML::PullParser->new(doc => $doc,
+ start => 'event,tagname,@attr',
+ end => 'event,tagname',
+ text => 'event,dtext',
+
+ ignore_elements => [qw(script style)],
+ unbroken_text => 1,
+ boolean_attribute_value => 1,
+ );
+
+my $t = $p->get_token;
+is($t->[0], "start");
+is($t->[1], "title");
+$p->unget_token($t);
+
+my @a;
+while (my $t = $p->get_token) {
+ for (@$t) {
+ s/\s/./g;
+ }
+ push(@a, join("|", @$t));
+}
+
+my $res = join("\n", @a, "");
+#diag $res;
+is($res, <<'EOT');
+start|title
+text|Title
+end|title
+text|..
+start|h1|id|3
+text|Heading
+end|h1
+text|...This.is.a.text.with.a.
+start|a|href|http://www.sol.no|name|l1
+text|link
+end|a
+text|..
+EOT
+
diff --git a/ext/HTML/Parser/t/script.t b/ext/HTML/Parser/t/script.t
new file mode 100644
index 0000000..2a75ccb
--- /dev/null
+++ b/ext/HTML/Parser/t/script.t
@@ -0,0 +1,41 @@
+#!perl -w
+
+use strict;
+use Test;
+plan tests => 1;
+
+use HTML::Parser;
+
+my $TEXT = "";
+sub h
+{
+ my($event, $tagname, $text) = @_;
+ for ($event, $tagname, $text) {
+ if (defined) {
+ s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge;
+ }
+ else {
+ $_ = " ));
+$p->eof;
+
+ok($TEXT, <<'EOT');
+[start_document,]
+[start,td, ]
+[end_document,]
+[start,script,]
+[end,script,]
+[start,script,]
+[end,td, ]
+[end,tr,link
+and some text.
+EOT
+
+sub a_handler {
+ push(@doc, shift);
+ my $text = shift;
+ push(@doc, uc($text));
+}
+
+
+is(join("", @doc), <<'EOT');
+link
+and some text.
+EOT
+
+#
+# Comment stripper. Interaction with "" handlers.
+#
+my $doc = <This is the title again
+
+
+ And this is a link to the Institute
+
+
process instruction >
+
+
+
+
+EOT
+
+END { unlink($file) || warn "Can't unlink $file: $!"; }
+
+
+my $p;
+
+
+$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
+ok($p->unbroken_text);
+if ($p->get_tag("foo", "title")) {
+ my $title = $p->get_trimmed_text;
+ #diag "Title: $title";
+ is($title, "This is the
+Heading
+
+HTML
+
+ok($p->get_tag("h1"));
+is($p->get_trimmed_text, "Heading");
+undef($p);
+
+# test parsing of large embedded documents
+my $doc = "foo is bar\n\n\n" x 2022;
+
+#use Time::HiRes qw(time);
+my $start = time;
+$p = HTML::TokeParser->new(\$doc);
+#diag "Construction time: ", time - $start;
+
+my $count;
+while (my $t = $p->get_token) {
+ $count++ if $t->[0] eq "S";
+}
+#diag "Parse time: ", time - $start;
+
+is($count, 2022);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+This is a heading
+This is some
text.
+
+This is some more text.
+This is a bold heading
+This is some italic text.
This is some more text.
+Smile ☺
\x{0420}";
+is(length($doc), 46);
+
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0");
+is($parsed[4][8]{id}, "\x{2600}");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile ☺");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[7][0], "text");
+is($parsed[7][1], "\x{0420}");
+is($parsed[7][2], "\x{0420}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+is(@warn, 0);
+
+# Try to parse it as an UTF8 encoded string
+utf8::encode($doc);
+is(length($doc), 51);
+
+@parsed = ();
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "
");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0");
+is($parsed[4][8]{id}, "\xE2\x98\x80");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile ☺");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+my $file = "test-$$.html";
+open(my $fh, ">:utf8", $file) || die;
+print $fh <
♥ Love \x{2665}
+EOT
+close($fh) || die;
+
+@warn = ();
+@parsed = ();
+$p->parse_file($file);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "♥ Love \xE2\x99\xA5");
+is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage
+is($parsed[10][3], -s $file);
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw:utf8", $file) || die;
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\x{2665}");
+is($parsed[7][0], "text");
+is($parsed[7][1], "♥ Love \x{2665}");
+is($parsed[7][2], "\x{2665} Love \x{2665}");
+is($parsed[10][3], (-s $file) - 2 * 4);
+is(@warn, 0);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw", $file) || die;
+$p->utf8_mode(1);
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "♥ Love \xE2\x99\xA5");
+is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5");
+is($parsed[10][3], -s $file);
+is(@warn, 0);
+
+unlink($file);
+
+@parsed = ();
+$p->parse(q(foo))->eof;
+is(@parsed, "5");
+is($parsed[1][0], "start");
+is($parsed[1][8]{href}, "a=1&lang=2\xd7=3");
+
+ok(!HTML::Entities::_probably_utf8_chunk(""));
+ok(!HTML::Entities::_probably_utf8_chunk("f"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99"));
+}
diff --git a/ext/HTML/Parser/t/xml-mode.t b/ext/HTML/Parser/t/xml-mode.t
new file mode 100644
index 0000000..cdfc5b0
--- /dev/null
+++ b/ext/HTML/Parser/t/xml-mode.t
@@ -0,0 +1,112 @@
+use strict;
+use Test::More tests => 8;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(xml_mode => 1,
+ );
+
+my $text = "";
+$p->handler(start =>
+ sub {
+ my($tag, $attr) = @_;
+ $text .= "S[$tag";
+ for my $k (sort keys %$attr) {
+ my $v = $attr->{$k};
+ $text .= " $k=$v";
+ }
+ $text .= "]";
+ }, "tagname,attr");
+$p->handler(end =>
+ sub {
+ $text .= "E[" . shift() . "]";
+ }, "tagname");
+$p->handler(process =>
+ sub {
+ $text .= "PI[" . shift() . "]";
+ }, "token0");
+$p->handler(text =>
+ sub {
+ $text .= shift;
+ }, "text");
+
+my $xml = <<'EOT';
+
+
+
+
+
+
+
+ foo
+
+
foo
bar
+ +isn't, is that something about the table constitutes a "barrier" to +the application of the rule about what p must minimize. + +So C<@HTML::Tagset::p_closure_barriers> is the list of all such +barrier-tags. + +=cut + +@p_closure_barriers = qw( + li blockquote + ul ol menu dir + dl dt dd + td th tr table caption + div + ); + +# In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this +# monkey business of barriers to minimization! + +=head2 hashset %isCDATA_Parent + +This hashset includes all elements whose content is CDATA. + +=cut + +%isCDATA_Parent = map {; $_ => 1 } + qw(script style xmp listing plaintext); + +# TODO: there's nothing else that takes CDATA children, right? + +# As the HTML3 DTD (Raggett 1995-04-24) noted: +# The XMP, LISTING and PLAINTEXT tags are incompatible with SGML +# and derive from very early versions of HTML. They require non- +# standard parsers and will cause problems for processing +# documents with standard SGML tools. + + +=head1 CAVEATS + +You may find it useful to alter the behavior of modules (like +C