From: Steve Peters This is more content...
-This is even some more.
-EOT
-
-$p->get_tag("/h1");
-
-my $t = $p->get_trimmed_text("br", "p");
-is($t, "This is some text.");
-
-$p->get_tag;
-
-$t = $p->get_trimmed_text("br", "p");
-is($t,"This is some more text.");
-
-undef($p);
-
-$p = HTML::TokeParser->new(\<<'EOT');
-
-This is even some more.
-EOT
-
-$p->get_tag("h1");
-
-$t = $p->get_phrase;
-is($t, "This is a bold heading");
-
-$t = $p->get_phrase;
-is($t, "");
-
-$p->get_tag;
-
-$t = $p->get_phrase;
-is($t, "This is some italic text. This is some more text.");
-
-undef($p);
diff --git a/ext/HTML/Parser/t/uentities.t b/ext/HTML/Parser/t/uentities.t
deleted file mode 100644
index b9decc5..0000000
--- a/ext/HTML/Parser/t/uentities.t
+++ /dev/null
@@ -1,67 +0,0 @@
-# Test Unicode entities
-
-use HTML::Entities;
-
-use Test::More tests => 27;
-
-SKIP: {
-skip "This perl does not support Unicode or Unicode entities not selected",
- 27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT;
-
-is(decode_entities("&euro"), "&euro");
-is(decode_entities("€"), "\x{20AC}");
-
-is(decode_entities("å"), "å");
-is(decode_entities("å"), "å");
-
-is(decode_entities(""), chr(500000));
-
-is(decode_entities(""), "\x{10FFFD}");
-
-is(decode_entities(""), "\x{FFFC}");
-
-
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), chr(0xFFFD));
-is(decode_entities(""), chr(0xFFFD));
-
-is(decode_entities(""), "\0");
-is(decode_entities(""), "\0");
-is(decode_entities(""), "\0");
-is(decode_entities(""), "\0");
-
-is(decode_entities("ååå"), "ååå\x{FFF}");
-
-# This might fail when we get more than 64 bit UVs
-is(decode_entities(""), "");
-is(decode_entities(""), "");
-
-my $err;
-for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) {
- my $a = join("", map chr, $_->[0] .. $_->[1]);
-
- my $e = encode_entities($a);
- my $d = decode_entities($e);
-
- unless ($d eq $a) {
- diag "Wrong decoding in range $_->[0] .. $_->[1]";
- # use Devel::Peek; Dump($a); Dump($d);
- $err++;
- }
-}
-ok(!$err);
-
-
-is(decode_entities(""), chr(0x100085));
-
-is(decode_entities(""), chr(0x100085));
-
-is(decode_entities(""), chr(0xFFFD));
-
-is(decode_entities("\260’\260"), "\x{b0}\x{2019}\x{b0}");
-}
diff --git a/ext/HTML/Parser/t/unbroken-text.t b/ext/HTML/Parser/t/unbroken-text.t
deleted file mode 100644
index 7de85a9..0000000
--- a/ext/HTML/Parser/t/unbroken-text.t
+++ /dev/null
@@ -1,60 +0,0 @@
-use strict;
-use HTML::Parser;
-
-use Test::More tests => 3;
-
-my $text = "";
-sub text
-{
- my $cdata = shift() ? "CDATA" : "TEXT";
- my($offset, $line, $col, $t) = @_;
- $text .= "[$cdata:$offset:$line.$col:$t]";
-}
-
-sub tag
-{
- $text .= shift;
-}
-
-my $p = HTML::Parser->new(unbroken_text => 1,
- text_h => [\&text, "is_cdata,offset,line,column,text"],
- start_h => [\&tag, "text"],
- end_h => [\&tag, "text"],
- );
-
-$p->parse("foo ");
-$p->parse("bar ");
-$p->parse(" foo bar foo
- bar
-
-
-By default, "LIST]" is parsed as a boolean attribute, not as
-part of the ALT value as was clearly intended. This is also what
-Mozilla sees.
-
-The official behaviour is enabled by enabling this attribute. If
-enabled, it will cause the tag above to be reported as text
-since "LIST]" is not a legal attribute name.
-
-=item $p->unbroken_text
-
-=item $p->unbroken_text( $bool )
-
-By default, blocks of text are given to the text handler as soon as
-possible (but the parser takes care always to break text at a
-boundary between whitespace and non-whitespace so single words and
-entities can always be decoded safely). This might create breaks that
-make it hard to do transformations on the text. When this attribute is
-enabled, blocks of text are always reported in one piece. This will
-delay the text event until the following (non-text) event has been
-recognized by the parser.
-
-Note that the C
Next example
-
-This is very nice example.
-
-EOT
-
-
-#------------------------------------------------------
-
-my @tags;
-$p = HTML::Parser->new(api_version => 3);
-$p->report_tags(qw(a em));
-$p->ignore_tags(qw(em));
-$p->handler(end => sub {push @tags, @_;}, 'tagname');
-
-$p->parse(<Next example
-
-This is yet another very nice example.
-
-EOT
-is(join('|', @tags), 'a', 'report_tags followed by ignore_tags');
-
-
-#------------------------------------------------------
-
-@tags = ();
-$p = HTML::Parser->new(api_version => 3);
-$p->report_tags(qw(h1));
-$p->report_tags();
-$p->handler(end => sub {push @tags, @_;}, 'tagname');
-
-$p->parse(<Next example
-Next example
-
-EOT
-is(join('|', @tags), 'h1|h2', 'reset report_tags filter');
-
-
-#------------------------------------------------------
-
-@tags = ();
-$p = HTML::Parser->new(api_version => 3);
-$p->report_tags(qw(h1 h2));
-$p->ignore_tags(qw(h2));
-$p->report_tags(qw(h1 h2));
-$p->handler(end => sub {push @tags, @_;}, 'tagname');
-
-$p->parse(<Next example
-Next example
-
-EOT
-is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags');
-
-
-#------------------------------------------------------
-
-@tags = ();
-$p = HTML::Parser->new(api_version => 3);
-$p->report_tags(qw(h1 h2));
-$p->ignore_tags(qw(h2));
-$p->report_tags();
-$p->handler(end => sub {push @tags, @_;}, 'tagname');
-
-$p->parse(<Next example
-Next example
-
-EOT
-is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags');
-
-
-#------------------------------------------------------
-
-@tags = ();
-$p = HTML::Parser->new(api_version => 3);
-$p->report_tags(qw(h1 h2));
-$p->report_tags(qw(h3));
-$p->handler(end => sub {push @tags, @_;}, 'tagname');
-
-$p->parse(<Next example
-Next example
-Next example
-
-EOT
-is(join('|', @tags), 'h3', 'report_tags replaces filter');
-
-
-#------------------------------------------------------
-
-
-@tags = ();
-$p = HTML::Parser->new(api_version => 3);
-$p->ignore_tags(qw(h1 h2));
-$p->ignore_tags(qw(h3));
-$p->handler(end => sub {push @tags, @_;}, 'tagname');
-
-$p->parse(<Next example
-Next example
-Next example
-
-EOT
-is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter');
-
-
-#------------------------------------------------------
-
-@tags = ();
-$p = HTML::Parser->new(api_version => 3);
-$p->ignore_tags(qw(h2));
-$p->ignore_tags();
-$p->handler(end => sub {push @tags, @_;}, 'tagname');
-
-$p->parse(<Next example
-Next example
-
-EOT
-is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter');
-
-
-#------------------------------------------------------
-
-@tags = ();
-$p = HTML::Parser->new(api_version => 3);
-$p->ignore_tags(qw(h2));
-$p->report_tags(qw(h1 h2));
-$p->handler(end => sub {push @tags, @_;}, 'tagname');
-
-$p->parse(<Next example
-Next example
-
-EOT
-is(join('|', @tags), 'h1', 'ignore_tags before report_tags');
-#------------------------------------------------------
-
-$p = HTML::Parser->new(api_version => 3);
-$p->ignore_elements("script");
-my $res="";
-$p->handler(default=> sub {$res.=$_[0];}, 'text');
-$p->parse(<<'EOT')->eof;
-A C D F
-EOT
-is($res,"A C D F\n","ignore without "
- ignore this
-
-
-
-.
-HTML
-
-@p = $p->links;
-
-# There should be 4 links in the document
-is(@p, 4);
-
-for (@p) {
- ($t, %attr) = @$_ if $_->[0] eq 'img';
-}
-
-is($t, 'img');
-
-is(delete $attr{src}, "http://www.sn.no/foo/img.jpg");
-
-is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif");
-
-ok(!scalar(keys %attr)); # there should be no more attributes
-}
diff --git a/ext/HTML/Parser/t/linkextor-rel.t b/ext/HTML/Parser/t/linkextor-rel.t
deleted file mode 100644
index 1190a96..0000000
--- a/ext/HTML/Parser/t/linkextor-rel.t
+++ /dev/null
@@ -1,36 +0,0 @@
-use Test::More tests => 4;
-
-require HTML::LinkExtor;
-
-$HTML = <
-
.
-HTML
-
-
-# Try the callback interface
-$links = "";
-$p = HTML::LinkExtor->new(
- sub {
- my($tag, %links) = @_;
- #diag "$tag @{[%links]}";
- $links .= "$tag @{[%links]}\n";
- });
-
-$p->parse($HTML); $p->eof;
-
-ok($links =~ m|^base href http://www\.sn\.no/$|m);
-ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m);
-ok($links =~ m|^a href link\.html$|m);
-
-# Used to be problems when using the links method on a document with
-# no links it it. This is a test to prove that it works.
-$p = new HTML::LinkExtor;
-$p->parse("this is a document with no links"); $p->eof;
-@a = $p->links;
-is(@a, 0);
diff --git a/ext/HTML/Parser/t/magic.t b/ext/HTML/Parser/t/magic.t
deleted file mode 100644
index 366f275..0000000
--- a/ext/HTML/Parser/t/magic.t
+++ /dev/null
@@ -1,41 +0,0 @@
-# Check that the magic signature at the top of struct p_state works and that we
-# catch modifications to _hparser_xs_state gracefully
-
-use Test::More tests => 5;
-
-use HTML::Parser;
-
-$p = HTML::Parser->new(api_version => 3);
-
-$p->xml_mode(1);
-
-# We should not be able to simply modify this stuff
-eval {
- ${$p->{_hparser_xs_state}} += 4;
-};
-like($@, qr/^Modification of a read-only value attempted/);
-
-
-my $x = delete $p->{_hparser_xs_state};
-
-eval {
- $p->xml_mode(1);
-};
-like($@, qr/^Can't find '_hparser_xs_state'/);
-
-$p->{_hparser_xs_state} = \($$x + 16);
-
-eval {
- $p->xml_mode(1);
-};
-like($@, $] >= 5.008 ? qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/);
-
-$p->{_hparser_xs_state} = 33;
-eval {
- $p->xml_mode(1);
-};
-like($@, qr/^_hparser_xs_state element is not a reference/);
-
-$p->{_hparser_xs_state} = $x;
-
-ok($p->xml_mode(0));
diff --git a/ext/HTML/Parser/t/marked-sect.t b/ext/HTML/Parser/t/marked-sect.t
deleted file mode 100644
index 6a63478..0000000
--- a/ext/HTML/Parser/t/marked-sect.t
+++ /dev/null
@@ -1,121 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-my $tag;
-my $text;
-
-use HTML::Parser ();
-my $p = HTML::Parser->new(start_h => [sub { $tag = shift }, "tagname"],
- text_h => [sub { $text .= shift }, "dtext"],
- );
-
-
-use Test::More tests => 14;
-
-SKIP: {
-eval {
- $p->marked_sections(1);
-};
-skip $@, 14 if $@;
-
-$p->parse("");
-is($text, "foo");
-
-$p->parse("");
-is($text, "foobar");
-
-$p->parse("]]>\n
");
-is($text, "foobarfoo\n");
-
-$text = "";
-$p->parse("parse(",bar>]]>
");
-is($text, "<foo]]>");
-
-$text = "";
-$p->parse("]]>]]>å
");
-is($text, "ååå");
-is($tag, "br");
-
-$text = "";
-$p->parse("]]>
");
-is($text, "");
-
-$text = "";
-$p->parse("]]>
");
-is($text, "fooå");
-
-$text = "";
-$p->parse("]]>
");
-is($text, "fooå");
-
-$text = "";
-$p->parse("]]>
");
-is($text, "fooå");
-
-$text = "";
-$p->parse("]]>
");
-is($text, "fooå");
-
-# offsets/line/column numbers
-$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"],
- marked_sections => 1,
- );
-$p->parse(<<'EOT')->eof;
-Test
-EOT
-
-my @x;
-sub x {
- my($line, $col, $offset, $event, $text) = @_;
- $text =~ s/\n/\\n/g;
- $text =~ s/ /./g;
- push(@x, "$line.$col:$offset $event \"$text\"\n");
-}
-
-#diag @x;
-is(join("", @x), <<'EOT');
-1.0:0 start_document ""
-1.0:0 start ""
-9.6:79 text "Test"
-9.10:83 end "
"
-9.15:88 text "\n"
-10.0:89 end_document ""
-EOT
-
-my $doc = ">|\n|<
Heading
-
-
-This is a text with a link.
-EOT
-
-my $p = HTML::PullParser->new(doc => $doc,
- start => 'event,tagname,@attr',
- end => 'event,tagname',
- text => 'event,dtext',
-
- ignore_elements => [qw(script style)],
- unbroken_text => 1,
- boolean_attribute_value => 1,
- );
-
-my $t = $p->get_token;
-is($t->[0], "start");
-is($t->[1], "title");
-$p->unget_token($t);
-
-my @a;
-while (my $t = $p->get_token) {
- for (@$t) {
- s/\s/./g;
- }
- push(@a, join("|", @$t));
-}
-
-my $res = join("\n", @a, "");
-#diag $res;
-is($res, <<'EOT');
-start|title
-text|Title
-end|title
-text|..
-start|h1|id|3
-text|Heading
-end|h1
-text|...This.is.a.text.with.a.
-start|a|href|http://www.sol.no|name|l1
-text|link
-end|a
-text|..
-EOT
-
diff --git a/ext/HTML/Parser/t/script.t b/ext/HTML/Parser/t/script.t
deleted file mode 100644
index 2a75ccb..0000000
--- a/ext/HTML/Parser/t/script.t
+++ /dev/null
@@ -1,41 +0,0 @@
-#!perl -w
-
-use strict;
-use Test;
-plan tests => 1;
-
-use HTML::Parser;
-
-my $TEXT = "";
-sub h
-{
- my($event, $tagname, $text) = @_;
- for ($event, $tagname, $text) {
- if (defined) {
- s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge;
- }
- else {
- $_ = " ));
-$p->eof;
-
-ok($TEXT, <<'EOT');
-[start_document,]
-[start,td, ]
-[end_document,]
-[start,script,]
-[end,script,]
-[start,script,]
-[end,td, ]
-[end,tr,link
-and some text.
-EOT
-
-sub a_handler {
- push(@doc, shift);
- my $text = shift;
- push(@doc, uc($text));
-}
-
-
-is(join("", @doc), <<'EOT');
-link
-and some text.
-EOT
-
-#
-# Comment stripper. Interaction with "" handlers.
-#
-my $doc = <This is the title again
-
-
- And this is a link to the Institute
-
-
process instruction >
-
-
-
-
-EOT
-
-END { unlink($file) || warn "Can't unlink $file: $!"; }
-
-
-my $p;
-
-
-$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
-ok($p->unbroken_text);
-if ($p->get_tag("foo", "title")) {
- my $title = $p->get_trimmed_text;
- #diag "Title: $title";
- is($title, "This is the
-Heading
-
-HTML
-
-ok($p->get_tag("h1"));
-is($p->get_trimmed_text, "Heading");
-undef($p);
-
-# test parsing of large embedded documents
-my $doc = "foo is bar\n\n\n" x 2022;
-
-#use Time::HiRes qw(time);
-my $start = time;
-$p = HTML::TokeParser->new(\$doc);
-#diag "Construction time: ", time - $start;
-
-my $count;
-while (my $t = $p->get_token) {
- $count++ if $t->[0] eq "S";
-}
-#diag "Parse time: ", time - $start;
-
-is($count, 2022);
-
-$p = HTML::TokeParser->new(\<<'EOT');
-This is a heading
-This is some
text.
-
-This is some more text.
-This is a bold heading
-This is some italic text.
This is some more text.
-Smile ☺
\x{0420}";
-is(length($doc), 46);
-
-$p->parse($doc)->eof;
-
-#use Data::Dump; Data::Dump::dump(@parsed);
-
-is(@parsed, 9);
-is($parsed[0][0], "start_document");
-
-is($parsed[1][0], "start");
-is($parsed[1][1], "");
-is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0");
-is($parsed[4][8]{id}, "\x{2600}");
-
-is($parsed[5][0], "text");
-is($parsed[5][1], "Smile ☺");
-is($parsed[5][2], "Smile \x{263A}");
-
-is($parsed[7][0], "text");
-is($parsed[7][1], "\x{0420}");
-is($parsed[7][2], "\x{0420}");
-
-is($parsed[8][0], "end_document");
-is($parsed[8][3], length($doc));
-is($parsed[8][5], length($doc));
-is($parsed[8][6], length($doc));
-is(@warn, 0);
-
-# Try to parse it as an UTF8 encoded string
-utf8::encode($doc);
-is(length($doc), 51);
-
-@parsed = ();
-$p->parse($doc)->eof;
-
-#use Data::Dump; Data::Dump::dump(@parsed);
-
-is(@parsed, 9);
-is($parsed[0][0], "start_document");
-
-is($parsed[1][0], "start");
-is($parsed[1][1], "
");
-is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0");
-is($parsed[4][8]{id}, "\xE2\x98\x80");
-
-is($parsed[5][0], "text");
-is($parsed[5][1], "Smile ☺");
-is($parsed[5][2], "Smile \x{263A}");
-
-is($parsed[8][0], "end_document");
-is($parsed[8][3], length($doc));
-is($parsed[8][5], length($doc));
-is($parsed[8][6], length($doc));
-
-is(@warn, 1);
-like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
-
-my $file = "test-$$.html";
-open(my $fh, ">:utf8", $file) || die;
-print $fh <
♥ Love \x{2665}
-EOT
-close($fh) || die;
-
-@warn = ();
-@parsed = ();
-$p->parse_file($file);
-is(@parsed, "11");
-is($parsed[6][0], "start");
-is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5");
-is($parsed[7][0], "text");
-is($parsed[7][1], "♥ Love \xE2\x99\xA5");
-is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage
-is($parsed[10][3], -s $file);
-is(@warn, 1);
-like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
-
-@warn = ();
-@parsed = ();
-open($fh, "<:raw:utf8", $file) || die;
-$p->parse_file($fh);
-is(@parsed, "11");
-is($parsed[6][0], "start");
-is($parsed[6][8]{id}, "\x{2665}\x{2665}");
-is($parsed[7][0], "text");
-is($parsed[7][1], "♥ Love \x{2665}");
-is($parsed[7][2], "\x{2665} Love \x{2665}");
-is($parsed[10][3], (-s $file) - 2 * 4);
-is(@warn, 0);
-
-@warn = ();
-@parsed = ();
-open($fh, "<:raw", $file) || die;
-$p->utf8_mode(1);
-$p->parse_file($fh);
-is(@parsed, "11");
-is($parsed[6][0], "start");
-is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5");
-is($parsed[7][0], "text");
-is($parsed[7][1], "♥ Love \xE2\x99\xA5");
-is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5");
-is($parsed[10][3], -s $file);
-is(@warn, 0);
-
-unlink($file);
-
-@parsed = ();
-$p->parse(q(foo))->eof;
-is(@parsed, "5");
-is($parsed[1][0], "start");
-is($parsed[1][8]{href}, "a=1&lang=2\xd7=3");
-
-ok(!HTML::Entities::_probably_utf8_chunk(""));
-ok(!HTML::Entities::_probably_utf8_chunk("f"));
-ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5"));
-ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o"));
-ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2"));
-ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99"));
-ok(!HTML::Entities::_probably_utf8_chunk("f\xE2"));
-ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99"));
-}
diff --git a/ext/HTML/Parser/t/xml-mode.t b/ext/HTML/Parser/t/xml-mode.t
deleted file mode 100644
index cdfc5b0..0000000
--- a/ext/HTML/Parser/t/xml-mode.t
+++ /dev/null
@@ -1,112 +0,0 @@
-use strict;
-use Test::More tests => 8;
-
-use HTML::Parser ();
-my $p = HTML::Parser->new(xml_mode => 1,
- );
-
-my $text = "";
-$p->handler(start =>
- sub {
- my($tag, $attr) = @_;
- $text .= "S[$tag";
- for my $k (sort keys %$attr) {
- my $v = $attr->{$k};
- $text .= " $k=$v";
- }
- $text .= "]";
- }, "tagname,attr");
-$p->handler(end =>
- sub {
- $text .= "E[" . shift() . "]";
- }, "tagname");
-$p->handler(process =>
- sub {
- $text .= "PI[" . shift() . "]";
- }, "token0");
-$p->handler(text =>
- sub {
- $text .= shift;
- }, "text");
-
-my $xml = <<'EOT';
-
-
-
-
-
-
-
- foo
-
-
foo
bar
- -isn't, is that something about the table constitutes a "barrier" to -the application of the rule about what p must minimize. - -So C<@HTML::Tagset::p_closure_barriers> is the list of all such -barrier-tags. - -=cut - -@p_closure_barriers = qw( - li blockquote - ul ol menu dir - dl dt dd - td th tr table caption - div - ); - -# In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this -# monkey business of barriers to minimization! - -=head2 hashset %isCDATA_Parent - -This hashset includes all elements whose content is CDATA. - -=cut - -%isCDATA_Parent = map {; $_ => 1 } - qw(script style xmp listing plaintext); - -# TODO: there's nothing else that takes CDATA children, right? - -# As the HTML3 DTD (Raggett 1995-04-24) noted: -# The XMP, LISTING and PLAINTEXT tags are incompatible with SGML -# and derive from very early versions of HTML. They require non- -# standard parsers and will cause problems for processing -# documents with standard SGML tools. - - -=head1 CAVEATS - -You may find it useful to alter the behavior of modules (like -C