parse("ram=bar>");
- $ok;
-}->join();
-
-is($ok,2);
-
diff --git a/ext/HTML/Parser/t/tokeparser.t b/ext/HTML/Parser/t/tokeparser.t
deleted file mode 100644
index 2084201..0000000
--- a/ext/HTML/Parser/t/tokeparser.t
+++ /dev/null
@@ -1,164 +0,0 @@
-use Test::More tests => 17;
-
-use strict;
-use HTML::TokeParser;
-
-# First we create an HTML document to test
-
-my $file = "ttest$$.htm";
-die "$file already exists" if -e $file;
-
-open(F, ">$file") or die "Can't create $file: $!";
-print F <<'EOT'; close(F);
-
-
-
- This is the <title>
-
-
-
-
-
-
-
- This is the title again
-
-
- And this is a link to the Institute
-
-
process instruction >
-
-
-
-
-EOT
-
-END { unlink($file) || warn "Can't unlink $file: $!"; }
-
-
-my $p;
-
-
-$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
-ok($p->unbroken_text);
-if ($p->get_tag("foo", "title")) {
- my $title = $p->get_trimmed_text;
- #diag "Title: $title";
- is($title, "This is the ");
-}
-undef($p);
-
-# Test with reference to glob
-open(F, $file) || die "Can't open $file: $!";
-$p = HTML::TokeParser->new(\*F);
-my $scount = 0;
-my $ecount = 0;
-my $tcount = 0;
-my $pcount = 0;
-while (my $token = $p->get_token) {
- $scount++ if $token->[0] eq "S";
- $ecount++ if $token->[0] eq "E";
- $pcount++ if $token->[0] eq "PI";
-}
-undef($p);
-close F;
-
-# Test with glob
-open(F, $file) || die "Can't open $file: $!";
-$p = HTML::TokeParser->new(*F);
-$tcount++ while $p->get_tag;
-undef($p);
-close F;
-
-# Test with plain file name
-$p = HTML::TokeParser->new($file) || die;
-$tcount++ while $p->get_tag;
-undef($p);
-
-#diag "Number of tokens found: $tcount/2 = $scount + $ecount";
-is($tcount, 34);
-is($scount, 10);
-is($ecount, 7);
-is($pcount, 1);
-is($tcount/2, $scount + $ecount);
-
-ok(!HTML::TokeParser->new("/noT/thEre/$$"));
-
-
-$p = HTML::TokeParser->new($file) || die;
-$p->get_tag("a");
-my $atext = $p->get_text;
-undef($p);
-
-is($atext, "Perl\240Institute");
-
-# test parsing of embeded document
-$p = HTML::TokeParser->new(\<Title
-
-Heading
-
-HTML
-
-ok($p->get_tag("h1"));
-is($p->get_trimmed_text, "Heading");
-undef($p);
-
-# test parsing of large embedded documents
-my $doc = "foo is bar\n\n\n" x 2022;
-
-#use Time::HiRes qw(time);
-my $start = time;
-$p = HTML::TokeParser->new(\$doc);
-#diag "Construction time: ", time - $start;
-
-my $count;
-while (my $t = $p->get_token) {
- $count++ if $t->[0] eq "S";
-}
-#diag "Parse time: ", time - $start;
-
-is($count, 2022);
-
-$p = HTML::TokeParser->new(\<<'EOT');
-This is a heading
-This is some
text.
-
-This is some more text.
-
-This is even some more.
-EOT
-
-$p->get_tag("/h1");
-
-my $t = $p->get_trimmed_text("br", "p");
-is($t, "This is some text.");
-
-$p->get_tag;
-
-$t = $p->get_trimmed_text("br", "p");
-is($t,"This is some more text.");
-
-undef($p);
-
-$p = HTML::TokeParser->new(\<<'EOT');
-
This is a bold heading
-This is some italic text.
This is some more text.
-
-This is even some more.
-EOT
-
-$p->get_tag("h1");
-
-$t = $p->get_phrase;
-is($t, "This is a bold heading");
-
-$t = $p->get_phrase;
-is($t, "");
-
-$p->get_tag;
-
-$t = $p->get_phrase;
-is($t, "This is some italic text. This is some more text.");
-
-undef($p);
diff --git a/ext/HTML/Parser/t/uentities.t b/ext/HTML/Parser/t/uentities.t
deleted file mode 100644
index b9decc5..0000000
--- a/ext/HTML/Parser/t/uentities.t
+++ /dev/null
@@ -1,67 +0,0 @@
-# Test Unicode entities
-
-use HTML::Entities;
-
-use Test::More tests => 27;
-
-SKIP: {
-skip "This perl does not support Unicode or Unicode entities not selected",
- 27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT;
-
-is(decode_entities("&euro"), "&euro");
-is(decode_entities("€"), "\x{20AC}");
-
-is(decode_entities("å"), "å");
-is(decode_entities("å"), "å");
-
-is(decode_entities(""), chr(500000));
-
-is(decode_entities(""), "\x{10FFFD}");
-
-is(decode_entities(""), "\x{FFFC}");
-
-
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), "\x{FFFD}");
-is(decode_entities(""), chr(0xFFFD));
-is(decode_entities(""), chr(0xFFFD));
-
-is(decode_entities(""), "\0");
-is(decode_entities(""), "\0");
-is(decode_entities(""), "\0");
-is(decode_entities(""), "\0");
-
-is(decode_entities("ååå"), "ååå\x{FFF}");
-
-# This might fail when we get more than 64 bit UVs
-is(decode_entities(""), "");
-is(decode_entities(""), "");
-
-my $err;
-for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) {
- my $a = join("", map chr, $_->[0] .. $_->[1]);
-
- my $e = encode_entities($a);
- my $d = decode_entities($e);
-
- unless ($d eq $a) {
- diag "Wrong decoding in range $_->[0] .. $_->[1]";
- # use Devel::Peek; Dump($a); Dump($d);
- $err++;
- }
-}
-ok(!$err);
-
-
-is(decode_entities(""), chr(0x100085));
-
-is(decode_entities(""), chr(0x100085));
-
-is(decode_entities(""), chr(0xFFFD));
-
-is(decode_entities("\260’\260"), "\x{b0}\x{2019}\x{b0}");
-}
diff --git a/ext/HTML/Parser/t/unbroken-text.t b/ext/HTML/Parser/t/unbroken-text.t
deleted file mode 100644
index 7de85a9..0000000
--- a/ext/HTML/Parser/t/unbroken-text.t
+++ /dev/null
@@ -1,60 +0,0 @@
-use strict;
-use HTML::Parser;
-
-use Test::More tests => 3;
-
-my $text = "";
-sub text
-{
- my $cdata = shift() ? "CDATA" : "TEXT";
- my($offset, $line, $col, $t) = @_;
- $text .= "[$cdata:$offset:$line.$col:$t]";
-}
-
-sub tag
-{
- $text .= shift;
-}
-
-my $p = HTML::Parser->new(unbroken_text => 1,
- text_h => [\&text, "is_cdata,offset,line,column,text"],
- start_h => [\&tag, "text"],
- end_h => [\&tag, "text"],
- );
-
-$p->parse("foo ");
-$p->parse("bar ");
-$p->parse("");
-$p->parse("bar\n");
-$p->parse("");
-$p->parse("
xmp");
-$p->parse("atend");
-
-#diag $text;
-is($text, "[TEXT:0:1.0:foo bar ][TEXT:13:1.13:bar\n][CDATA:28:2.11:xmp]");
-
-$text = "";
-$p->eof;
-
-#diag $text;
-is($text, "[TEXT:37:2.20:atend]");
-
-
-$p = HTML::Parser->new(unbroken_text => 1,
- text_h => [\&text, "is_cdata,offset,line,column,text"],
- );
-
-$text = "";
-$p->parse("foo");
-$p->parse("parse(">bar\n");
-$p->parse("fooparse("p>xmp");
-$p->parse("parse(">bar");
-$p->eof;
-
-#diag $text;
-is($text, "[TEXT:0:1.0:foobar\nfoo][CDATA:20:2.8:xmp][TEXT:29:2.17:bar]");
-
-
diff --git a/ext/HTML/Parser/t/unicode-bom.t b/ext/HTML/Parser/t/unicode-bom.t
deleted file mode 100644
index 34e066f..0000000
--- a/ext/HTML/Parser/t/unicode-bom.t
+++ /dev/null
@@ -1,59 +0,0 @@
-#!perl -w
-
-use strict;
-use Test::More tests => 2;
-use HTML::Parser;
-
-SKIP: {
-skip "This perl does not support Unicode", 2 if $] < 5.008;
-
-my @parsed;
-my $p = HTML::Parser->new(
- api_version => 3,
- start_h => [\@parsed, 'tag, attr'],
-);
-
-my @warn;
-$SIG{__WARN__} = sub {
- push(@warn, $_[0]);
-};
-
-$p->parse("\xEF\xBB\xBFHi there");
-$p->eof;
-
-#use Encode;
-$p->parse("\xEF\xBB\xBFHi there" . chr(0x263A));
-$p->eof;
-
-$p->parse("\xFF\xFEHi there");
-$p->eof;
-
-$p->parse("\xFE\xFFHi there");
-$p->eof;
-
-$p->parse("\0\0\xFF\xFEHi there");
-$p->eof;
-
-$p->parse("\xFE\xFF\0\0Hi there");
-$p->eof;
-
-is(join("", @warn), <new(
- api_version => 3,
- start_h => [\@parsed, 'tag'],
-);
-
-$p->parse("\xEF\xBB\xBFHi there");
-$p->eof;
-ok(!@warn);
-}
diff --git a/ext/HTML/Parser/t/unicode.t b/ext/HTML/Parser/t/unicode.t
deleted file mode 100644
index 82902de..0000000
--- a/ext/HTML/Parser/t/unicode.t
+++ /dev/null
@@ -1,183 +0,0 @@
-#!perl -w
-
-use strict;
-use HTML::Parser;
-use Test::More tests => 103;
-
-SKIP: {
-skip "This perl does not support Unicode", 103 if $] < 5.008;
-
-my @warn;
-$SIG{__WARN__} = sub {
- push(@warn, $_[0]);
-};
-
-my @parsed;
-my $p = HTML::Parser->new(
- api_version => 3,
- default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'],
-);
-
-my $doc = "\x{263A}Smile ☺
\x{0420}";
-is(length($doc), 46);
-
-$p->parse($doc)->eof;
-
-#use Data::Dump; Data::Dump::dump(@parsed);
-
-is(@parsed, 9);
-is($parsed[0][0], "start_document");
-
-is($parsed[1][0], "start");
-is($parsed[1][1], "");
-SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") };
-is($parsed[1][3], 0);
-is($parsed[1][4], 7);
-
-is($parsed[2][0], "text");
-is(ord($parsed[2][1]), 0x263A);
-is($parsed[2][2], chr(0x263A));
-is($parsed[2][3], 7);
-is($parsed[2][4], 1);
-is($parsed[2][5], 8);
-is($parsed[2][6], 7);
-
-is($parsed[3][0], "end");
-is($parsed[3][1], "");
-is($parsed[3][3], 8);
-is($parsed[3][6], 8);
-
-is($parsed[4][0], "start");
-is($parsed[4][1], "");
-is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0");
-is($parsed[4][8]{id}, "\x{2600}");
-
-is($parsed[5][0], "text");
-is($parsed[5][1], "Smile ☺");
-is($parsed[5][2], "Smile \x{263A}");
-
-is($parsed[7][0], "text");
-is($parsed[7][1], "\x{0420}");
-is($parsed[7][2], "\x{0420}");
-
-is($parsed[8][0], "end_document");
-is($parsed[8][3], length($doc));
-is($parsed[8][5], length($doc));
-is($parsed[8][6], length($doc));
-is(@warn, 0);
-
-# Try to parse it as an UTF8 encoded string
-utf8::encode($doc);
-is(length($doc), 51);
-
-@parsed = ();
-$p->parse($doc)->eof;
-
-#use Data::Dump; Data::Dump::dump(@parsed);
-
-is(@parsed, 9);
-is($parsed[0][0], "start_document");
-
-is($parsed[1][0], "start");
-is($parsed[1][1], "");
-SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") };
-is($parsed[1][3], 0);
-is($parsed[1][4], 7);
-
-is($parsed[2][0], "text");
-is(ord($parsed[2][1]), 226);
-is($parsed[2][1], "\xE2\x98\xBA");
-is($parsed[2][2], "\xE2\x98\xBA");
-is($parsed[2][3], 7);
-is($parsed[2][4], 3);
-is($parsed[2][5], 10);
-is($parsed[2][6], 7);
-
-is($parsed[3][0], "end");
-is($parsed[3][1], "");
-is($parsed[3][3], 10);
-is($parsed[3][6], 10);
-
-is($parsed[4][0], "start");
-is($parsed[4][1], "");
-is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0");
-is($parsed[4][8]{id}, "\xE2\x98\x80");
-
-is($parsed[5][0], "text");
-is($parsed[5][1], "Smile ☺");
-is($parsed[5][2], "Smile \x{263A}");
-
-is($parsed[8][0], "end_document");
-is($parsed[8][3], length($doc));
-is($parsed[8][5], length($doc));
-is($parsed[8][6], length($doc));
-
-is(@warn, 1);
-like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
-
-my $file = "test-$$.html";
-open(my $fh, ">:utf8", $file) || die;
-print $fh <\x{263A} Love!
-♥ Love \x{2665}
-EOT
-close($fh) || die;
-
-@warn = ();
-@parsed = ();
-$p->parse_file($file);
-is(@parsed, "11");
-is($parsed[6][0], "start");
-is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5");
-is($parsed[7][0], "text");
-is($parsed[7][1], "♥ Love \xE2\x99\xA5");
-is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage
-is($parsed[10][3], -s $file);
-is(@warn, 1);
-like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
-
-@warn = ();
-@parsed = ();
-open($fh, "<:raw:utf8", $file) || die;
-$p->parse_file($fh);
-is(@parsed, "11");
-is($parsed[6][0], "start");
-is($parsed[6][8]{id}, "\x{2665}\x{2665}");
-is($parsed[7][0], "text");
-is($parsed[7][1], "♥ Love \x{2665}");
-is($parsed[7][2], "\x{2665} Love \x{2665}");
-is($parsed[10][3], (-s $file) - 2 * 4);
-is(@warn, 0);
-
-@warn = ();
-@parsed = ();
-open($fh, "<:raw", $file) || die;
-$p->utf8_mode(1);
-$p->parse_file($fh);
-is(@parsed, "11");
-is($parsed[6][0], "start");
-is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5");
-is($parsed[7][0], "text");
-is($parsed[7][1], "♥ Love \xE2\x99\xA5");
-is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5");
-is($parsed[10][3], -s $file);
-is(@warn, 0);
-
-unlink($file);
-
-@parsed = ();
-$p->parse(q(foo))->eof;
-is(@parsed, "5");
-is($parsed[1][0], "start");
-is($parsed[1][8]{href}, "a=1&lang=2\xd7=3");
-
-ok(!HTML::Entities::_probably_utf8_chunk(""));
-ok(!HTML::Entities::_probably_utf8_chunk("f"));
-ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5"));
-ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o"));
-ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2"));
-ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99"));
-ok(!HTML::Entities::_probably_utf8_chunk("f\xE2"));
-ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99"));
-}
diff --git a/ext/HTML/Parser/t/xml-mode.t b/ext/HTML/Parser/t/xml-mode.t
deleted file mode 100644
index cdfc5b0..0000000
--- a/ext/HTML/Parser/t/xml-mode.t
+++ /dev/null
@@ -1,112 +0,0 @@
-use strict;
-use Test::More tests => 8;
-
-use HTML::Parser ();
-my $p = HTML::Parser->new(xml_mode => 1,
- );
-
-my $text = "";
-$p->handler(start =>
- sub {
- my($tag, $attr) = @_;
- $text .= "S[$tag";
- for my $k (sort keys %$attr) {
- my $v = $attr->{$k};
- $text .= " $k=$v";
- }
- $text .= "]";
- }, "tagname,attr");
-$p->handler(end =>
- sub {
- $text .= "E[" . shift() . "]";
- }, "tagname");
-$p->handler(process =>
- sub {
- $text .= "PI[" . shift() . "]";
- }, "token0");
-$p->handler(text =>
- sub {
- $text .= shift;
- }, "text");
-
-my $xml = <<'EOT';
-
-
-
-My first architectual document
-Geir Ove Gronmo, grove@infotek.no
-This is the first paragraph in this document
-This is the second paragraph
-
-
-
-EOT
-
-$p->parse($xml)->eof;
-
-is($text, <<'EOT');
-PI[xml version="1.0"]
-PI[IS10744:arch name="html"]
-S[DOC]
-S[title html=h1]My first architectual documentE[title]
-S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
-S[para]This is the first paragraph in this documentE[para]
-S[para html=p]This is the second paragraphE[para]
-S[para]E[para]
-S[xmp]S[foo]E[foo]E[xmp]
-E[DOC]
-EOT
-
-$text = "";
-$p->xml_mode(0);
-$p->parse($xml)->eof;
-
-is($text, <<'EOT');
-PI[xml version="1.0"?]
-PI[IS10744:arch name="html"?]
-S[doc]
-S[title html=h1]My first architectual documentE[title]
-S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
-S[para]This is the first paragraph in this documentE[para]
-S[para html=p]This is the second paragraphE[para]
-S[para/]
-S[xmp]E[xmp]
-E[doc]
-EOT
-
-# Test that we get an empty tag back
-$p = HTML::Parser->new(api_version => 3,
- xml_mode => 1);
-
-$p->handler("end" =>
- sub {
- my($tagname, $text) = @_;
- is($tagname, "Xyzzy");
- ok(!length($text));
- }, "tagname,text");
-$p->parse("and some more")->eof;
-
-# Test that we get an empty tag back
-$p = HTML::Parser->new(api_version => 3,
- empty_element_tags => 1);
-
-$p->handler("end" =>
- sub {
- my($tagname, $text) = @_;
- is($tagname, "xyzzy");
- ok(!length($text));
- }, "tagname,text");
-$p->parse("and some more")->eof;
-
-$p = HTML::Parser->new(
- api_version => 3,
- xml_pic => 1,
-);
-
-$p->handler(
- "process" => sub {
- my($text, $t0) = @_;
- is($text, " bar?>");
- is($t0, "foo > bar");
- }, "text, token0");
-$p->parse(" bar?> and then")->eof;
diff --git a/ext/HTML/Parser/tokenpos.h b/ext/HTML/Parser/tokenpos.h
deleted file mode 100644
index aa971bf..0000000
--- a/ext/HTML/Parser/tokenpos.h
+++ /dev/null
@@ -1,49 +0,0 @@
-struct token_pos
-{
- char *beg;
- char *end;
-};
-typedef struct token_pos token_pos_t;
-
-#define dTOKENS(init_lim) \
- token_pos_t token_buf[init_lim]; \
- int token_lim = init_lim; \
- token_pos_t *tokens = token_buf; \
- int num_tokens = 0
-
-#define PUSH_TOKEN(p_beg, p_end) \
- STMT_START { \
- ++num_tokens; \
- if (num_tokens == token_lim) \
- tokens_grow(&tokens, &token_lim, (bool)(tokens != token_buf)); \
- tokens[num_tokens-1].beg = p_beg; \
- tokens[num_tokens-1].end = p_end; \
- } STMT_END
-
-#define FREE_TOKENS \
- STMT_START { \
- if (tokens != token_buf) \
- Safefree(tokens); \
- } STMT_END
-
-static void
-tokens_grow(token_pos_t **token_ptr, int *token_lim_ptr, bool tokens_on_heap)
-{
- int new_lim = *token_lim_ptr;
- if (new_lim < 4)
- new_lim = 4;
- new_lim *= 2;
-
- if (tokens_on_heap) {
- Renew(*token_ptr, new_lim, token_pos_t);
- }
- else {
- token_pos_t *new_tokens;
- int i;
- New(57, new_tokens, new_lim, token_pos_t);
- for (i = 0; i < *token_lim_ptr; i++)
- new_tokens[i] = (*token_ptr)[i];
- *token_ptr = new_tokens;
- }
- *token_lim_ptr = new_lim;
-}
diff --git a/ext/HTML/Parser/typemap b/ext/HTML/Parser/typemap
deleted file mode 100644
index a323854..0000000
--- a/ext/HTML/Parser/typemap
+++ /dev/null
@@ -1,5 +0,0 @@
-PSTATE* T_PSTATE
-
-INPUT
-T_PSTATE
- $var = get_pstate_hv(aTHX_ $arg)
diff --git a/ext/HTML/Parser/util.c b/ext/HTML/Parser/util.c
deleted file mode 100644
index 7e626bf..0000000
--- a/ext/HTML/Parser/util.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/* $Id: util.c,v 2.30 2006/03/22 09:15:17 gisle Exp $
- *
- * Copyright 1999-2006, Gisle Aas.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the same terms as Perl itself.
- */
-
-#ifndef EXTERN
-#define EXTERN extern
-#endif
-
-
-EXTERN SV*
-sv_lower(pTHX_ SV* sv)
-{
- STRLEN len;
- char *s = SvPV_force(sv, len);
- for (; len--; s++)
- *s = toLOWER(*s);
- return sv;
-}
-
-EXTERN int
-strnEQx(const char* s1, const char* s2, STRLEN n, int ignore_case)
-{
- while (n--) {
- if (ignore_case) {
- if (toLOWER(*s1) != toLOWER(*s2))
- return 0;
- }
- else {
- if (*s1 != *s2)
- return 0;
- }
- s1++;
- s2++;
- }
- return 1;
-}
-
-static void
-grow_gap(pTHX_ SV* sv, STRLEN grow, char** t, char** s, char** e)
-{
- /*
- SvPVX ---> AAAAAA...BBBBBB
- ^ ^ ^
- t s e
- */
- STRLEN t_offset = *t - SvPVX(sv);
- STRLEN s_offset = *s - SvPVX(sv);
- STRLEN e_offset = *e - SvPVX(sv);
-
- SvGROW(sv, e_offset + grow + 1);
-
- *t = SvPVX(sv) + t_offset;
- *s = SvPVX(sv) + s_offset;
- *e = SvPVX(sv) + e_offset;
-
- Move(*s, *s+grow, *e - *s, char);
- *s += grow;
- *e += grow;
-}
-
-EXTERN SV*
-decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix)
-{
- STRLEN len;
- char *s = SvPV_force(sv, len);
- char *t = s;
- char *end = s + len;
- char *ent_start;
-
- char *repl;
- STRLEN repl_len;
-#ifdef UNICODE_HTML_PARSER
- char buf[UTF8_MAXLEN];
- int repl_utf8;
- int high_surrogate = 0;
-#else
- char buf[1];
-#endif
-
-#if defined(__GNUC__) && defined(UNICODE_HTML_PARSER)
- /* gcc -Wall reports this variable as possibly used uninitialized */
- repl_utf8 = 0;
-#endif
-
- while (s < end) {
- assert(t <= s);
-
- if ((*t++ = *s++) != '&')
- continue;
-
- ent_start = s;
- repl = 0;
-
- if (*s == '#') {
- UV num = 0;
- UV prev = 0;
- int ok = 0;
- s++;
- if (*s == 'x' || *s == 'X') {
- s++;
- while (*s) {
- char *tmp = strchr(PL_hexdigit, *s);
- if (!tmp)
- break;
- num = num << 4 | ((tmp - PL_hexdigit) & 15);
- if (prev && num <= prev) {
- /* overflow */
- ok = 0;
- break;
- }
- prev = num;
- s++;
- ok = 1;
- }
- }
- else {
- while (isDIGIT(*s)) {
- num = num * 10 + (*s - '0');
- if (prev && num < prev) {
- /* overflow */
- ok = 0;
- break;
- }
- prev = num;
- s++;
- ok = 1;
- }
- }
- if (ok) {
-#ifdef UNICODE_HTML_PARSER
- if (!SvUTF8(sv) && num <= 255) {
- buf[0] = (char) num;
- repl = buf;
- repl_len = 1;
- repl_utf8 = 0;
- }
- else {
- char *tmp;
- if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */
- if (high_surrogate != 0) {
- t -= 3; /* Back up past 0xFFFD */
- num = ((high_surrogate - 0xD800) << 10) +
- (num - 0xDC00) + 0x10000;
- high_surrogate = 0;
- } else {
- num = 0xFFFD;
- }
- }
- else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */
- high_surrogate = num;
- num = 0xFFFD;
- }
- else {
- high_surrogate = 0;
- /* otherwise invalid? */
- if ((num >= 0xFDD0 && num <= 0xFDEF) ||
- ((num & 0xFFFE) == 0xFFFE) ||
- num > 0x10FFFF)
- {
- num = 0xFFFD;
- }
- }
-
- tmp = (char*)uvuni_to_utf8((U8*)buf, num);
- repl = buf;
- repl_len = tmp - buf;
- repl_utf8 = 1;
- }
-#else
- if (num <= 255) {
- buf[0] = (char) num & 0xFF;
- repl = buf;
- repl_len = 1;
- }
-#endif
- }
- }
- else {
- char *ent_name = s;
- while (isALNUM(*s))
- s++;
- if (ent_name != s && entity2char) {
- SV** svp;
- if ( (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) ||
- (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0)))
- )
- {
- repl = SvPV(*svp, repl_len);
-#ifdef UNICODE_HTML_PARSER
- repl_utf8 = SvUTF8(*svp);
-#endif
- }
- else if (expand_prefix) {
- char *ss = s - 1;
- while (ss > ent_name) {
- svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0);
- if (svp) {
- repl = SvPV(*svp, repl_len);
-#ifdef UNICODE_HTML_PARSER
- repl_utf8 = SvUTF8(*svp);
-#endif
- s = ss;
- break;
- }
- ss--;
- }
- }
- }
-#ifdef UNICODE_HTML_PARSER
- high_surrogate = 0;
-#endif
- }
-
- if (repl) {
- char *repl_allocated = 0;
- if (*s == ';')
- s++;
- t--; /* '&' already copied, undo it */
-
-#ifdef UNICODE_HTML_PARSER
- if (*s != '&') {
- high_surrogate = 0;
- }
-
- if (!SvUTF8(sv) && repl_utf8) {
- /* need to upgrade sv before we continue */
- STRLEN before_gap_len = t - SvPVX(sv);
- char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len);
- STRLEN after_gap_len = end - s;
- char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len);
-
- sv_setpvn(sv, before_gap, before_gap_len);
- sv_catpvn(sv, after_gap, after_gap_len);
- SvUTF8_on(sv);
-
- Safefree(before_gap);
- Safefree(after_gap);
-
- s = t = SvPVX(sv) + before_gap_len;
- end = SvPVX(sv) + before_gap_len + after_gap_len;
- }
- else if (SvUTF8(sv) && !repl_utf8) {
- repl = (char*)bytes_to_utf8((U8*)repl, &repl_len);
- repl_allocated = repl;
- }
-#endif
-
- if (t + repl_len > s) {
- /* need to grow the string */
- grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end);
- }
-
- /* copy replacement string into string */
- while (repl_len--)
- *t++ = *repl++;
-
- if (repl_allocated)
- Safefree(repl_allocated);
- }
- else {
- while (ent_start < s)
- *t++ = *ent_start++;
- }
- }
-
- *t = '\0';
- SvCUR_set(sv, t - SvPVX(sv));
-
- return sv;
-}
-
-#ifdef UNICODE_HTML_PARSER
-static bool
-has_hibit(char *s, char *e)
-{
- while (s < e) {
- U8 ch = *s++;
- if (!UTF8_IS_INVARIANT(ch)) {
- return 1;
- }
- }
- return 0;
-}
-
-
-EXTERN bool
-probably_utf8_chunk(pTHX_ char *s, STRLEN len)
-{
- char *e = s + len;
- STRLEN clen;
-
- /* ignore partial utf8 char at end of buffer */
- while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1)))
- e--;
- if (s < e && UTF8_IS_START((U8)*(e - 1)))
- e--;
- clen = len - (e - s);
- if (clen && UTF8SKIP(e) == clen) {
- /* all promised continuation bytes are present */
- e = s + len;
- }
-
- if (!has_hibit(s, e))
- return 0;
-
- return is_utf8_string((U8*)s, e - s);
-}
-#endif
diff --git a/lib/HTML/Tagset.pm b/lib/HTML/Tagset.pm
deleted file mode 100644
index 754137f..0000000
--- a/lib/HTML/Tagset.pm
+++ /dev/null
@@ -1,471 +0,0 @@
-package HTML::Tagset;
-
-use strict;
-
-=head1 NAME
-
-HTML::Tagset - data tables useful in parsing HTML
-
-=head1 VERSION
-
-Version 3.20
-
-=cut
-
-use vars qw( $VERSION );
-
-$VERSION = '3.20';
-
-=head1 SYNOPSIS
-
- use HTML::Tagset;
- # Then use any of the items in the HTML::Tagset package
- # as need arises
-
-=head1 DESCRIPTION
-
-This module contains several data tables useful in various kinds of
-HTML parsing operations.
-
-Note that all tag names used are lowercase.
-
-In the following documentation, a "hashset" is a hash being used as a
-set -- the hash conveys that its keys are there, and the actual values
-associated with the keys are not significant. (But what values are
-there, are always true.)
-
-=cut
-
-use vars qw(
- $VERSION
- %emptyElement %optionalEndTag %linkElements %boolean_attr
- %isHeadElement %isBodyElement %isPhraseMarkup
- %is_Possible_Strict_P_Content
- %isHeadOrBodyElement
- %isList %isTableElement %isFormElement
- %isKnown %canTighten
- @p_closure_barriers
- %isCDATA_Parent
-);
-
-=head1 VARIABLES
-
-Note that none of these variables are exported.
-
-=head2 hashset %HTML::Tagset::emptyElement
-
-This hashset has as values the tag-names (GIs) of elements that cannot
-have content. (For example, "base", "br", "hr".) So
-C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
-C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
-
-=cut
-
-%emptyElement = map {; $_ => 1 } qw(base link meta isindex
- img br hr wbr
- input area param
- embed bgsound spacer
- basefont col frame
- ~comment ~literal
- ~declaration ~pi
- );
-# The "~"-initial names are for pseudo-elements used by HTML::Entities
-# and TreeBuilder
-
-=head2 hashset %HTML::Tagset::optionalEndTag
-
-This hashset lists tag-names for elements that can have content, but whose
-end-tags are generally, "safely", omissible. Example:
-C<$HTML::Tagset::emptyElement{'li'}> exists and is true.
-
-=cut
-
-%optionalEndTag = map {; $_ => 1 } qw(p li dt dd); # option th tr td);
-
-=head2 hash %HTML::Tagset::linkElements
-
-Values in this hash are tagnames for elements that might contain
-links, and the value for each is a reference to an array of the names
-of attributes whose values can be links.
-
-=cut
-
-%linkElements =
-(
- 'a' => ['href'],
- 'applet' => ['archive', 'codebase', 'code'],
- 'area' => ['href'],
- 'base' => ['href'],
- 'bgsound' => ['src'],
- 'blockquote' => ['cite'],
- 'body' => ['background'],
- 'del' => ['cite'],
- 'embed' => ['pluginspage', 'src'],
- 'form' => ['action'],
- 'frame' => ['src', 'longdesc'],
- 'iframe' => ['src', 'longdesc'],
- 'ilayer' => ['background'],
- 'img' => ['src', 'lowsrc', 'longdesc', 'usemap'],
- 'input' => ['src', 'usemap'],
- 'ins' => ['cite'],
- 'isindex' => ['action'],
- 'head' => ['profile'],
- 'layer' => ['background', 'src'],
- 'link' => ['href'],
- 'object' => ['classid', 'codebase', 'data', 'archive', 'usemap'],
- 'q' => ['cite'],
- 'script' => ['src', 'for'],
- 'table' => ['background'],
- 'td' => ['background'],
- 'th' => ['background'],
- 'tr' => ['background'],
- 'xmp' => ['href'],
-);
-
-=head2 hash %HTML::Tagset::boolean_attr
-
-This hash (not hashset) lists what attributes of what elements can be
-printed without showing the value (for example, the "noshade" attribute
-of "hr" elements). For elements with only one such attribute, its value
-is simply that attribute name. For elements with many such attributes,
-the value is a reference to a hashset containing all such attributes.
-
-=cut
-
-%boolean_attr = (
-# TODO: make these all hashes
- 'area' => 'nohref',
- 'dir' => 'compact',
- 'dl' => 'compact',
- 'hr' => 'noshade',
- 'img' => 'ismap',
- 'input' => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 },
- 'menu' => 'compact',
- 'ol' => 'compact',
- 'option' => 'selected',
- 'select' => 'multiple',
- 'td' => 'nowrap',
- 'th' => 'nowrap',
- 'ul' => 'compact',
-);
-
-#==========================================================================
-# List of all elements from Extensible HTML version 1.0 Transitional DTD:
-#
-# a abbr acronym address applet area b base basefont bdo big
-# blockquote body br button caption center cite code col colgroup
-# dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
-# head hr html i iframe img input ins isindex kbd label legend li
-# link map menu meta noframes noscript object ol optgroup option p
-# param pre q s samp script select small span strike strong style
-# sub sup table tbody td textarea tfoot th thead title tr tt u ul
-# var
-#
-# Varia from Mozilla source internal table of tags:
-# Implemented:
-# xmp listing wbr nobr frame frameset noframes ilayer
-# layer nolayer spacer embed multicol
-# But these are unimplemented:
-# sound?? keygen?? server??
-# Also seen here and there:
-# marquee?? app?? (both unimplemented)
-#==========================================================================
-
-=head2 hashset %HTML::Tagset::isPhraseMarkup
-
-This hashset contains all phrasal-level elements.
-
-=cut
-
-%isPhraseMarkup = map {; $_ => 1 } qw(
- span abbr acronym q sub sup
- cite code em kbd samp strong var dfn strike
- b i u s tt small big
- a img br
- wbr nobr blink
- font basefont bdo
- spacer embed noembed
-); # had: center, hr, table
-
-
-=head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content
-
-This hashset contains all phrasal-level elements that be content of a
-P element, for a strict model of HTML.
-
-=cut
-
-%is_Possible_Strict_P_Content = (
- %isPhraseMarkup,
- %isFormElement,
- map {; $_ => 1} qw( object script map )
- # I've no idea why there's these latter exceptions.
- # I'm just following the HTML4.01 DTD.
-);
-
-#from html4 strict:
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-
-=head2 hashset %HTML::Tagset::isHeadElement
-
-This hashset contains all elements that elements that should be
-present only in the 'head' element of an HTML document.
-
-=cut
-
-%isHeadElement = map {; $_ => 1 }
- qw(title base link meta isindex script style object bgsound);
-
-=head2 hashset %HTML::Tagset::isList
-
-This hashset contains all elements that can contain "li" elements.
-
-=cut
-
-%isList = map {; $_ => 1 } qw(ul ol dir menu);
-
-=head2 hashset %HTML::Tagset::isTableElement
-
-This hashset contains all elements that are to be found only in/under
-a "table" element.
-
-=cut
-
-%isTableElement = map {; $_ => 1 }
- qw(tr td th thead tbody tfoot caption col colgroup);
-
-=head2 hashset %HTML::Tagset::isFormElement
-
-This hashset contains all elements that are to be found only in/under
-a "form" element.
-
-=cut
-
-%isFormElement = map {; $_ => 1 }
- qw(input select option optgroup textarea button label);
-
-=head2 hashset %HTML::Tagset::isBodyMarkup
-
-This hashset contains all elements that are to be found only in/under
-the "body" element of an HTML document.
-
-=cut
-
-%isBodyElement = map {; $_ => 1 } qw(
- h1 h2 h3 h4 h5 h6
- p div pre plaintext address blockquote
- xmp listing
- center
-
- multicol
- iframe ilayer nolayer
- bgsound
-
- hr
- ol ul dir menu li
- dl dt dd
- ins del
-
- fieldset legend
-
- map area
- applet param object
- isindex script noscript
- table
- center
- form
- ),
- keys %isFormElement,
- keys %isPhraseMarkup, # And everything phrasal
- keys %isTableElement,
-;
-
-
-=head2 hashset %HTML::Tagset::isHeadOrBodyElement
-
-This hashset includes all elements that I notice can fall either in
-the head or in the body.
-
-=cut
-
-%isHeadOrBodyElement = map {; $_ => 1 }
- qw(script isindex style object map area param noscript bgsound);
- # i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
-
-
-=head2 hashset %HTML::Tagset::isKnown
-
-This hashset lists all known HTML elements.
-
-=cut
-
-%isKnown = (%isHeadElement, %isBodyElement,
- map{; $_=>1 }
- qw( head body html
- frame frameset noframes
- ~comment ~pi ~directive ~literal
-));
- # that should be all known tags ever ever
-
-
-=head2 hashset %HTML::Tagset::canTighten
-
-This hashset lists elements that might have ignorable whitespace as
-children or siblings.
-
-=cut
-
-%canTighten = %isKnown;
-delete @canTighten{
- keys(%isPhraseMarkup), 'input', 'select',
- 'xmp', 'listing', 'plaintext', 'pre',
-};
- # xmp, listing, plaintext, and pre are untightenable, and
- # in a really special way.
-@canTighten{'hr','br'} = (1,1);
- # exceptional 'phrasal' things that ARE subject to tightening.
-
-# The one case where I can think of my tightening rules failing is:
-# foo bar
baz quux ...
-# ^-- that would get deleted.
-# But that's pretty gruesome code anyhow. You gets what you pays for.
-
-#==========================================================================
-
-=head2 array @HTML::Tagset::p_closure_barriers
-
-This array has a meaning that I have only seen a need for in
-C, but I include it here on the off chance that someone
-might find it of use:
-
-When we see a "EpE" token, we go lookup up the lineage for a p
-element we might have to minimize. At first sight, we might say that
-if there's a p anywhere in the lineage of this new p, it should be
-closed. But that's wrong. Consider this document:
-
-
-
- foo
-
-
- foo
-
-