å

From: Steve Peters Date: Thu, 5 Jun 2008 01:58:26 +0000 (+0000) Subject: Assimilate HTML-Parser and HTML-Tagset. HTML-Parser is now a prereq X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=42e4baff3065f0219e40d48113b9180ea9333dbc;p=p5sagit%2Fp5-mst-13.2.git Assimilate HTML-Parser and HTML-Tagset. HTML-Parser is now a prereq for Pod-Simple and HTML-Tagset is a prereq for HTML-Parser. I also resorted the MANIFEST file. p4raw-id: //depot/perl@33998 --- diff --git a/MANIFEST b/MANIFEST index b4bba83..51c1d3d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -75,8 +75,8 @@ ext/attrs/t/attrs.t See if attrs works with C_{ext/B/B/Concise.pm Compiler Concise backend
ext/B/B/Debug.pm Compiler Debug backend
ext/B/B/Deparse.pm Compiler Deparse backend
-ext/B/B/Lint.pm Compiler Lint backend
ext/B/B/Lint/Debug.pm Adds debugging stringification to B::
+ext/B/B/Lint.pm Compiler Lint backend
ext/B/B.pm Compiler backend support functions and methods
ext/B/B/Showlex.pm Compiler Showlex backend
ext/B/B/Terse.pm Compiler Terse backend
@@ -350,6 +350,7 @@ ext/Devel/PPPort/t/MY_CXT.t Devel::PPPort test file
ext/Devel/PPPort/t/newCONSTSUB.t Devel::PPPort test file
ext/Devel/PPPort/t/newRV.t Devel::PPPort test file
ext/Devel/PPPort/t/newSVpv.t Devel::PPPort test file
+ext/Devel/PPPort/TODO Devel::PPPort Todo
ext/Devel/PPPort/t/podtest.t Devel::PPPort test file
ext/Devel/PPPort/t/ppphtest.t Devel::PPPort test file
ext/Devel/PPPort/t/pvs.t Devel::PPPort test file
@@ -365,7 +366,6 @@ ext/Devel/PPPort/t/threads.t Devel::PPPort test file
ext/Devel/PPPort/t/uv.t Devel::PPPort test file
ext/Devel/PPPort/t/variables.t Devel::PPPort test file
ext/Devel/PPPort/t/warn.t Devel::PPPort test file
-ext/Devel/PPPort/TODO Devel::PPPort Todo
ext/Devel/PPPort/typemap Devel::PPPort Typemap
ext/Digest/MD5/Changes Digest::MD5 extension changes
ext/Digest/MD5/hints/dec_osf.pl Hints for named architecture
@@ -693,6 +693,71 @@ ext/Hash/Util/lib/Hash/Util.pm Hash::Util
ext/Hash/Util/Makefile.PL Makefile for Hash::Util
ext/Hash/Util/t/Util.t See if Hash::Util works
ext/Hash/Util/Util.xs XS bits of Hash::Util
+ext/HTML/Parser/hints/solaris.pl files for HTML::Parser
+ext/HTML/Parser/hparser.c files for HTML::Parser
+ext/HTML/Parser/hparser.h files for HTML::Parser
+ext/HTML/Parser/lib/HTML/Entities.pm file for HTML::Entities
+ext/HTML/Parser/lib/HTML/Filter.pm file for HTML::Filter
+ext/HTML/Parser/lib/HTML/HeadParser.pm file for HTML::HeadParser
+ext/HTML/Parser/lib/HTML/LinkExtor.pm file for HTML::LinkExtor
+ext/HTML/Parser/lib/HTML/PullParser.pm file for HTML::PullParser
+ext/HTML/Parser/lib/HTML/TokeParser.pm file for HTML::TokeParser
+ext/HTML/Parser/Makefile.PL files for HTML::Parser
+ext/HTML/Parser/mkhctype files for HTML::Parser
+ext/HTML/Parser/mkpfunc files for HTML::Parser
+ext/HTML/Parser/Parser.pm files for HTML::Parser
+ext/HTML/Parser/Parser.xs files for HTML::Parser
+ext/HTML/Parser/t/api_version.t test for HTML::Parser
+ext/HTML/Parser/t/argspec2.t test for HTML::Parser
+ext/HTML/Parser/t/argspec-bad.t test for HTML::Parser
+ext/HTML/Parser/t/argspec.t test for HTML::Parser
+ext/HTML/Parser/t/attr-encoded.t test for HTML::Parser
+ext/HTML/Parser/t/callback.t test for HTML::Parser
+ext/HTML/Parser/t/case-sensitive.t test for HTML::Parser
+ext/HTML/Parser/t/cases.t test for HTML::Parser
+ext/HTML/Parser/t/comment.t test for HTML::Parser
+ext/HTML/Parser/t/crashme.t test for HTML::Parser
+ext/HTML/Parser/t/declaration.t test for HTML::Parser
+ext/HTML/Parser/t/default.t test for HTML::Parser
+ext/HTML/Parser/t/document.t test for HTML::Parser
+ext/HTML/Parser/t/dtext.t test for HTML::Parser
+ext/HTML/Parser/t/entities2.t test for HTML::Parser
+ext/HTML/Parser/t/entities.t test for HTML::Parser
+ext/HTML/Parser/t/filter-methods.t test for HTML::Parser
+ext/HTML/Parser/t/filter.t test for HTML::Parser
+ext/HTML/Parser/t/handler-eof.t test for HTML::Parser
+ext/HTML/Parser/t/handler.t test for HTML::Parser
+ext/HTML/Parser/t/headparser-http.t test for HTML::Parser
+ext/HTML/Parser/t/headparser.t test for HTML::Parser
+ext/HTML/Parser/t/ignore.t test for HTML::Parser
+ext/HTML/Parser/t/largetags.t test for HTML::Parser
+ext/HTML/Parser/t/linkextor-base.t test for HTML::Parser
+ext/HTML/Parser/t/linkextor-rel.t test for HTML::Parser
+ext/HTML/Parser/t/magic.t test for HTML::Parser
+ext/HTML/Parser/t/marked-sect.t test for HTML::Parser
+ext/HTML/Parser/t/msie-compat.t test for HTML::Parser
+ext/HTML/Parser/t/offset.t test for HTML::Parser
+ext/HTML/Parser/tokenpos.h files for HTML::Parser
+ext/HTML/Parser/t/options.t test for HTML::Parser
+ext/HTML/Parser/t/parsefile.t test for HTML::Parser
+ext/HTML/Parser/t/parser.t test for HTML::Parser
+ext/HTML/Parser/t/plaintext.t test for HTML::Parser
+ext/HTML/Parser/t/pod.t test for HTML::Parser
+ext/HTML/Parser/t/process.t test for HTML::Parser
+ext/HTML/Parser/t/pullparser.t test for HTML::Parser
+ext/HTML/Parser/t/script.t test for HTML::Parser
+ext/HTML/Parser/t/skipped-text.t test for HTML::Parser
+ext/HTML/Parser/t/stack-realloc.t test for HTML::Parser
+ext/HTML/Parser/t/textarea.t test for HTML::Parser
+ext/HTML/Parser/t/threads.t test for HTML::Parser
+ext/HTML/Parser/t/tokeparser.t test for HTML::Parser
+ext/HTML/Parser/t/uentities.t test for HTML::Parser
+ext/HTML/Parser/t/unbroken-text.t test for HTML::Parser
+ext/HTML/Parser/t/unicode-bom.t test for HTML::Parser
+ext/HTML/Parser/t/unicode.t test for HTML::Parser
+ext/HTML/Parser/t/xml-mode.t test for HTML::Parser
+ext/HTML/Parser/typemap files for HTML::Parser
+ext/HTML/Parser/util.c files for HTML::Parser
ext/I18N/Langinfo/fallback/const-c.inc I18N::Langinfo
ext/I18N/Langinfo/fallback/const-xs.inc I18N::Langinfo
ext/I18N/Langinfo/Langinfo.pm I18N::Langinfo
@@ -848,12 +913,12 @@ ext/IPC/SysV/README IPC::SysV README
ext/IPC/SysV/regen.pl IPC::SysV file regeneration script
ext/IPC/SysV/SysV.xs IPC::SysV extension Perl module
ext/IPC/SysV/t/ipcsysv.t IPC::SysV test file
-ext/IPC/SysV/t/pod.t IPC::SysV test file
-ext/IPC/SysV/t/podcov.t IPC::SysV test file
ext/IPC/SysV/t/msg.t IPC::SysV test file
+ext/IPC/SysV/TODO IPC::SysV todo file
+ext/IPC/SysV/t/podcov.t IPC::SysV test file
+ext/IPC/SysV/t/pod.t IPC::SysV test file
ext/IPC/SysV/t/sem.t IPC::SysV test file
ext/IPC/SysV/t/shm.t IPC::SysV test file
-ext/IPC/SysV/TODO IPC::SysV todo file
ext/IPC/SysV/typemap IPC::SysV typemap
ext/List/Util/Changes Util extension
ext/List/Util/lib/List/Util.pm List::Util
@@ -1478,9 +1543,9 @@ lib/Attribute/Handlers/t/data_convert.t Test attribute data conversion
lib/Attribute/Handlers/t/linerep.t See if Attribute::Handlers works
lib/Attribute/Handlers/t/multi.t See if Attribute::Handlers works
lib/attributes.pm For "sub foo : attrlist"
+lib/AutoLoader.pm Autoloader base class
lib/AutoLoader/t/01AutoLoader.t See if AutoLoader works
lib/AutoLoader/t/02AutoSplit.t See if AutoSplit works
-lib/AutoLoader.pm Autoloader base class
lib/AutoSplit.pm Split up autoload functions
lib/autouse.pm Load and call a function only when it's used
lib/autouse.t See if autouse works
@@ -1582,8 +1647,8 @@ lib/CGI/t/start_end_end.t See if CGI.pm works
lib/CGI/t/start_end_start.t See if CGI.pm works
lib/CGI/t/switch.t See if CGI::Switch still loads
lib/CGI/t/uploadInfo.t See if CGI.pm works
-lib/CGI/t/upload.t See if CGI.pm works
lib/CGI/t/upload_post_text.txt.packed Test data for CGI.pm
+lib/CGI/t/upload.t See if CGI.pm works
lib/CGI/t/util-58.t See if 5.8-dependent features work
lib/CGI/t/util.t See if CGI.pm works
lib/CGI/Util.pm Utility functions
@@ -1839,10 +1904,10 @@ lib/ExtUtils/t/eu_command.t See if ExtUtils::Command works
lib/ExtUtils/t/FIRST_MAKEFILE.t See if FIRST_MAKEFILE works
lib/ExtUtils/t/fixin.t See if ExtUtils::MakeMaker works
lib/ExtUtils/t/hints.t See if hint files are honored.
+lib/ExtUtils/t/Installapi2.t See if new api for ExtUtils::Install::install() works
lib/ExtUtils/t/INSTALL_BASE.t Test INSTALL_BASE in MakeMaker
lib/ExtUtils/t/Installed.t See if ExtUtils::Installed works
lib/ExtUtils/t/Install.t See if ExtUtils::Install works
-lib/ExtUtils/t/Installapi2.t See if new api for ExtUtils::Install::install() works
lib/ExtUtils/t/INST_PREFIX.t See if MakeMaker can apply PREFIXs
lib/ExtUtils/t/INST.t Check MakeMaker INST_* macros
lib/ExtUtils/t/Liblist.t See if ExtUtils::Liblist works
@@ -1972,6 +2037,9 @@ lib/Getopt/Std.t See if Getopt::Std and Getopt::Long work
lib/h2ph.t See if h2ph works like it should
lib/h2xs.t See if h2xs produces expected lists of files
lib/hostname.pl Old hostname code
+lib/HTML/Tagset.pm HTML::Tagset
+lib/HTML/Tagset/t/00_about_verbose.t HTML::Tagset
+lib/HTML/Tagset/t/01_old_junk.t HTML::Tagset
lib/I18N/Collate.pm Routines to do strxfrm-based collation
lib/I18N/Collate.t See if I18N::Collate works
lib/I18N/LangTags/ChangeLog I18N::LangTags
@@ -2347,8 +2415,8 @@ lib/parent.pm Establish an ISA relationship with base classes at compile time
lib/parent/t/compile-time-file.t tests for parent.pm
lib/parent/t/compile-time.t tests for parent.pm
lib/parent/t/lib/Dummy2.plugin test files for parent.pm
-lib/parent/t/lib/Dummy.pm test files for parent.pm
lib/parent/t/lib/Dummy/Outside.pm test files for parent.pm
+lib/parent/t/lib/Dummy.pm test files for parent.pm
lib/parent/t/lib/FileThatOnlyExistsAsPMC.pmc test files for parent.pm
lib/parent/t/lib/ReturnsFalse.pm test files for parent.pm
lib/parent/t/parent-classfromclassfile.t tests for parent.pm
@@ -2565,9 +2633,9 @@ lib/Pod/Simple/t/testlib3/squaa/Vliff.pm Pod::Simple test file
lib/Pod/Simple/t/tiedfh.t Pod::Simple test file
lib/Pod/Simple/t/verbatim.t Pod::Simple test file
lib/Pod/Simple/t/verb_fmt.t Pod::Simple test file
-lib/Pod/Simple/t/x_nixer.t Pod::Simple test file
lib/Pod/Simple/t/xhtml01.t Pod::Simple test file
lib/Pod/Simple/t/xhtml05.t Pod::Simple test file
+lib/Pod/Simple/t/x_nixer.t Pod::Simple test file
lib/Pod/Simple/XHTML.pm turn Pod into XHTML
lib/Pod/Simple/XMLOutStream.pm turn Pod into XML
lib/Pod/t/basic.cap podlators test
@@ -2593,8 +2661,8 @@ lib/Pod/t/htmllink.t pod2html link test
lib/Pod/t/htmlview.pod pod2html render test input data
lib/Pod/t/htmlview.t pod2html render test
lib/Pod/t/InputObjects.t See if Pod::InputObjects works
-lib/Pod/t/man.t podlators test
lib/Pod/t/man-options.t podlators test
+lib/Pod/t/man.t podlators test
lib/Pod/t/parselink.t podlators test
lib/Pod/t/pod2html-lib.pl pod2html testing library
lib/Pod/t/pod2latex.t See if Pod::LaTeX works
@@ -2612,9 +2680,9 @@ lib/Search/Dict.pm Perform binary search on dictionaries
lib/Search/Dict.t See if Search::Dict works
lib/SelectSaver.pm Enforce proper select scoping
lib/SelectSaver.t See if SelectSaver works
-lib/SelfLoader/t/02SelfLoader-buggy.t See if SelfLoader works
lib/SelfLoader.pm Load functions only on demand
lib/SelfLoader/t/01SelfLoader.t See if SelfLoader works
+lib/SelfLoader/t/02SelfLoader-buggy.t See if SelfLoader works
lib/Shell.pm Make AUTOLOADed system() calls
lib/Shell.t Tests for above
lib/shellwords.pl Perl library to split into words with shell quoting
@@ -2637,29 +2705,29 @@ lib/syslog.pl Perl library supporting syslogging
lib/tainted.pl Old code for tainting
lib/TAP/Base.pm A parser for Test Anything Protocol
lib/TAP/Formatter/Color.pm A parser for Test Anything Protocol
-lib/TAP/Formatter/Console.pm A parser for Test Anything Protocol
lib/TAP/Formatter/Console/ParallelSession.pm A parser for Test Anything Protocol
+lib/TAP/Formatter/Console.pm A parser for Test Anything Protocol
lib/TAP/Formatter/Console/Session.pm A parser for Test Anything Protocol
lib/TAP/Harness.pm A parser for Test Anything Protocol
-lib/TAP/Parser.pm A parser for Test Anything Protocol
lib/TAP/Parser/Aggregator.pm A parser for Test Anything Protocol
lib/TAP/Parser/Grammar.pm A parser for Test Anything Protocol
-lib/TAP/Parser/Iterator.pm A parser for Test Anything Protocol
lib/TAP/Parser/Iterator/Array.pm A parser for Test Anything Protocol
+lib/TAP/Parser/Iterator.pm A parser for Test Anything Protocol
lib/TAP/Parser/Iterator/Process.pm A parser for Test Anything Protocol
lib/TAP/Parser/Iterator/Stream.pm A parser for Test Anything Protocol
lib/TAP/Parser/Multiplexer.pm A parser for Test Anything Protocol
-lib/TAP/Parser/Result.pm A parser for Test Anything Protocol
+lib/TAP/Parser.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Bailout.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Comment.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Plan.pm A parser for Test Anything Protocol
+lib/TAP/Parser/Result.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Pragma.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Test.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Unknown.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Version.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/YAML.pm A parser for Test Anything Protocol
-lib/TAP/Parser/Source.pm A parser for Test Anything Protocol
lib/TAP/Parser/Source/Perl.pm A parser for Test Anything Protocol
+lib/TAP/Parser/Source.pm A parser for Test Anything Protocol
lib/TAP/Parser/Utils.pm A parser for Test Anything Protocol
lib/TAP/Parser/YAMLish/Reader.pm A parser for Test Anything Protocol
lib/TAP/Parser/YAMLish/Writer.pm A parser for Test Anything Protocol
@@ -2710,9 +2778,9 @@ lib/Test/Harness/t/nofork.t Test::Harness test
lib/Test/Harness/t/parse.t Test::Harness test
lib/Test/Harness/t/premature-bailout.t Test::Harness test
lib/Test/Harness/t/process.t Test::Harness test
-lib/Test/Harness/t/prove.t Test::Harness test
lib/Test/Harness/t/proverc.t Test::Harness test
lib/Test/Harness/t/proverun.t Test::Harness test
+lib/Test/Harness/t/prove.t Test::Harness test
lib/Test/Harness/t/regression.t Test::Harness test
lib/Test/Harness/t/results.t Test::Harness test
lib/Test/Harness/t/source.t Test::Harness test
@@ -2724,8 +2792,8 @@ lib/Test/Harness/t/testargs.t Test::Harness test
lib/Test/Harness/t/unicode.t Test::Harness test
lib/Test/Harness/t/utils.t Test::Harness test
lib/Test/Harness/t/yamlish-output.t Test::Harness test
-lib/Test/Harness/t/yamlish-writer.t Test::Harness test
lib/Test/Harness/t/yamlish.t Test::Harness test
+lib/Test/Harness/t/yamlish-writer.t Test::Harness test
lib/Test/More.pm More utilities for writing tests
lib/Test.pm A simple framework for writing test scripts
lib/Test/Simple/Changes Test::Simple changes
@@ -3173,8 +3241,8 @@ parser.h parser object header
patchlevel.h The current patch level of perl
perlapi.c Perl API functions
perlapi.h Perl API function declarations
-perldtrace.d D script for Perl probes
perl.c main()
+perldtrace.d D script for Perl probes
perl.h Global declarations
perlio.c C code for PerlIO abstraction
perlio.h PerlIO abstraction
@@ -3563,6 +3631,7 @@ t/io/through.t See if pipe passes data intact
t/io/utf8.t See if file seeking works
t/japh/abigail.t Obscure tests
t/lib/1_compile.t See if the various libraries and extensions compile
+t/lib/App/Prove/Plugin/Dummy.pm Module for testing Test::Harness
t/lib/Cname.pm Test charnames in regexes (op/pat.t)
t/lib/common.pl Helper for lib/{warnings,feature}.t
t/lib/commonsense.t See if configuration meets basic needs
@@ -3583,7 +3652,9 @@ t/lib/compress/truncate.pl Compress::Zlib
t/lib/compress/zlib-generic.pl Compress::Zlib
t/lib/contains_pod.xr Pod-Parser test file
t/lib/cygwin.t Builtin cygwin function tests
-t/lib/App/Prove/Plugin/Dummy.pm Module for testing Test::Harness
+t/lib/data/catme.1 Test data for Test::Harness
+t/lib/data/proverc Test data for Test::Harness
+t/lib/data/sample.yml Test data for Test::Harness
t/lib/Devel/switchd.pm Module for t/run/switchd.t
t/lib/Dev/Null.pm Module for testing Test::Harness
t/lib/dprof/test1_t Perl code profiler tests
@@ -3635,9 +3706,6 @@ t/lib/mypragma.t Test the example user pragma
t/lib/NoFork.pm Module for testing Test::Harness
t/lib/no_load.t Test that some modules don't load others
t/lib/proxy_constant_subs.t Test that Proxy Constant Subs behave correctly
-t/lib/data/catme.1 Test data for Test::Harness
-t/lib/data/proverc Test data for Test::Harness
-t/lib/data/sample.yml Test data for Test::Harness
t/lib/sample-tests/bailout Test data for Test::Harness
t/lib/sample-tests/bignum Test data for Test::Harness
t/lib/sample-tests/bignum_many Test data for Test::Harness
@@ -3673,10 +3741,10 @@ t/lib/sample-tests/simple Test data for Test::Harness
t/lib/sample-tests/simple_fail Test data for Test::Harness
t/lib/sample-tests/simple_yaml Test data for Test::Harness
t/lib/sample-tests/skip Test data for Test::Harness
-t/lib/sample-tests/skip_nomsg Test data for Test::Harness
t/lib/sample-tests/skipall Test data for Test::Harness
t/lib/sample-tests/skipall_nomsg Test data for Test::Harness
t/lib/sample-tests/skipall_v13 Test data for Test::Harness
+t/lib/sample-tests/skip_nomsg Test data for Test::Harness
t/lib/sample-tests/space_after_plan Test data for Test::Harness
t/lib/sample-tests/stdout_stderr Test data for Test::Harness
t/lib/sample-tests/strict Test data for Test::Harness
@@ -3704,8 +3772,8 @@ t/lib/strict/subs Tests of "use strict 'subs'" for strict.t
t/lib/strict/vars Tests of "use strict 'vars'" for strict.t
t/lib/Test/Simple/Catch.pm Utility module for testing Test::Simple
t/lib/Test/Simple/sample_tests/death_in_eval.plx for exit.t
-t/lib/Test/Simple/sample_tests/death_with_handler.plx for exit.t
t/lib/Test/Simple/sample_tests/death.plx for exit.t
+t/lib/Test/Simple/sample_tests/death_with_handler.plx for exit.t
t/lib/Test/Simple/sample_tests/exit.plx for exit.t
t/lib/Test/Simple/sample_tests/extras.plx for exit.t
t/lib/Test/Simple/sample_tests/five_fail.plx for exit.t
@@ -3791,11 +3859,11 @@ t/Module_Pluggable/20dodgy_files.t Module::Pluggable tests
t/Module_Pluggable/21editor_junk.t Module::Pluggable tests
t/Module_Pluggable/acme/Acme/MyTest/Plugin/Foo.pm Module::Pluggable tests
t/Module_Pluggable/lib/Acme/MyTest/Plugin/Foo.pm Module::Pluggable tests
-t/Module_Pluggable/lib/EditorJunk/Plugin/Foo.pm Module::Pluggable tests
-t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm~ Module::Pluggable tests
-t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm.swp Module::Pluggable tests
t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm Module::Pluggable tests
+t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm~ Module::Pluggable tests
t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm.swo Module::Pluggable tests
+t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm.swp Module::Pluggable tests
+t/Module_Pluggable/lib/EditorJunk/Plugin/Foo.pm Module::Pluggable tests
t/Module_Pluggable/lib/ExtTest/Plugin/Bar.plugin Module::Pluggable tests
t/Module_Pluggable/lib/ExtTest/Plugin/Foo.plugin Module::Pluggable tests
t/Module_Pluggable/lib/ExtTest/Plugin/Quux/Foo.plugin Module::Pluggable tests
@@ -3880,8 +3948,8 @@ t/op/die_exit.t See if die and exit status interaction works
t/op/die.t See if die works
t/op/dor.t See if defined-or (//) works
t/op/do.t See if subroutines work
-t/op/each.t See if hash iterators work
t/op/each_array.t See if array iterators work
+t/op/each.t See if hash iterators work
t/op/eval.t See if eval operator works
t/op/exec.t See if exec, system and qx work
t/op/exists_sub.t See if exists(&sub) works
@@ -4095,6 +4163,7 @@ t/uni/latin2.t See if Unicode in latin2 works
t/uni/lower.t See if Unicode casing works
t/uni/overload.t See if Unicode overloading works
t/uni/sprintf.t See if Unicode sprintf works
+t/uni/tie.t See if Unicode tie works
t/uni/title.t See if Unicode casing works
t/uni/tr_7jis.t See if Unicode tr/// in 7jis works
t/uni/tr_eucjp.t See if Unicode tr/// in eucjp works
@@ -4104,7 +4173,6 @@ t/uni/upper.t See if Unicode casing works
t/uni/write.t See if Unicode formats work
t/win32/system.t See if system works in Win*
t/win32/system_tests Test runner for system.t
-t/uni/tie.t See if Unicode tie works
t/x2p/s2p.t See if s2p/psed work
uconfig.h Configuration header for microperl
uconfig.sh Configuration script for microperl
diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl
index 5b7486f..5cae67d 100644
--- a/Porting/Maintainers.pl
+++ b/Porting/Maintainers.pl
@@ -418,6 +418,20 @@ package Maintainers;
'CPAN' => 1,
},

+ 'HTML::Parser' =>
+ {
+ 'MAINTAINER' => 'gaas',
+ 'FILES' => q[ext/HTML/Parser],
+ 'CPAN' => 1,
+ },
+
+ 'HTML::Tagset' =>
+ {
+ 'MAINTAINER' => 'petdance',
+ 'FILES' => q[lib/HTML/Tagset.pm lib/HTML/Tagset],
+ 'CPAN' => 1,
+ },
+
'I18N::LangTags' =>
{
'MAINTAINER' => 'sburke',
diff --git a/ext/HTML/Parser/Makefile.PL b/ext/HTML/Parser/Makefile.PL
new file mode 100644
index 0000000..79081f7
--- /dev/null
+++ b/ext/HTML/Parser/Makefile.PL
@@ -0,0 +1,30 @@
+# Build script for the HTML::Parser extension: writes a Makefile through
+# ExtUtils::MakeMaker and adds rules for the two generated headers.
+require 5.006;
+use strict;
+use ExtUtils::MakeMaker;
+
+# The version number is read from Parser.pm.  hctype.h and pfunc.h appear
+# in the H list but are generated files (see MY::postamble below), which
+# is why they are also listed under clean => FILES.
+WriteMakefile(
+ NAME => 'HTML::Parser',
+ VERSION_FROM => 'Parser.pm',
+ H => [ "hparser.h", "hctype.h", "tokenpos.h", "pfunc.h",
+ "hparser.c", "util.c",
+ ],
+ PREREQ_PM => {
+ 'HTML::Tagset' => 3,
+ 'Test::More' => 0, # only needed to run 'make test'
+ },
+ DEFINE => "-DMARKED_SECTION",
+ dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', },
+ clean => { FILES => 'hctype.h pfunc.h' },
+);
+
+
+# Returns extra make rules as a literal string: pfunc.h is produced by
+# running the mkpfunc script and hctype.h by running mkhctype, each with
+# $(PERL) and its output redirected into the header.
+sub MY::postamble
+{
+ '
+pfunc.h : mkpfunc
+ $(PERL) mkpfunc >pfunc.h
+
+hctype.h : mkhctype
+ $(PERL) mkhctype >hctype.h
+'
+}
diff --git a/ext/HTML/Parser/Parser.pm b/ext/HTML/Parser/Parser.pm
new file mode 100644
index 0000000..72d5a98
--- /dev/null
+++ b/ext/HTML/Parser/Parser.pm
@@ -0,0 +1,1233 @@
+package HTML::Parser;
+
+# Copyright 1996-2007, Gisle Aas.
+# Copyright 1999-2000, Michael A. Chase.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+
+use strict;
+use vars qw($VERSION @ISA);
+
+$VERSION = '3.56'; # $Date: 2007/01/12 09:18:31 $
+
+require HTML::Entities;
+
+require XSLoader;
+XSLoader::load('HTML::Parser', $VERSION);
+
+# Constructor.  Blesses an empty hash into the invocant class and passes
+# every remaining argument on to init(); whatever init() returns is the
+# constructor's return value.
+sub new
+{
+    my $class = shift;
+    return bless({}, $class)->init(@_);
+}
+
+
+# Initialise a freshly blessed parser object from the key/value pairs
+# given to new().  Allocates the parser state, selects the API version,
+# installs handlers and applies the remaining options by calling the
+# method of the same name.  Returns the object itself.
+# Croaks on an unsupported api_version (>= 4) or when a bare event name
+# (text, start, end, process, declaration, comment) is used as an option.
+sub init
+{
+ my $self = shift;
+ $self->_alloc_pstate;
+
+ my %arg = @_;
+ # Without an explicit api_version: 3 if any arguments were passed,
+ # otherwise 2.
+ my $api_version = delete $arg{api_version} || (@_ ? 3 : 2);
+ if ($api_version >= 4) {
+ require Carp;
+ Carp::croak("API version $api_version not supported " .
+ "by HTML::Parser $VERSION");
+ }
+
+ if ($api_version < 3) {
+ # Set up method callbacks compatible with HTML-Parser-2.xx
+ $self->handler(text => "text", "self,text,is_cdata");
+ $self->handler(end => "end", "self,tagname,text");
+ $self->handler(process => "process", "self,token0,text");
+ $self->handler(start => "start",
+ "self,tagname,attr,attrseq,text");
+
+ # The comment handler receives the token list and calls the
+ # comment() method once per token.
+ $self->handler(comment =>
+ sub {
+ my($self, $tokens) = @_;
+ for (@$tokens) {
+ $self->comment($_);
+ }
+ }, "self,tokens");
+
+ # substr(..., 2, -1) drops the first two characters and the last
+ # one from the text -- presumably the "<!" and ">" delimiters.
+ $self->handler(declaration =>
+ sub {
+ my $self = shift;
+ $self->declaration(substr($_[0], 2, -1));
+ }, "self,text");
+ }
+
+ # 'handlers' may be a hash ref or an array ref of event => spec pairs;
+ # each spec is an array ref that is expanded into the handler() call.
+ if (my $h = delete $arg{handlers}) {
+ $h = {@$h} if ref($h) eq "ARRAY";
+ while (my($event, $cb) = each %$h) {
+ $self->handler($event => @$cb);
+ }
+ }
+
+ # In the end we try to assume plain attribute or handler
+ while (my($option, $val) = each %arg) {
+ if ($option =~ /^(\w+)_h$/) {
+ # "<event>_h" => [ ... ] registers a handler for <event>.
+ $self->handler($1 => @$val);
+ }
+ elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) {
+ require Carp;
+ Carp::croak("Bad constructor option '$option'");
+ }
+ else {
+ # Anything else is treated as a method name and called with the
+ # value as its only argument.
+ $self->$option($val);
+ }
+ }
+
+ return $self;
+}
+
+
+# Parse a complete file.  $file is either a filename or an already open
+# file handle (a glob or a reference).  A filename is opened read-only in
+# binary mode and closed again afterwards; a handle passed in by the
+# caller is read but never closed.  The input is fed to parse() in
+# 512-byte chunks until end of file or until parse() returns false.
+# Returns undef if the file cannot be opened ($! holds the reason),
+# otherwise the return value of eof().
+sub parse_file
+{
+    my($self, $file) = @_;
+    my $opened;
+    if (!ref($file) && ref(\$file) ne "GLOB") {
+        # Assume $file is a filename
+        local(*F);
+        # Three-argument open: the name is always taken as a plain file
+        # to read, so characters such as "|", ">" or surrounding
+        # whitespace in $file cannot select another open mode.
+        open(F, "<", $file) || return undef;
+        binmode(F); # should we? good for byte counts
+        $opened++;
+        $file = *F;
+    }
+    my $chunk = '';
+    while (read($file, $chunk, 512)) {
+        $self->parse($chunk) || last;
+    }
+    close($file) if $opened;
+    $self->eof;
+}
+
+
+# Legacy accessor kept for old callers.  Always emits a deprecation
+# warning, then behaves as the logical inverse of strict_comment():
+# returns the previous (negated) setting and, when an argument is given,
+# stores its negation through strict_comment().
+sub netscape_buggy_comment # legacy
+{
+    my($self, @new_value) = @_;
+    require Carp;
+    Carp::carp("netscape_buggy_comment() is deprecated. " .
+               "Please use the strict_comment() method instead");
+    my $was_buggy = !$self->strict_comment;
+    if (@new_value) {
+        $self->strict_comment(!$new_value[0]);
+    }
+    return $was_buggy;
+}
+
+# set up method stubs
+# text() is an empty method; start, end, comment, declaration and process
+# are glob aliases of that same sub, so all six do nothing here.  These
+# are the method names init() registers as handlers when api_version < 3.
+sub text { }
+*start = \&text;
+*end = \&text;
+*comment = \&text;
+*declaration = \&text;
+*process = \&text;
+
+1;
+
+__END__
+
+
+=head1 NAME
+
+HTML::Parser - HTML parser class
+
+=head1 SYNOPSIS
+
+ use HTML::Parser ();
+
+ # Create parser object
+ $p = HTML::Parser->new( api_version => 3,
+ start_h => [\&start, "tagname, attr"],
+ end_h => [\&end, "tagname"],
+ marked_sections => 1,
+ );
+
+ # Parse document text chunk by chunk
+ $p->parse($chunk1);
+ $p->parse($chunk2);
+ #...
+ $p->eof; # signal end of document
+
+ # Parse directly from file
+ $p->parse_file("foo.html");
+ # or
+ open(my $fh, "<:utf8", "foo.html") || die;
+ $p->parse_file($fh);
+
+=head1 DESCRIPTION
+
+Objects of the C<HTML::Parser> class will recognize markup and
+separate it from plain text (alias data content) in HTML
+documents. As different kinds of markup and text are recognized, the
+corresponding event handlers are invoked.
+
+C<HTML::Parser> is not a generic SGML parser. We have tried to
+make it able to deal with the HTML that is actually "out there", and
+it normally parses as closely as possible to the way the popular web
+browsers do it instead of strictly following one of the many HTML
+specifications from W3C. Where there is disagreement, there is often
+an option that you can enable to get the official behaviour.
+
+The document to be parsed may be supplied in arbitrary chunks. This
+makes on-the-fly parsing as documents are received from the network
+possible.
+
+If event driven parsing does not feel right for your application, you
+might want to use C<HTML::PullParser>. This is an C<HTML::Parser>
+subclass that allows a more conventional program structure.
+
+
+=head1 METHODS
+
+The following method is used to construct a new C<HTML::Parser> object:
+
+=over
+
+=item $p = HTML::Parser->new( %options_and_handlers )
+
+This class method creates a new C<HTML::Parser> object and
+returns it. Key/value argument pairs may be provided to assign event
+handlers or initialize parser options. The handlers and parser
+options can also be set or modified later by the method calls described below.
+
+If a top level key is in the form "<event>_h" (e.g., "text_h") then it
+assigns a handler to that event, otherwise it initializes a parser
+option. The event handler specification value must be an array
+reference. Multiple handlers may also be assigned with the 'handlers
+=> [%handlers]' option. See examples below.
+
+If new() is called without any arguments, it will create a parser that
+uses callback methods compatible with version 2 of C<HTML::Parser>.
+See the section on "version 2 compatibility" below for details.
+
+The special constructor option 'api_version => 2' can be used to
+initialize version 2 callbacks while still setting other options and
+handlers. The 'api_version => 3' option can be used if you don't want
+to set any options and don't want to fall back to v2 compatible
+mode.
+
+Examples:
+
+ $p = HTML::Parser->new(api_version => 3,
+ text_h => [ sub {...}, "dtext" ]);
+
+This creates a new parser object with a text event handler subroutine
+that receives the original text with general entities decoded.
+
+ $p = HTML::Parser->new(api_version => 3,
+ start_h => [ 'my_start', "self,tokens" ]);
+
+This creates a new parser object with a start event handler method
+that receives the $p and the tokens array.
+
+ $p = HTML::Parser->new(api_version => 3,
+ handlers => { text => [\@array, "event,text"],
+ comment => [\@array, "event,text"],
+ });
+
+This creates a new parser object that stores the event type and the
+original text in @array for text and comment events.
+
+=back
+
+The following methods feed the HTML document
+to the C<HTML::Parser> object:
+
+=over
+
+=item $p->parse( $string )
+
+Parse $string as the next chunk of the HTML document. The return
+value is normally a reference to the parser object (i.e. $p).
+Handlers invoked should not attempt to modify the $string in-place until
+$p->parse returns.
+
+If an invoked event handler aborts parsing by calling $p->eof, then
+$p->parse() will return a FALSE value.
+
+=item $p->parse( $code_ref )
+
+If a code reference is passed as the argument to be parsed, then the
+chunks to be parsed are obtained by invoking this function repeatedly.
+Parsing continues until the function returns an empty (or undefined)
+result. When this happens $p->eof is automatically signaled.
+
+Parsing will also abort if one of the event handlers calls $p->eof.
+
+The effect of this is the same as:
+
+ while (1) {
+ my $chunk = &$code_ref();
+ if (!defined($chunk) || !length($chunk)) {
+ $p->eof;
+ return $p;
+ }
+ $p->parse($chunk) || return undef;
+ }
+
+But it is more efficient as this loop runs internally in XS code.
+
+=item $p->parse_file( $file )
+
+Parse text directly from a file. The $file argument can be a
+filename, an open file handle, or a reference to an open file
+handle.
+
+If $file contains a filename and the file can't be opened, then the
+method returns an undefined value and $! tells why it failed.
+Otherwise the return value is a reference to the parser object.
+
+If a file handle is passed as the $file argument, then the file will
+normally be read until EOF, but not closed.
+
+If an invoked event handler aborts parsing by calling $p->eof,
+then $p->parse_file() may not have read the entire file.
+
+On systems with multi-byte line terminators, the values passed for the
+offset and length argspecs may be too low if parse_file() is called on
+a file handle that is not in binary mode.
+
+If a filename is passed in, then parse_file() will open the file in
+binary mode.
+
+=item $p->eof
+
+Signals the end of the HTML document. Calling the $p->eof method
+outside a handler callback will flush any remaining buffered text
+(which triggers the C<text> event if there is any remaining text).
+
+Calling $p->eof inside a handler will terminate parsing at that point
+and cause $p->parse to return a FALSE value. This also terminates
+parsing by $p->parse_file().
+
+After $p->eof has been called, the parse() and parse_file() methods
+can be invoked to feed new documents with the parser object.
+
+The return value from eof() is a reference to the parser object.
+
+=back
+
+
+Most parser options are controlled by boolean attributes.
+Each boolean attribute is enabled by calling the corresponding method
+with a TRUE argument and disabled with a FALSE argument. The
+attribute value is left unchanged if no argument is given. The return
+value from each method is the old attribute value.
+
+Methods that can be used to get and/or set parser options are:
+
+=over
+
+=item $p->attr_encoded
+
+=item $p->attr_encoded( $bool )
+
+By default, the C<attr> and C<@attr> argspecs will have general
+entities for attribute values decoded. Enabling this attribute leaves
+entities alone.
+
+=item $p->boolean_attribute_value( $val )
+
+This method sets the value reported for boolean attributes inside HTML
+start tags. By default, the name of the attribute is also used as its
+value. This affects the values reported for C<tokens> and C<attr>
+argspecs.
+
+=item $p->case_sensitive
+
+=item $p->case_sensitive( $bool )
+
+By default, tagnames and attribute names are down-cased. Enabling this
+attribute leaves them as found in the HTML source document.
+
+=item $p->closing_plaintext
+
+=item $p->closing_plaintext( $bool )
+
+By default, "plaintext" element can never be closed. Everything up to
+the end of the document is parsed in CDATA mode. This historical
+behaviour is what at least MSIE does. Enabling this attribute makes
+closing "</plaintext>" tag effective and the parsing process will resume
+after seeing this tag. This emulates gecko-based browsers.
+
+=item $p->empty_element_tags
+
+=item $p->empty_element_tags( $bool )
+
+By default, empty element tags are not recognized as such and the "/"
+before ">" is just treated like a normal name character (unless
+C<strict_names> is enabled). Enabling this attribute makes
+C<HTML::Parser> recognize these tags.
+
+Empty element tags look like start tags, but end with the character
+sequence "/>" instead of ">". When recognized by C<HTML::Parser> they
+cause an artificial end event in addition to the start event. The
+C<text> for the artificial end event will be empty and the C<tokenpos>
+array will be undefined even though the token array will have one
+element containing the tag name.
+
+=item $p->marked_sections
+
+=item $p->marked_sections( $bool )
+
+By default, section markings like <![CDATA[...]]> are treated like
+ordinary text. When this attribute is enabled section markings are
+honoured.
+
+There are currently no events associated with the marked section
+markup, but the text can be returned as C<skipped_text>.
+
+=item $p->strict_comment
+
+=item $p->strict_comment( $bool )
+
+By default, comments are terminated by the first occurrence of "-->".
+This is the behaviour of most popular browsers (like Mozilla, Opera and
+MSIE), but it is not correct according to the official HTML
+standard. Officially, you need an even number of "--" tokens before
+the closing ">" is recognized and there may not be anything but
+whitespace between an even and an odd "--".
+
+The official behaviour is enabled by enabling this attribute.
+
+Enabling of 'strict_comment' also disables recognizing these forms as
+comments:
+
+  </ comment>
+  <! comment>
+
+
+=item $p->strict_end
+
+=item $p->strict_end( $bool )
+
+By default, attributes and other junk are allowed to be present on end tags in a
+manner that emulates MSIE's behaviour.
+
+The official behaviour is enabled with this attribute. If enabled,
+only whitespace is allowed between the tagname and the final ">".
+
+=item $p->strict_names
+
+=item $p->strict_names( $bool )
+
+By default, almost anything is allowed in tag and attribute names.
+This is the behaviour of most popular browsers and allows us to parse
+some broken tags with invalid attribute values like:
+
+   <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0>
+
+By default, "LIST]" is parsed as a boolean attribute, not as
+part of the ALT value as was clearly intended. This is also what
+Mozilla sees.
+
+The official behaviour is enabled by enabling this attribute. If
+enabled, it will cause the tag above to be reported as text
+since "LIST]" is not a legal attribute name.
+
+=item $p->unbroken_text
+
+=item $p->unbroken_text( $bool )
+
+By default, blocks of text are given to the text handler as soon as
+possible (but the parser takes care always to break text at a
+boundary between whitespace and non-whitespace so single words and
+entities can always be decoded safely). This might create breaks that
+make it hard to do transformations on the text. When this attribute is
+enabled, blocks of text are always reported in one piece. This will
+delay the text event until the following (non-text) event has been
+recognized by the parser.
+
+Note that the C<offset> argspec will give you the offset of the first
+segment of text and C<length> is the combined length of the segments.
+Since there might be ignored tags in between, these numbers can't be
+used to directly index in the original document file.
+
+=item $p->utf8_mode
+
+=item $p->utf8_mode( $bool )
+
+Enable this option when parsing raw undecoded UTF-8. This tells the
+parser that the entities expanded for strings reported by C<attr>,
+C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end
+up compatible with the surrounding text.
+
+If C<utf8_mode> is enabled then it is an error to pass strings
+containing characters with code above 255 to the parse() method, and
+the parse() method will croak if you try.
+
+Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8
+encoded. The character can also be represented by the entity
+"&hearts;" or "&#x2665;". If we feed the parser:
+
+ $p->parse("\xE2\x99\xA5&hearts;");
+
+then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without
+C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled.
+The latter string is what you want.
+
+This option is only available with perl-5.8 or better.
+
+=item $p->xml_mode
+
+=item $p->xml_mode( $bool )
+
+Enabling this attribute changes the parser to allow some XML
+constructs. This enables the behaviour controlled individually by
+the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and
+C<xml_pic> attributes and also suppresses special treatment of
+elements that are parsed as CDATA for HTML.
+
+=item $p->xml_pic
+
+=item $p->xml_pic( $bool )
+
+By default, I<processing instructions> are terminated by ">". When
+this attribute is enabled, processing instructions are terminated by
+"?>" instead.
+
+=back
+
+As markup and text is recognized, handlers are invoked. The following
+method is used to set up handlers for different events:
+
+=over
+
+=item $p->handler( event => \&subroutine, $argspec )
+
+=item $p->handler( event => $method_name, $argspec )
+
+=item $p->handler( event => \@accum, $argspec )
+
+=item $p->handler( event => "" );
+
+=item $p->handler( event => undef );
+
+=item $p->handler( event );
+
+This method assigns a subroutine, method, or array to handle an event.
+
+Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>,
+C<process>, C<start_document>, C<end_document> or C<default>.
+
+The C<\&subroutine> is a reference to a subroutine which is called to handle
+the event.
+
+The C<$method_name> is the name of a method of $p which is called to handle
+the event.
+
+The C<@accum> is an array that will hold the event information as
+sub-arrays.
+
+If the second argument is "", the event is ignored.
+If it is undef, the default handler is invoked for the event.
+
+The C<$argspec> is a string that describes the information to be reported
+for the event. Any requested information that does not apply to a
+specific event is passed as C<undef>. If argspec is omitted, then it
+is left unchanged.
+
+The return value from $p->handler is the old callback routine or a
+reference to the accumulator array.
+
+Any return values from handler callback routines/methods are always
+ignored. A handler callback can request parsing to be aborted by
+invoking the $p->eof method. A handler callback is not allowed to
+invoke the $p->parse() or $p->parse_file() method. An exception will
+be raised if it tries.
+
+Examples:
+
+ $p->handler(start => "start", 'self, attr, attrseq, text' );
+
+This causes the "start" method of object $p to be called for 'start' events.
+The callback signature is $p->start(\%attr, \@attr_seq, $text).
+
+ $p->handler(start => \&start, 'attr, attrseq, text' );
+
+This causes subroutine start() to be called for 'start' events.
+The callback signature is start(\%attr, \@attr_seq, $text).
+
+ $p->handler(start => \@accum, '"S", attr, attrseq, text' );
+
+This causes 'start' event information to be saved in @accum.
+The array elements will be ['S', \%attr, \@attr_seq, $text].
+
+ $p->handler(start => "");
+
+This causes 'start' events to be ignored. It also suppresses
+invocations of any default handler for start events. It is in most
+cases equivalent to $p->handler(start => sub {}), but is more
+efficient. It is different from the empty-sub-handler in that
+C<skipped_text> is not reset by it.
+
+ $p->handler(start => undef);
+
+This causes no handler to be associated with start events.
+If there is a default handler it will be invoked.
+
+=back
+
+Filters based on tags can be set up to limit the number of events
+reported. The main bottleneck during parsing is often the huge number
+of callbacks made from the parser. Applying filters can improve
+performance significantly.
+
+The following methods control filters:
+
+=over
+
+=item $p->ignore_elements( @tags )
+
+Both the C<start> event and the C<end> event as well as any events that
+would be reported in between are suppressed. The ignored elements can
+contain nested occurrences of themselves. Example:
+
+ $p->ignore_elements(qw(script style));
+
+The C
+
+å
+EOT
+
+$p->parse($doc)->eof;
+
+is($text, $doc);
+is($dtext, <<"EOT");
+å
+ååAAAA
+
+foo\240bar
+foo\240bar
+&xyzzy
+&xyzzy;
+
+\1
+\377
+\377
+\377G
+
+�
+�
+&
+&#
+&#x
+&aring
+
+
+å
+EOT
diff --git a/ext/HTML/Parser/t/entities.t b/ext/HTML/Parser/t/entities.t
new file mode 100644
index 0000000..b8342f5
--- /dev/null
+++ b/ext/HTML/Parser/t/entities.t
@@ -0,0 +1,193 @@
+use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);
+
+use Test::More tests => 12;
+
+$a = "Våre norske tegn bør æres";
+
+decode_entities($a);
+
+is($a, "Våre norske tegn bør æres");
+
+encode_entities($a);
+
+is($a, "Våre norske tegn bør æres");
+
+decode_entities($a);
+encode_entities_numeric($a);
+
+is($a, "Våre norske tegn bør æres");
+
+$a = "<&>\"'";
+is(encode_entities($a), "<&>"'");
+is(encode_entities_numeric($a), "<&>"'");
+
+$a = "abcdef";
+is(encode_entities($a, 'a-c'), "abcdef");
+
+
+# See how well it does against rfc1866...
+$ent = $plain = "";
+while () {
+ next unless /^\s*
+# Subject: HTML entities problem with 5.11
+# To: libwww-perl@ics.uci.edu
+# Date: Fri, 05 Sep 1997 16:56:55 +1000
+# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU>
+#
+# Hi. I've got a problem that has surfaced with the changes to
+# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening
+# in the process of encoding then decoding special entities. Eg, what goes
+# in as "abc&def&ghi" comes out as "abc&def;&ghi;".
+
+is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;");
+
+# Decoding of '
+is(decode_entities("'"), "'");
+is(encode_entities("'", "'"), "'");
+
+
+__END__
+# Quoted from rfc1866.txt
+
+14. Proposed Entities
+
+ The HTML DTD references the "Added Latin 1" entity set, which only
+ supplies named entities for a subset of the non-ASCII characters in
+ [ISO-8859-1], namely the accented characters. The following entities
+ should be supported so that all ISO 8859-1 characters may only be
+ referenced symbolically. The names for these entities are taken from
+ the appendixes of [SGML].
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Berners-Lee & Connolly Standards Track [Page 75]
+
+RFC 1866 Hypertext Markup Language - 2.0 November 1995
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Berners-Lee & Connolly Standards Track [Page 76]
+
+RFC 1866 Hypertext Markup Language - 2.0 November 1995
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/ext/HTML/Parser/t/entities2.t b/ext/HTML/Parser/t/entities2.t
new file mode 100644
index 0000000..7840c71
--- /dev/null
+++ b/ext/HTML/Parser/t/entities2.t
@@ -0,0 +1,57 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 9;
+
+use HTML::Entities qw(_decode_entities);
+
+eval {
+ _decode_entities("<", undef);
+};
+like($@, qr/^Can't inline decode readonly string/);
+
+eval {
+ my $a = "";
+ _decode_entities($a, $a);
+};
+like($@, qr/^2nd argument must be hash reference/);
+
+eval {
+ my $a = "";
+ _decode_entities($a, []);
+};
+like($@, qr/^2nd argument must be hash reference/);
+
+$a = "<";
+_decode_entities($a, undef);
+is($a, "<");
+
+_decode_entities($a, { "lt" => "<" });
+is($a, "<");
+
+my $x = "x" x 20;
+
+my $err;
+for (":", ":a", "a:", "a:a", "a:a:a", "a:::a") {
+ my $a = $_;
+ $a =~ s/:/&a;/g;
+ my $b = $_;
+ $b =~ s/:/$x/g;
+ _decode_entities($a, { "a" => $x });
+ if ($a ne $b) {
+ diag "Something went wrong with '$_'";
+ $err++;
+ }
+}
+ok(!$err);
+
+$a = "foo bar";
+_decode_entities($a, \%HTML::Entities::entity2char);
+is($a, "foo\xA0bar");
+
+$a = "foo bar";
+_decode_entities($a, \%HTML::Entities::entity2char);
+is($a, "foo bar");
+
+_decode_entities($a, \%HTML::Entities::entity2char, 1);
+is($a, "foo\xA0bar");
diff --git a/ext/HTML/Parser/t/filter-methods.t b/ext/HTML/Parser/t/filter-methods.t
new file mode 100644
index 0000000..9eccaf1
--- /dev/null
+++ b/ext/HTML/Parser/t/filter-methods.t
@@ -0,0 +1,205 @@
+#!/usr/bin/perl -w
+
+use Test::More tests => 12;
+use strict;
+
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)]);
+$p->ignore_elements("script");
+$p->unbroken_text(1);
+
+$p->handler(default => [], "event, text");
+$p->parse(<<"EOT")->eof;
+foo
+This is an italic and bold text.
+
+
+EOT
+
+my $t = join("||", map join("|", @$_), @{$p->handler("default")});
+#diag $t;
+
+is($t, "start_document|||start|||start|||start|||text|foo||end|||start|||text|
+This is an italic and bold text.
+||end|||text|
+||end|||text|
+||end_document|", 'ignore_elements');
+
+
+#------------------------------------------------------
+
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags("a");
+$p->handler(start => sub {
+ my($tagname, %attr) = @_;
+ ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start');
+ }, 'tagname, @attr');
+$p->handler(end => sub {
+ my $tagname = shift;
+ is($tagname, "a", 'report_tags end');
+ }, 'tagname');
+
+$p->parse(<eof;
+
+Next example
+
+This is very nice example.
+
+EOT
+
+
+#------------------------------------------------------
+
+my @tags;
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(a em));
+$p->ignore_tags(qw(em));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<eof;
+
+Next example
+
+This is yet another very nice example.
+
+EOT
+is(join('|', @tags), 'a', 'report_tags followed by ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1));
+$p->report_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<eof;
+
+Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1|h2', 'reset report_tags filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->ignore_tags(qw(h2));
+$p->report_tags(qw(h1 h2));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<eof;
+
+Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->ignore_tags(qw(h2));
+$p->report_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<eof;
+
+Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->report_tags(qw(h3));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<eof;
+
+Next example
+Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h3', 'report_tags replaces filter');
+
+
+#------------------------------------------------------
+
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h1 h2));
+$p->ignore_tags(qw(h3));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<eof;
+
+Next example
+Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h2));
+$p->ignore_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<eof;
+
+Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h2));
+$p->report_tags(qw(h1 h2));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<eof;
+
+Next example
+Next example
+
+EOT
+is(join('|', @tags), 'h1', 'ignore_tags before report_tags');
+#------------------------------------------------------
+
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_elements("script");
+my $res="";
+$p->handler(default=> sub {$res.=$_[0];}, 'text');
+$p->parse(<<'EOT')->eof;
+A C D F
+EOT
+is($res,"A C D F\n","ignore without "
+ ignore this
+
+
+
+
+
+
+
+Dette er vanlig tekst. Denne teksten definerer også slutten på
+<head> delen av dokumentet.
+
+"
+ ignore this too
+
+
+
+
+
+Dette er også vanlig tekst som ikke skal blir parset i det hele tatt.
+
+EOT
+
+$| = 1;
+
+#$HTML::HeadParser::DEBUG = 1;
+require HTML::HeadParser;
+my $p = HTML::HeadParser->new( H->new );
+
+if ($p->parse($HTML)) {
+ fail("Need more data which should not happen");
+} else {
+ #diag $p->as_string;
+ pass();
+}
+
+like($p->header('Title'), qr/Å være eller å ikke være/);
+is($p->header('Expires'), 'Soon');
+is($p->header('Content-Base'), 'http://www.sn.no');
+like($p->header('Link'), qr//);
+
+# This header should not be present because the head ended
+ok(!$p->header('Isindex'));
+
+
+# Try feeding one char at a time
+my $expected = $p->as_string;
+my $nl = 1;
+$p = HTML::HeadParser->new(H->new);
+while ($HTML =~ /(.)/sg) {
+ #print STDERR '#' if $nl;
+ #print STDERR $1;
+ $nl = $1 eq "\n";
+ $p->parse($1) or last;
+}
+is($p->as_string, $expected);
+
+
+# Try reading it from a file
+my $file = "hptest$$.html";
+die "$file already exists" if -e $file;
+
+open(FILE, ">$file") or die "Can't create $file: $!";
+binmode(FILE);
+print FILE $HTML;
+print FILE "This is more content...\n" x 2000;
+print FILE "Buuuh!\n" x 200;
+close FILE or die "Can't close $file: $!";
+
+$p = HTML::HeadParser->new(H->new);
+$p->parse_file($file);
+unlink($file) or warn "Can't unlink $file: $!";
+
+is($p->header("Title"), "Å være eller å ikke være");
+
+
+# We got into an infinite loop on data without tags and no EOL.
+# This was actually a HTML::Parser bug.
+open(FILE, ">$file") or die "Can't create $file: $!";
+print FILE "Foo";
+close(FILE);
+
+$p = HTML::HeadParser->new(H->new);
+$p->parse_file($file);
+unlink($file) or warn "Can't unlink $file: $!";
+
+ok(!$p->as_string);
+
+SKIP: {
+ skip "Need Unicode support", 2 if $] < 5.008;
+
+ # Test that the Unicode BOM does not confuse us?
+ $p = HTML::HeadParser->new(H->new);
+ ok($p->parse("\x{FEFF}\nHi <foo>"));
+ $p->eof;
+
+ is($p->header("title"), "Hi ");
+}
diff --git a/ext/HTML/Parser/t/ignore.t b/ext/HTML/Parser/t/ignore.t
new file mode 100644
index 0000000..008739e
--- /dev/null
+++ b/ext/HTML/Parser/t/ignore.t
@@ -0,0 +1,27 @@
+
+use Test::More tests => 4;
+
+use strict;
+use HTML::Parser ();
+
+my $html = 'text';
+
+my $text = '';
+my $p = HTML::Parser->new(default_h => [sub {$text .= shift;}, 'text']);
+$p->parse($html)->eof;
+is($text, $html);
+
+$text = '';
+$p->handler(start => "");
+$p->parse($html)->eof;
+is($text, 'text');
+
+$text = '';
+$p->handler(end => 0);
+$p->parse($html)->eof;
+is($text, 'text');
+
+$text = '';
+$p->handler(start => undef);
+$p->parse($html)->eof;
+is($text, 'text');
diff --git a/ext/HTML/Parser/t/largetags.t b/ext/HTML/Parser/t/largetags.t
new file mode 100644
index 0000000..a9ed3ff
--- /dev/null
+++ b/ext/HTML/Parser/t/largetags.t
@@ -0,0 +1,38 @@
+# Exercise the tokenpos buffer allocation routines by feeding it
+# very large tags.
+
+use Test::More tests => 2;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->handler("start" =>
+ sub {
+ my $tp = shift;
+ #diag int(@$tp), " - ", join(", ", @$tp);
+ is(@$tp, 2 + 26 * 6 * 4);
+ }, "tokenpos");
+
+$p->handler("declaration" =>
+ sub {
+ my $t = shift;
+ #diag int(@$t), " - @$t";
+ is(@$t, 26 * 6 * 2 + 1);
+ }, "tokens");
+
+$p->parse("parse("$_=1 ");
+}
+$p->parse(">");
+
+$p->parse("parse("$_ -- $_ -- ");
+}
+$p->parse(">");
+$p->eof;
+exit;
+
diff --git a/ext/HTML/Parser/t/linkextor-base.t b/ext/HTML/Parser/t/linkextor-base.t
new file mode 100644
index 0000000..7ef8f02
--- /dev/null
+++ b/ext/HTML/Parser/t/linkextor-base.t
@@ -0,0 +1,41 @@
+# This tests that HTML::LinkExtor really absolutizes links correctly
+# when a base URL is given to the constructor.
+
+use Test::More tests => 5;
+require HTML::LinkExtor;
+
+SKIP: {
+eval {
+ require URI;
+};
+skip $@, 5 if $@;
+
+# Try with base URL and the $p->links interface.
+$p = HTML::LinkExtor->new(undef, "http://www.sn.no/foo/foo.html");
+$p->parse(<eof;
+
+
+
+
+
+This is link and an .
+HTML
+
+@p = $p->links;
+
+# There should be 4 links in the document
+is(@p, 4);
+
+for (@p) {
+ ($t, %attr) = @$_ if $_->[0] eq 'img';
+}
+
+is($t, 'img');
+
+is(delete $attr{src}, "http://www.sn.no/foo/img.jpg");
+
+is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif");
+
+ok(!scalar(keys %attr)); # there should be no more attributes
+}
diff --git a/ext/HTML/Parser/t/linkextor-rel.t b/ext/HTML/Parser/t/linkextor-rel.t
new file mode 100644
index 0000000..1190a96
--- /dev/null
+++ b/ext/HTML/Parser/t/linkextor-rel.t
@@ -0,0 +1,36 @@
+use Test::More tests => 4;
+
+require HTML::LinkExtor;
+
+$HTML = <
+
+
+
+
+This is link and an .
+HTML
+
+
+# Try the callback interface
+$links = "";
+$p = HTML::LinkExtor->new(
+ sub {
+ my($tag, %links) = @_;
+ #diag "$tag @{[%links]}";
+ $links .= "$tag @{[%links]}\n";
+ });
+
+$p->parse($HTML); $p->eof;
+
+ok($links =~ m|^base href http://www\.sn\.no/$|m);
+ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m);
+ok($links =~ m|^a href link\.html$|m);
+
+# There used to be problems when using the links method on a document with
+# no links in it. This is a test to prove that it works.
+$p = new HTML::LinkExtor;
+$p->parse("this is a document with no links"); $p->eof;
+@a = $p->links;
+is(@a, 0);
diff --git a/ext/HTML/Parser/t/magic.t b/ext/HTML/Parser/t/magic.t
new file mode 100644
index 0000000..366f275
--- /dev/null
+++ b/ext/HTML/Parser/t/magic.t
@@ -0,0 +1,41 @@
+# Check that the magic signature at the top of struct p_state works and that we
+# catch modifications to _hparser_xs_state gracefully
+
+use Test::More tests => 5;
+
+use HTML::Parser;
+
+$p = HTML::Parser->new(api_version => 3);
+
+$p->xml_mode(1);
+
+# We should not be able to simply modify this stuff
+eval {
+ ${$p->{_hparser_xs_state}} += 4;
+};
+like($@, qr/^Modification of a read-only value attempted/);
+
+
+my $x = delete $p->{_hparser_xs_state};
+
+eval {
+ $p->xml_mode(1);
+};
+like($@, qr/^Can't find '_hparser_xs_state'/);
+
+$p->{_hparser_xs_state} = \($$x + 16);
+
+eval {
+ $p->xml_mode(1);
+};
+like($@, $] >= 5.008 ? qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/);
+
+$p->{_hparser_xs_state} = 33;
+eval {
+ $p->xml_mode(1);
+};
+like($@, qr/^_hparser_xs_state element is not a reference/);
+
+$p->{_hparser_xs_state} = $x;
+
+ok($p->xml_mode(0));
diff --git a/ext/HTML/Parser/t/marked-sect.t b/ext/HTML/Parser/t/marked-sect.t
new file mode 100644
index 0000000..6a63478
--- /dev/null
+++ b/ext/HTML/Parser/t/marked-sect.t
@@ -0,0 +1,121 @@
+#!/usr/bin/perl -w
+
+use strict;
+my $tag;
+my $text;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(start_h => [sub { $tag = shift }, "tagname"],
+ text_h => [sub { $text .= shift }, "dtext"],
+ );
+
+
+use Test::More tests => 14;
+
+SKIP: {
+eval {
+ $p->marked_sections(1);
+};
+skip $@, 14 if $@;
+
+$p->parse("");
+is($text, "foo");
+
+$p->parse("");
+is($text, "foobar");
+
+$p->parse("]]>\n
");
+is($text, "foobarfoo\n");
+
+$text = "";
+$p->parse("parse(",bar>]]>
");
+is($text, "<foo]]>");
+
+$text = "";
+$p->parse("]]>]]>å
");
+is($text, "ååå");
+is($tag, "br");
+
+$text = "";
+$p->parse("]]>
");
+is($text, "");
+
+$text = "";
+$p->parse("]]>
");
+is($text, "fooå");
+
+$text = "";
+$p->parse("]]>
");
+is($text, "fooå");
+
+$text = "";
+$p->parse("]]>
");
+is($text, "fooå");
+
+$text = "";
+$p->parse("]]>
");
+is($text, "fooå");
+
+# offsets/line/column numbers
+$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"],
+ marked_sections => 1,
+ );
+$p->parse(<<'EOT')->eof;
+Test
+
+]]>
+
+ Test
+EOT
+
+my @x;
+sub x {
+ my($line, $col, $offset, $event, $text) = @_;
+ $text =~ s/\n/\\n/g;
+ $text =~ s/ /./g;
+ push(@x, "$line.$col:$offset $event \"$text\"\n");
+}
+
+#diag @x;
+is(join("", @x), <<'EOT');
+1.0:0 start_document ""
+1.0:0 start ""
+1.7:7 text "Test"
+1.11:11 end ""
+1.19:19 text "\n"
+3.3:32 text "fooå\n"
+4.3:49 text "\n"
+5.4:54 text "\nINCLUDE\nSTUFF\n"
+8.3:72 text "\n.."
+9.2:75 start ""
+9.6:79 text "Test"
+9.10:83 end ""
+9.15:88 text "\n"
+10.0:89 end_document ""
+EOT
+
+my $doc = "";
+my $result = "";
+$p = HTML::Parser->new(
+ marked_sections => 1,
+ handlers => {
+ default => [ sub { $result .= join("",@_); }, "skipped_text,text" ]
+ }
+)->parse($doc)->eof;
+is($doc, $result);
+
+$text = "";
+$p = HTML::Parser->new(
+ text_h => [sub { $text .= shift }, "dtext"],
+ marked_sections => 1,
+);
+
+$p->parse("");
+is($text, "foo [1]", "CDATA text ending in square bracket");
+
+} # SKIP
diff --git a/ext/HTML/Parser/t/msie-compat.t b/ext/HTML/Parser/t/msie-compat.t
new file mode 100644
index 0000000..90d4b7e
--- /dev/null
+++ b/ext/HTML/Parser/t/msie-compat.t
@@ -0,0 +1,58 @@
+#!perl -w
+
+use strict;
+use HTML::Parser;
+
+use Test::More tests => 2;
+
+my $TEXT = "";
+sub h
+{
+ my($event, $tagname, $text) = @_;
+ for ($event, $tagname, $text) {
+ if (defined) {
+ s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge;
+ }
+ else {
+ $_ = "";
+ }
+ }
+
+ $TEXT .= "[$event,$tagname,$text]\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"]);
+$p->parse("");
+$p->parse("");
+$p->parse("' 'bar>' x>");
+$p->parse("\"");
+$p->parse(" \"bar>\" x>");
+$p->parse("");
+$p->parse("\" >");
+$p->parse("
+xmp
+
+EOT
+
+my $p = HTML::Parser->new(api_version => 3);
+
+my $sum_len = 0;
+my $count = 0;
+my $err;
+
+$p->handler(default =>
+ sub {
+ my($offset, $length, $offset_end, $line, $col, $text) = @_;
+ my $copy = $text;
+ $copy =~ s/\n/\\n/g;
+ substr($copy, 30) = "..." if length($copy) > 32;
+ #diag sprintf ">>> %d.%d %s", $line, $col, $copy;
+ if ($offset != $sum_len) {
+ diag "offset mismatch $offset vs $sum_len";
+ $err++;
+ }
+ if ($offset_end != $offset + $length) {
+ diag "offset_end $offset_end wrong";
+ $err++;
+ }
+ if ($length != length($text)) {
+ diag "length mismatch";
+ $err++;
+ }
+ if (substr($HTML, $offset, $length) ne $text) {
+ diag "content mismatch";
+ $err++;
+ }
+ $sum_len += $length;
+ $count++;
+ },
+ 'offset,length,offset_end,line,column,text');
+
+for (split(//, $HTML)) {
+ $p->parse($_);
+}
+$p->eof;
+
+ok($count > 5 && !$err);
+
+
diff --git a/ext/HTML/Parser/t/options.t b/ext/HTML/Parser/t/options.t
new file mode 100644
index 0000000..ff5f7db
--- /dev/null
+++ b/ext/HTML/Parser/t/options.t
@@ -0,0 +1,36 @@
+# Test option setting methods
+
+use Test::More tests => 10;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3,
+ xml_mode => 1);
+my $old;
+
+$old = $p->boolean_attribute_value("foo");
+ok(!defined $old);
+
+$old = $p->boolean_attribute_value();
+is($old, "foo");
+
+$old = $p->boolean_attribute_value(undef);
+is($old, "foo");
+ok(!defined($p->boolean_attribute_value));
+
+ok($p->xml_mode(0));
+ok(!$p->xml_mode);
+
+my $seen_buggy_comment_warning;
+$SIG{__WARN__} =
+ sub {
+ local $_ = shift;
+ $seen_buggy_comment_warning++
+ if /^netscape_buggy_comment is deprecated/;
+ };
+
+ok(!$p->strict_comment(1));
+ok($p->strict_comment);
+ok(!$p->netscape_buggy_comment);
+ok($seen_buggy_comment_warning);
diff --git a/ext/HTML/Parser/t/parsefile.t b/ext/HTML/Parser/t/parsefile.t
new file mode 100644
index 0000000..f373f06
--- /dev/null
+++ b/ext/HTML/Parser/t/parsefile.t
@@ -0,0 +1,45 @@
+use Test::More tests => 6;
+
+my $filename = "file$$.htm";
+die "$filename is already there" if -e $filename;
+open(FILE, ">$filename") || die "Can't create $filename: $!";
+print FILE <<'EOT'; close(FILE);
+Heisan
+EOT
+
+{
+ package MyParser;
+ require HTML::Parser;
+ @ISA=qw(HTML::Parser);
+
+ sub start
+ {
+ my($self, $tag, $attr) = @_;
+ Test::More::is($tag, "title");
+ }
+}
+
+MyParser->new->parse_file($filename);
+open(FILE, $filename) || die;
+MyParser->new->parse_file(*FILE);
+seek(FILE, 0, 0) || die;
+MyParser->new->parse_file(\*FILE);
+close(FILE);
+
+require IO::File;
+my $io = IO::File->new($filename) || die;
+MyParser->new->parse_file($io);
+$io->seek(0, 0) || die;
+MyParser->new->parse_file(*$io);
+
+my $text = '';
+$io->seek(0, 0) || die;
+MyParser->new(
+ start_h => [ sub{ shift->eof; }, "self" ],
+ text_h => [ sub{ $text = shift; }, "text" ])->parse_file(*$io);
+ok(!$text);
+
+close($io); # needed because of bug in perl
+undef($io);
+
+unlink($filename) or warn "Can't unlink $filename: $!";
diff --git a/ext/HTML/Parser/t/parser.t b/ext/HTML/Parser/t/parser.t
new file mode 100644
index 0000000..0ce4d95
--- /dev/null
+++ b/ext/HTML/Parser/t/parser.t
@@ -0,0 +1,184 @@
+use Test::More tests => 7;
+
+$HTML = <<'HTML';
+
+
+
+
+
+Various entities. The parser must never break them in the middle:
+
+/
+/
+È
+௖
+
+å-Å
+
+
+This is a link
+
This is another one
+
+
+

+
+ and this is not.
+
+ that Netscape hates -->
+
+< this > was not a tag.
+
+
+
+HTML
+
+#-------------------------------------------------------------------
+
+{
+ package P;
+ require HTML::Parser;
+ @ISA=qw(HTML::Parser);
+ $OUT='';
+ $COUNT=0;
+
+ sub new
+ {
+ my $class = shift;
+ my $self = $class->SUPER::new;
+ $OUT = '';
+ die "Can only have one" if $COUNT++;
+ $self;
+ }
+
+ sub DESTROY
+ {
+ my $self = shift;
+ eval { $self->SUPER::DESTROY; };
+ $COUNT--;
+ }
+
+ sub declaration
+ {
+ my($self, $decl) = @_;
+ $OUT .= "[[$decl]]|";
+ }
+
+ sub start
+ {
+ my($self, $tag, $attr) = @_;
+ $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr);
+ $attr = "/$attr" if length $attr;
+ $OUT .= "<<$tag$attr>>|";
+ }
+
+ sub end
+ {
+ my($self, $tag) = @_;
+ $OUT .= ">>$tag<<|";
+ }
+
+ sub comment
+ {
+ my($self, $comment) = @_;
+ $OUT .= "##$comment##|";
+ }
+
+ sub text
+ {
+ my($self, $text) = @_;
+ #$text =~ s/\n/\\n/g;
+ #$text =~ s/\t/\\t/g;
+ #$text =~ s/ /·/g;
+ $OUT .= "$text|";
+ }
+
+ sub result
+ {
+ $OUT;
+ }
+}
+
+for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") {
+#for $chunksize (1) {
+ if ($chunksize =~ /^file/) {
+ #print "Parsing from $chunksize";
+ } else {
+ #print "Parsing using $chunksize byte chunks";
+ }
+ my $p = P->new;
+
+ if ($chunksize =~ /^file/) {
+ # First we must create the file
+ my $tmpfile = "tmp-$$.html";
+ my $file = $tmpfile;
+ die "$file already exists" if -e $file;
+ open(FILE, ">$file") or die "Can't create $file: $!";
+ binmode FILE;
+ print FILE $HTML;
+ close(FILE);
+
+ if ($chunksize eq "filehandle") {
+ require FileHandle;
+ my $fh = FileHandle->new($file) || die "Can't open $file: $!";
+ $file = $fh;
+ }
+
+ # then we can parse it.
+ $p->parse_file($file);
+ close $file if $chunksize eq "filehandle";
+ unlink($tmpfile) || warn "Can't unlink $tmpfile: $!";
+ } else {
+ my $copy = $HTML;
+ while (length $copy) {
+ my $chunk = substr($copy, 0, $chunksize);
+ substr($copy, 0, $chunksize) = '';
+ $p->parse($chunk);
+ }
+ $p->eof;
+ }
+
+ my $res = $p->result;
+ my $bad;
+
+ # Then we start looking for things that should not happen
+ if ($res =~ /\s\|\s/) {
+ diag "broken space";
+ $bad++;
+ }
+ for (
+ # Make sure entities are not broken
+ '/', '/', 'È', '௖', '', 'å', 'Å',
+
+ # Some elements that should be produced
+ "|[[DOCTYPE HTML]]|",
+ "|## this is\na comment ##|",
+ "|<>|\n|<>|</id=33>>|",
+ '|<
>|< å/id=34>>',
+ "|>>ul<<|", "|>>body<<|\n\n|",
+ )
+ {
+ if (index($res, $_) < 0) {
+ diag "Can't find '$_' in parsed document";
+ $bad++;
+ }
+ }
+
+ diag $res if $bad || $ENV{PRINT_RESULTS};
+
+ # And we check that we get the same result all the time
+ $res =~ s/\|//g; # remove all break marks
+ if ($last_res && $res ne $last_res) {
+ diag "The result is not the same as last time";
+ $bad++;
+ }
+ $last_res = $res;
+
+ unless ($res =~ /Various entities/) {
+ diag "Some text must be missing";
+ $bad++;
+ }
+
+ ok(!$bad);
+}
diff --git a/ext/HTML/Parser/t/plaintext.t b/ext/HTML/Parser/t/plaintext.t
new file mode 100644
index 0000000..b2e1e19
--- /dev/null
+++ b/ext/HTML/Parser/t/plaintext.t
@@ -0,0 +1,45 @@
+use Test::More tests => 2;
+
+use strict;
+use HTML::Parser;
+
+my @a;
+my $p = HTML::Parser->new(api_version => 3);
+$p->handler(default => \@a, '@{event, text, is_cdata}');
+$p->parse(<eof;
+<foo>x<foo>
+</plaintext>
+foo
+EOT
+
+for (@a) {
+ $_ = "" unless defined;
+}
+
+my $doc = join(":", @a);
+
+#diag $doc;
+
+is($doc, "start_document:::start:<xmp>::text:<foo>:1:end:</xmp>::text:x::start:<plaintext>::text:<foo>
+</plaintext>
+foo
+:1:end_document::");
+
+@a = ();
+$p->closing_plaintext('yep, emulate gecko');
+$p->parse(<<EOT)->eof;
+<plaintext><foo>
+</plaintext>foo<b></b>
+EOT
+
+for (@a) {
+ $_ = "" unless defined;
+}
+
+$doc = join(":", @a);
+
+#diag $doc;
+
+is($doc, "start_document:::start:<plaintext>::text:<foo>
+:1:end:</plaintext>::text:foo::start:<b>::end:</b>::text:
+::end_document::");
diff --git a/ext/HTML/Parser/t/pod.t b/ext/HTML/Parser/t/pod.t
new file mode 100644
index 0000000..437887a
--- /dev/null
+++ b/ext/HTML/Parser/t/pod.t
@@ -0,0 +1,4 @@
+use Test::More;
+eval "use Test::Pod 1.00";
+plan skip_all => "Test::Pod 1.00 required for testing POD" if $@;
+all_pod_files_ok();
diff --git a/ext/HTML/Parser/t/process.t b/ext/HTML/Parser/t/process.t
new file mode 100644
index 0000000..9d27250
--- /dev/null
+++ b/ext/HTML/Parser/t/process.t
@@ -0,0 +1,43 @@
+use strict;
+
+use Test::More tests => 12;
+
+my $pi;
+my $orig;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(process_h => [sub { $pi = shift; $orig = shift; },
+ "token0,text"]
+ );
+
+$p->parse("<a><?foo><a>");
+
+is($pi, "foo");
+is($orig, "<?foo>");
+
+$p->parse("<a><?><a>");
+is($pi, "");
+is($orig, "<?>");
+
+$p->parse("<a><?
+foo
+><a>");
+is($pi, "\nfoo\n");
+is($orig, "<?\nfoo\n>");
+
+for (qw(< a > < ? b a r > < a >)) {
+ $p->parse($_);
+}
+
+is($pi, "bar");
+is($orig, "<?bar>");
+
+$p->xml_mode(1);
+
+$p->parse("<a><?foo>bar??><a>");
+is($pi, "foo>bar?");
+is($orig, "<?foo>bar??>");
+
+$p->parse("<a><??></a>");
+is($pi, "");
+is($orig, "<??>");
diff --git a/ext/HTML/Parser/t/pullparser.t b/ext/HTML/Parser/t/pullparser.t
new file mode 100644
index 0000000..80a186b
--- /dev/null
+++ b/ext/HTML/Parser/t/pullparser.t
@@ -0,0 +1,55 @@
+use Test::More tests => 3;
+
+use HTML::PullParser;
+
+my $doc = <<'EOT';
+<title>Title</title>
+<style> h1 { background: white }
+<foo>
+</style>
+<H1 ID="3">Heading</H1>
+
+
+This is a text with a <A HREF="http://www.sol.no" name="l1">link</a>.
+EOT
+
+my $p = HTML::PullParser->new(doc => $doc,
+ start => 'event,tagname,@attr',
+ end => 'event,tagname',
+ text => 'event,dtext',
+
+ ignore_elements => [qw(script style)],
+ unbroken_text => 1,
+ boolean_attribute_value => 1,
+ );
+
+my $t = $p->get_token;
+is($t->[0], "start");
+is($t->[1], "title");
+$p->unget_token($t);
+
+my @a;
+while (my $t = $p->get_token) {
+ for (@$t) {
+ s/\s/./g;
+ }
+ push(@a, join("|", @$t));
+}
+
+my $res = join("\n", @a, "");
+#diag $res;
+is($res, <<'EOT');
+start|title
+text|Title
+end|title
+text|..
+start|h1|id|3
+text|Heading
+end|h1
+text|...This.is.a.text.with.a.
+start|a|href|http://www.sol.no|name|l1
+text|link
+end|a
+text|..
+EOT
+
diff --git a/ext/HTML/Parser/t/script.t b/ext/HTML/Parser/t/script.t
new file mode 100644
index 0000000..2a75ccb
--- /dev/null
+++ b/ext/HTML/Parser/t/script.t
@@ -0,0 +1,41 @@
+#!perl -w
+
+use strict;
+use Test;
+plan tests => 1;
+
+use HTML::Parser;
+
+my $TEXT = "";
+sub h
+{
+    # Append one "[event,tagname,text]" line to $TEXT.  Undefined fields
+    # are shown as "<undef>"; \n, \r and \t as backslash + 3-digit octal.
+    my @field = @_[0 .. 2];
+    for my $f (@field) {
+        unless (defined $f) {
+            $f = "<undef>";
+            next;
+        }
+        $f =~ s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge;
+    }
+    $TEXT .= "[" . join(",", @field) . "]\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"], empty_element_tags => 1);
+$p->parse(q(<tr><td align="center" height="100"><script src="whatever"/><SCRIPT language="JavaScript1.1">bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');</SCRIPT></td></tr>));
+$p->eof;
+
+ok($TEXT, <<'EOT');
+[start_document,<undef>,]
+[start,tr,<tr>]
+[start,td,<td align="center" height="100">]
+[start,script,<script src="whatever"/>]
+[end,script,]
+[start,script,<SCRIPT language="JavaScript1.1">]
+[text,<undef>,bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');]
+[end,script,</SCRIPT>]
+[end,td,</td>]
+[end,tr,</tr>]
+[end_document,<undef>,]
+EOT
diff --git a/ext/HTML/Parser/t/skipped-text.t b/ext/HTML/Parser/t/skipped-text.t
new file mode 100644
index 0000000..8bd2704
--- /dev/null
+++ b/ext/HTML/Parser/t/skipped-text.t
@@ -0,0 +1,74 @@
+use Test::More tests => 3;
+
+use strict;
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->report_tags("a");
+
+my @doc;
+
+$p->handler(start => \&a_handler, "skipped_text, text");
+$p->handler(end_document => \@doc, '@{skipped_text}');
+
+$p->parse(<<EOT)->eof;
+<title>hi</title>
+<h1><a href="foo">link</a></h1>
+and <a foo="">some</a> text.
+EOT
+
+sub a_handler {
+    # Record the skipped text unchanged, then the tag text upper-cased.
+    my ($skipped, $text) = @_;
+    push(@doc, $skipped, uc($text));
+}
+
+
+is(join("", @doc), <<'EOT');
+<title>hi</title>
+<h1><A HREF="FOO">link</a></h1>
+and <A FOO="">some</a> text.
+EOT
+
+#
+# Comment stripper. Interaction with "" handlers.
+#
+my $doc = <<EOT;
+<html>text</html>
+<!-- comment -->
+and some more <b>text</b>.
+EOT
+(my $expected = $doc) =~ s/<!--.*?-->//;   # the document with its comment removed
+
+$p = HTML::Parser->new(api_version => 3);
+$p->handler(comment => "");
+$p->handler(end_document => sub {
+ my $stripped = shift;
+ #diag $stripped;
+ is($stripped, $expected);
+ }, "skipped_text");
+for (split(//, $doc)) {
+ $p->parse($_);
+}
+$p->eof;
+
+#
+# Interaction with unbroken text
+#
+my @x;
+$p = HTML::Parser->new(api_version => 3, unbroken_text => 1);
+$p->handler(text => \@x, '@{"X", skipped_text, text}');
+$p->handler(end => "");
+$p->handler(end_document => \@x, '@{"Y", skipped_text}');
+
+$doc = "a a<a>b b</a>c c<x>d d</x>e";
+
+for (split(//, $doc)) {
+ $p->parse($_);
+}
+$p->eof;
+
+#diag join(":", @x);
+is(join(":", @x), "X::a a:X:<a>:b bc c:X:<x>:d de:Y:");
+
diff --git a/ext/HTML/Parser/t/stack-realloc.t b/ext/HTML/Parser/t/stack-realloc.t
new file mode 100644
index 0000000..46c7d35
--- /dev/null
+++ b/ext/HTML/Parser/t/stack-realloc.t
@@ -0,0 +1,17 @@
+#!perl -w
+
+# HTML-Parser 3.33 and older used to core dump on this program because
+# of missing SPAGAIN calls in parse() XS code. It was not prepared for
+# the stack to get realloced.
+
+$| = 1;
+
+use Test::More tests => 1;
+
+use HTML::Parser;
+my $x = HTML::Parser->new(api_version => 3);
+my @row;
+$x->handler(end => sub { push(@row, (1) x 505); 1 }, "tagname");
+$x->parse("</TD>");
+
+pass;
diff --git a/ext/HTML/Parser/t/textarea.t b/ext/HTML/Parser/t/textarea.t
new file mode 100644
index 0000000..120f79b
--- /dev/null
+++ b/ext/HTML/Parser/t/textarea.t
@@ -0,0 +1,70 @@
+use Test::More tests => 1;
+
+use strict;
+use HTML::Parser;
+
+my $html = <<'EOT';
+<html>
+<title>This is a <nice> title</title>
+<!--comment-->
+<script language="perl">while (<DATA>) { &amp; }</script>
+
+<FORM>
+
+<textarea name="foo" cols=50 rows=10>
+
+foo
+<foo>
+<!--comment-->
+&amp;
+foo
+</FORM>
+
+</textarea>
+
+</FORM>
+
+</html>
+EOT
+# tdump: add one "|"-joined line per event to $dump (undef => "<undef>", newline => \n)
+my $dump = "";
+sub tdump {
+    my @a = @_;
+    for (@a) {
+        $_ = "<undef>" unless defined;
+        s/\n/\\n/g;
+    }
+    $dump .= join("|", @a) . "\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&tdump, "event,text,dtext,is_cdata"]);
+$p->parse($html)->eof;
+
+#diag $dump;
+
+is($dump, <<'EOT');
+start_document||<undef>|<undef>
+start|<html>|<undef>|<undef>
+text|\n|\n|
+start|<title>|<undef>|<undef>
+text|This is a <nice> title|This is a <nice> title|
+end|</title>|<undef>|<undef>
+text|\n|\n|
+comment|<!--comment-->|<undef>|<undef>
+text|\n|\n|
+start|<script language="perl">|<undef>|<undef>
+text|while (<DATA>) { &amp; }|while (<DATA>) { &amp; }|1
+end|</script>|<undef>|<undef>
+text|\n\n|\n\n|
+start|<FORM>|<undef>|<undef>
+text|\n\n|\n\n|
+start|<textarea name="foo" cols=50 rows=10>|<undef>|<undef>
+text|\n\nfoo\n<foo>\n<!--comment-->\n&amp;\nfoo\n</FORM>\n\n|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n|
+end|</textarea>|<undef>|<undef>
+text|\n\n|\n\n|
+end|</FORM>|<undef>|<undef>
+text|\n\n|\n\n|
+end|</html>|<undef>|<undef>
+text|\n|\n|
+end_document||<undef>|<undef>
+EOT
diff --git a/ext/HTML/Parser/t/threads.t b/ext/HTML/Parser/t/threads.t
new file mode 100644
index 0000000..8da91e9
--- /dev/null
+++ b/ext/HTML/Parser/t/threads.t
@@ -0,0 +1,39 @@
+# Verify thread safety.
+
+use Config;
+use Test::More;
+
+BEGIN {
+ plan(skip_all => "Not configured for threads")
+ unless $Config{useithreads} && $] >= 5.008;
+ plan(tests => 1);
+}
+
+use threads;
+use HTML::Parser;
+
+my $ok=0;
+
+sub start
+{
+    # Add one to $ok for a "foo" tag and one more when its attribute
+    # hash has param set to "bar".
+    my($tag,$attr)=@_;
+    my $param = $attr->{param};
+    $ok += ($tag eq "foo") + (defined($param) && $param eq "bar");
+}
+
+my $p = HTML::Parser->new
+ (api_version => 3,
+ handlers => {
+ start => [\&start, "tagname,attr"],
+ });
+
+$p->parse("<foo pa");
+
+$ok=async {
+ $p->parse("ram=bar>");
+ $ok;
+}->join();
+
+is($ok,2);
+
diff --git a/ext/HTML/Parser/t/tokeparser.t b/ext/HTML/Parser/t/tokeparser.t
new file mode 100644
index 0000000..2084201
--- /dev/null
+++ b/ext/HTML/Parser/t/tokeparser.t
@@ -0,0 +1,164 @@
+use Test::More tests => 17;
+
+use strict;
+use HTML::TokeParser;
+
+# First we create an HTML document to test
+
+my $file = "ttest$$.htm";
+die "$file already exists" if -e $file;
+
+open(F, ">$file") or die "Can't create $file: $!";
+print F <<'EOT'; close(F);
+
+
+<html><head><title>
+ This is the <title>
+</title>
+
+ <base href="http://www.perl.com">
+</head>
+
+<body background="bg.gif">
+
+ <h1>This is the <b>title</b> again
+ </h1>
+
+ And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl"> Institute</a>
+
+ <br/><? process instruction >
+
+</body>
+</html>
+
+EOT
+
+END { unlink($file) || warn "Can't unlink $file: $!"; }
+
+
+my $p;
+
+
+$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
+ok($p->unbroken_text);
+if ($p->get_tag("foo", "title")) {
+ my $title = $p->get_trimmed_text;
+ #diag "Title: $title";
+ is($title, "This is the <title>");
+}
+undef($p);
+
+# Test with reference to glob
+open(F, $file) || die "Can't open $file: $!";
+$p = HTML::TokeParser->new(\*F);
+my $scount = 0;
+my $ecount = 0;
+my $tcount = 0;
+my $pcount = 0;
+while (my $token = $p->get_token) {
+ $scount++ if $token->[0] eq "S";
+ $ecount++ if $token->[0] eq "E";
+ $pcount++ if $token->[0] eq "PI";
+}
+undef($p);
+close F;
+
+# Test with glob
+open(F, $file) || die "Can't open $file: $!";
+$p = HTML::TokeParser->new(*F);
+$tcount++ while $p->get_tag;
+undef($p);
+close F;
+
+# Test with plain file name
+$p = HTML::TokeParser->new($file) || die;
+$tcount++ while $p->get_tag;
+undef($p);
+
+#diag "Number of tokens found: $tcount/2 = $scount + $ecount";
+is($tcount, 34);
+is($scount, 10);
+is($ecount, 7);
+is($pcount, 1);
+is($tcount/2, $scount + $ecount);
+
+ok(!HTML::TokeParser->new("/noT/thEre/$$"));
+
+
+$p = HTML::TokeParser->new($file) || die;
+$p->get_tag("a");
+my $atext = $p->get_text;
+undef($p);
+
+is($atext, "Perl\240Institute");
+
+# test parsing of embeded document
+$p = HTML::TokeParser->new(\<<HTML);
+<title>Title</title>
+<H1>
+Heading
+</h1>
+HTML
+
+ok($p->get_tag("h1"));
+is($p->get_trimmed_text, "Heading");
+undef($p);
+
+# test parsing of large embedded documents
+my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022;
+
+#use Time::HiRes qw(time);
+my $start = time;
+$p = HTML::TokeParser->new(\$doc);
+#diag "Construction time: ", time - $start;
+
+my $count;
+while (my $t = $p->get_token) {
+ $count++ if $t->[0] eq "S";
+}
+#diag "Parse time: ", time - $start;
+
+is($count, 2022);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+<H1>This is a heading</H1>
+This is s<b>o</b>me<hr>text.
+<br />
+This is some more text.
+<p>
+This is even some more.
+EOT
+
+$p->get_tag("/h1");
+
+my $t = $p->get_trimmed_text("br", "p");
+is($t, "This is some text.");
+
+$p->get_tag;
+
+$t = $p->get_trimmed_text("br", "p");
+is($t,"This is some more text.");
+
+undef($p);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+<H1>This is a <b>bold</b> heading</H1>
+This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>.
+<p>
+This is even some more.
+EOT
+
+$p->get_tag("h1");
+
+$t = $p->get_phrase;
+is($t, "This is a bold heading");
+
+$t = $p->get_phrase;
+is($t, "");
+
+$p->get_tag;
+
+$t = $p->get_phrase;
+is($t, "This is some italic text. This is some more text.");
+
+undef($p);
diff --git a/ext/HTML/Parser/t/uentities.t b/ext/HTML/Parser/t/uentities.t
new file mode 100644
index 0000000..b9decc5
--- /dev/null
+++ b/ext/HTML/Parser/t/uentities.t
@@ -0,0 +1,67 @@
+# Test Unicode entities
+
+use HTML::Entities;
+
+use Test::More tests => 27;
+
+SKIP: {
+skip "This perl does not support Unicode or Unicode entities not selected",
+ 27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT;
+
+is(decode_entities("&euro"), "&euro");           # no ';': left undecoded
+is(decode_entities("&euro;"), "\x{20AC}");
+
+is(decode_entities("&aring"), "\xE5");           # \xE5 is a-ring, spelled encoding-independently
+is(decode_entities("&aring;"), "\xE5");
+
+is(decode_entities("&#500000"), chr(500000));
+
+is(decode_entities("&#x10FFFD"), "\x{10FFFD}");
+
+is(decode_entities("&#xFFFC"), "\x{FFFC}");
+
+
+is(decode_entities("&#xFDD0"), "\x{FFFD}");
+is(decode_entities("&#xFDD1"), "\x{FFFD}");
+is(decode_entities("&#xFDE0"), "\x{FFFD}");
+is(decode_entities("&#xFDEF"), "\x{FFFD}");
+is(decode_entities("&#xFFFF"), "\x{FFFD}");
+is(decode_entities("&#x10FFFF"), "\x{FFFD}");
+is(decode_entities("&#x110000"), chr(0xFFFD));
+is(decode_entities("&#XFFFFFFFF"), chr(0xFFFD));
+
+is(decode_entities("&#0"), "\0");
+is(decode_entities("&#0;"), "\0");
+is(decode_entities("&#x0"), "\0");
+is(decode_entities("&#X0;"), "\0");
+
+is(decode_entities("&#&aring&#229&#229;&#xFFF"), "&#\xE5\xE5\xE5\x{FFF}");   # bare "&#" stays as is
+
+# This might fail when we get more than 64 bit UVs
+is(decode_entities("&#0009999999999999999999999999999;"), "&#0009999999999999999999999999999;");
+is(decode_entities("&#xFFFF0000FFFF0000FFFF1"), "&#xFFFF0000FFFF0000FFFF1");
+
+my $err;
+for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) {
+ my $a = join("", map chr, $_->[0] .. $_->[1]);
+
+ my $e = encode_entities($a);
+ my $d = decode_entities($e);
+
+ unless ($d eq $a) {
+ diag "Wrong decoding in range $_->[0] .. $_->[1]";
+ # use Devel::Peek; Dump($a); Dump($d);
+ $err++;
+ }
+}
+ok(!$err);
+
+
+is(decode_entities("&#56256;&#56453;"), chr(0x100085));
+
+is(decode_entities("&#56256;&#56453;"), chr(0x100085));
+
+is(decode_entities("&#56256"), chr(0xFFFD));
+
+is(decode_entities("\260&rsquo;\260"), "\x{b0}\x{2019}\x{b0}");   # wide entity between Latin-1 bytes
+}
diff --git a/ext/HTML/Parser/t/unbroken-text.t b/ext/HTML/Parser/t/unbroken-text.t
new file mode 100644
index 0000000..7de85a9
--- /dev/null
+++ b/ext/HTML/Parser/t/unbroken-text.t
@@ -0,0 +1,60 @@
+use strict;
+use HTML::Parser;
+
+use Test::More tests => 3;
+
+my $text = "";
+sub text
+{
+    # Log one text event as "[KIND:offset:line.column:text]" in $text.
+    my($is_cdata, $offset, $line, $col, $t) = @_;
+    $text .= "[" . ($is_cdata ? "CDATA" : "TEXT") . ":$offset:$line.$col:$t]";
+}
+
+sub tag
+{
+    $text = $text . $_[0];   # append the handler's first argument verbatim
+}
+
+my $p = HTML::Parser->new(unbroken_text => 1,
+ text_h => [\&text, "is_cdata,offset,line,column,text"],
+ start_h => [\&tag, "text"],
+ end_h => [\&tag, "text"],
+ );
+
+$p->parse("foo ");
+$p->parse("bar ");
+$p->parse("<foo>");
+$p->parse("bar\n");
+$p->parse("</foo>");
+$p->parse("<xmp>xmp</xmp>");
+$p->parse("atend");
+
+#diag $text;
+is($text, "[TEXT:0:1.0:foo bar ]<foo>[TEXT:13:1.13:bar\n]</foo><xmp>[CDATA:28:2.11:xmp]</xmp>");
+
+$text = "";
+$p->eof;
+
+#diag $text;
+is($text, "[TEXT:37:2.20:atend]");
+
+
+$p = HTML::Parser->new(unbroken_text => 1,
+ text_h => [\&text, "is_cdata,offset,line,column,text"],
+ );
+
+$text = "";
+$p->parse("foo");
+$p->parse("<foo");
+$p->parse(">bar\n");
+$p->parse("foo<xm");
+$p->parse("p>xmp");
+$p->parse("</xmp");
+$p->parse(">bar");
+$p->eof;
+
+#diag $text;
+is($text, "[TEXT:0:1.0:foobar\nfoo][CDATA:20:2.8:xmp][TEXT:29:2.17:bar]");
+
+
diff --git a/ext/HTML/Parser/t/unicode-bom.t b/ext/HTML/Parser/t/unicode-bom.t
new file mode 100644
index 0000000..34e066f
--- /dev/null
+++ b/ext/HTML/Parser/t/unicode-bom.t
@@ -0,0 +1,59 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 2;
+use HTML::Parser;
+
+SKIP: {
+skip "This perl does not support Unicode", 2 if $] < 5.008;
+
+my @parsed;
+my $p = HTML::Parser->new(
+ api_version => 3,
+ start_h => [\@parsed, 'tag, attr'],
+);
+
+my @warn;
+$SIG{__WARN__} = sub {
+ push(@warn, $_[0]);
+};
+
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>");
+$p->eof;
+
+#use Encode;
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>" . chr(0x263A));
+$p->eof;
+
+$p->parse("\xFF\xFE<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\xFE\xFF<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\0\0\xFF\xFE<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\xFE\xFF\0\0<head>Hi there</head>");
+$p->eof;
+
+is(join("", @warn), <<EOT);
+Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line 21.
+Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line 25.
+Parsing of undecoded UTF-16 at $0 line 28.
+Parsing of undecoded UTF-16 at $0 line 31.
+Parsing of undecoded UTF-32 at $0 line 34.
+Parsing of undecoded UTF-32 at $0 line 37.
+EOT
+
+@warn = ();
+
+$p = HTML::Parser->new(
+ api_version => 3,
+ start_h => [\@parsed, 'tag'],
+);
+
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>");
+$p->eof;
+ok(!@warn);
+}
diff --git a/ext/HTML/Parser/t/unicode.t b/ext/HTML/Parser/t/unicode.t
new file mode 100644
index 0000000..82902de
--- /dev/null
+++ b/ext/HTML/Parser/t/unicode.t
@@ -0,0 +1,183 @@
+#!perl -w
+
+use strict;
+use HTML::Parser;
+use Test::More tests => 103;
+
+SKIP: {
+skip "This perl does not support Unicode", 103 if $] < 5.008;
+
+my @warn;
+$SIG{__WARN__} = sub {
+ push(@warn, $_[0]);
+};
+
+my @parsed;
+my $p = HTML::Parser->new(
+ api_version => 3,
+ default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'],
+);
+
+my $doc = "<title>\x{263A}</title><h1 id=\x{2600} f>Smile &#x263a</h1>\x{0420}";
+is(length($doc), 46);
+
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "<title>");
+SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") };
+is($parsed[1][3], 0);
+is($parsed[1][4], 7);
+
+is($parsed[2][0], "text");
+is(ord($parsed[2][1]), 0x263A);
+is($parsed[2][2], chr(0x263A));
+is($parsed[2][3], 7);
+is($parsed[2][4], 1);
+is($parsed[2][5], 8);
+is($parsed[2][6], 7);
+
+is($parsed[3][0], "end");
+is($parsed[3][1], "</title>");
+is($parsed[3][3], 8);
+is($parsed[3][6], 8);
+
+is($parsed[4][0], "start");
+is($parsed[4][1], "<h1 id=\x{2600} f>");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0");
+is($parsed[4][8]{id}, "\x{2600}");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile &#x263a");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[7][0], "text");
+is($parsed[7][1], "\x{0420}");
+is($parsed[7][2], "\x{0420}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+is(@warn, 0);
+
+# Try to parse it as an UTF8 encoded string
+utf8::encode($doc);
+is(length($doc), 51);
+
+@parsed = ();
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "<title>");
+SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") };
+is($parsed[1][3], 0);
+is($parsed[1][4], 7);
+
+is($parsed[2][0], "text");
+is(ord($parsed[2][1]), 226);
+is($parsed[2][1], "\xE2\x98\xBA");
+is($parsed[2][2], "\xE2\x98\xBA");
+is($parsed[2][3], 7);
+is($parsed[2][4], 3);
+is($parsed[2][5], 10);
+is($parsed[2][6], 7);
+
+is($parsed[3][0], "end");
+is($parsed[3][1], "</title>");
+is($parsed[3][3], 10);
+is($parsed[3][6], 10);
+
+is($parsed[4][0], "start");
+is($parsed[4][1], "<h1 id=\xE2\x98\x80 f>");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0");
+is($parsed[4][8]{id}, "\xE2\x98\x80");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile &#x263a");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+my $file = "test-$$.html";
+open(my $fh, ">:utf8", $file) || die;
+print $fh <<EOT;
+\x{FEFF}
+<title>\x{263A} Love! </title>
+<h1 id=&hearts;\x{2665}>&hearts; Love \x{2665}<h1>
+EOT
+close($fh) || die;
+
+@warn = ();
+@parsed = ();
+$p->parse_file($file);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "&hearts; Love \xE2\x99\xA5");
+is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage
+is($parsed[10][3], -s $file);
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw:utf8", $file) || die;
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\x{2665}");
+is($parsed[7][0], "text");
+is($parsed[7][1], "&hearts; Love \x{2665}");
+is($parsed[7][2], "\x{2665} Love \x{2665}");
+is($parsed[10][3], (-s $file) - 2 * 4);
+is(@warn, 0);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw", $file) || die;
+$p->utf8_mode(1);
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "&hearts; Love \xE2\x99\xA5");
+is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5");
+is($parsed[10][3], -s $file);
+is(@warn, 0);
+
+unlink($file);
+
+@parsed = ();
+$p->parse(q(<a href="a=1&lang=2&times=3">foo</a>))->eof;
+is(@parsed, "5");
+is($parsed[1][0], "start");
+is($parsed[1][8]{href}, "a=1&lang=2\xd7=3");
+
+ok(!HTML::Entities::_probably_utf8_chunk(""));
+ok(!HTML::Entities::_probably_utf8_chunk("f"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99"));
+}
diff --git a/ext/HTML/Parser/t/xml-mode.t b/ext/HTML/Parser/t/xml-mode.t
new file mode 100644
index 0000000..cdfc5b0
--- /dev/null
+++ b/ext/HTML/Parser/t/xml-mode.t
@@ -0,0 +1,112 @@
+use strict;
+use Test::More tests => 8;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(xml_mode => 1,
+ );
+
+my $text = "";
+$p->handler(start =>
+ sub {
+ my($tag, $attr) = @_;
+ $text .= "S[$tag";
+ for my $k (sort keys %$attr) {
+ my $v = $attr->{$k};
+ $text .= " $k=$v";
+ }
+ $text .= "]";
+ }, "tagname,attr");
+$p->handler(end =>
+ sub {
+ $text .= "E[" . shift() . "]";
+ }, "tagname");
+$p->handler(process =>
+ sub {
+ $text .= "PI[" . shift() . "]";
+ }, "token0");
+$p->handler(text =>
+ sub {
+ $text .= shift;
+ }, "text");
+
+my $xml = <<'EOT';
+<?xml version="1.0"?>
+<?IS10744:arch name="html"?>
+<DOC>
+<title html="h1">My first architectual document</title>
+<author html="address">Geir Ove Gronmo, grove@infotek.no</author>
+<para>This is the first paragraph in this document</para>
+<para html="p">This is the second paragraph</para>
+<para/>
+<xmp><foo></foo></xmp>
+</DOC>
+EOT
+
+$p->parse($xml)->eof;
+
+is($text, <<'EOT');
+PI[xml version="1.0"]
+PI[IS10744:arch name="html"]
+S[DOC]
+S[title html=h1]My first architectual documentE[title]
+S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
+S[para]This is the first paragraph in this documentE[para]
+S[para html=p]This is the second paragraphE[para]
+S[para]E[para]
+S[xmp]S[foo]E[foo]E[xmp]
+E[DOC]
+EOT
+
+$text = "";
+$p->xml_mode(0);
+$p->parse($xml)->eof;
+
+is($text, <<'EOT');
+PI[xml version="1.0"?]
+PI[IS10744:arch name="html"?]
+S[doc]
+S[title html=h1]My first architectual documentE[title]
+S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
+S[para]This is the first paragraph in this documentE[para]
+S[para html=p]This is the second paragraphE[para]
+S[para/]
+S[xmp]<foo></foo>E[xmp]
+E[doc]
+EOT
+
+# Test that we get an empty tag back
+$p = HTML::Parser->new(api_version => 3,
+ xml_mode => 1);
+
+$p->handler("end" =>
+ sub {
+ my($tagname, $text) = @_;
+ is($tagname, "Xyzzy");
+ ok(!length($text));
+ }, "tagname,text");
+$p->parse("<Xyzzy foo=bar/>and some more")->eof;
+
+# Test that we get an empty tag back
+$p = HTML::Parser->new(api_version => 3,
+ empty_element_tags => 1);
+
+$p->handler("end" =>
+ sub {
+ my($tagname, $text) = @_;
+ is($tagname, "xyzzy");
+ ok(!length($text));
+ }, "tagname,text");
+$p->parse("<Xyzzy foo=bar/>and some more")->eof;
+
+$p = HTML::Parser->new(
+ api_version => 3,
+ xml_pic => 1,
+);
+
+$p->handler(
+ "process" => sub {
+ my($text, $t0) = @_;
+ is($text, "<?foo > bar?>");
+ is($t0, "foo > bar");
+ }, "text, token0");
+$p->parse("<?foo > bar?> and then")->eof;
diff --git a/ext/HTML/Parser/tokenpos.h b/ext/HTML/Parser/tokenpos.h
new file mode 100644
index 0000000..aa971bf
--- /dev/null
+++ b/ext/HTML/Parser/tokenpos.h
@@ -0,0 +1,49 @@
+struct token_pos
+{
+    char *beg;   /* set from PUSH_TOKEN's p_beg argument */
+    char *end;   /* set from PUSH_TOKEN's p_end argument */
+};
+typedef struct token_pos token_pos_t;
+/* dTOKENS: declare an empty token list backed by a local array of init_lim slots. */
+#define dTOKENS(init_lim) \
+    token_pos_t token_buf[init_lim]; \
+    int token_lim = init_lim; \
+    token_pos_t *tokens = token_buf; \
+    int num_tokens = 0
+/* PUSH_TOKEN: append one pair; tokens_grow() is called as soon as the count reaches token_lim. */
+#define PUSH_TOKEN(p_beg, p_end) \
+    STMT_START { \
+        ++num_tokens; \
+        if (num_tokens == token_lim) \
+            tokens_grow(&tokens, &token_lim, (bool)(tokens != token_buf)); \
+        tokens[num_tokens-1].beg = p_beg; \
+        tokens[num_tokens-1].end = p_end; \
+    } STMT_END
+/* FREE_TOKENS: Safefree() the list only when it no longer points at token_buf. */
+#define FREE_TOKENS \
+    STMT_START { \
+        if (tokens != token_buf) \
+            Safefree(tokens); \
+    } STMT_END
+
+static void
+tokens_grow(token_pos_t **token_ptr, int *token_lim_ptr, bool tokens_on_heap)
+{
+    /* Double the array capacity (at least 4 before doubling).  A heap array
+     * is resized with Renew(); otherwise a new heap array gets a copy of the
+     * old entries.  *token_ptr and *token_lim_ptr are updated in place. */
+    const int old_lim = *token_lim_ptr;
+    const int new_lim = 2 * (old_lim < 4 ? 4 : old_lim);
+
+    if (!tokens_on_heap) {
+        token_pos_t *fresh;
+        int n = old_lim;
+        New(57, fresh, new_lim, token_pos_t);
+        while (n-- > 0)
+            fresh[n] = (*token_ptr)[n];
+        *token_ptr = fresh;
+    } else {
+        Renew(*token_ptr, new_lim, token_pos_t);
+    }
+    *token_lim_ptr = new_lim;
+}
diff --git a/ext/HTML/Parser/typemap b/ext/HTML/Parser/typemap
new file mode 100644
index 0000000..a323854
--- /dev/null
+++ b/ext/HTML/Parser/typemap
@@ -0,0 +1,5 @@
+PSTATE* T_PSTATE
+
+INPUT
+T_PSTATE
+ $var = get_pstate_hv(aTHX_ $arg)
diff --git a/ext/HTML/Parser/util.c b/ext/HTML/Parser/util.c
new file mode 100644
index 0000000..7e626bf
--- /dev/null
+++ b/ext/HTML/Parser/util.c
@@ -0,0 +1,312 @@
+/* $Id: util.c,v 2.30 2006/03/22 09:15:17 gisle Exp $
+ *
+ * Copyright 1999-2006, Gisle Aas.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the same terms as Perl itself.
+ */
+
+#ifndef EXTERN
+#define EXTERN extern
+#endif
+
+
+EXTERN SV*
+sv_lower(pTHX_ SV* sv)
+{   /* Lower-case the string buffer of sv in place; returns the same sv. */
+    STRLEN len, i;
+    char *buf = SvPV_force(sv, len);
+    for (i = 0; i < len; i++)
+        buf[i] = toLOWER(buf[i]);
+    return sv;
+}
+
+EXTERN int
+strnEQx(const char* s1, const char* s2, STRLEN n, int ignore_case)
+{
+    /* Compare the first n chars of s1 and s2; returns 1 when they all
+     * match and 0 at the first mismatch.  With ignore_case set, both
+     * chars are passed through toLOWER() before being compared.
+     * NUL bytes get no special treatment: exactly n chars are examined.
+     */
+    STRLEN i;
+    for (i = 0; i < n; i++) {
+        char c1 = s1[i];
+        char c2 = s2[i];
+        if (ignore_case ? toLOWER(c1) != toLOWER(c2) : c1 != c2)
+            return 0;
+    }
+    return 1;
+}
+
+static void
+grow_gap(pTHX_ SV* sv, STRLEN grow, char** t, char** s, char** e)
+{
+    /* Widen the gap between *t and *s inside sv's buffer by `grow` bytes:
+       SvPVX ---> AAAAAA...BBBBBB
+                        ^  ^     ^
+                        t  s     e
+       The bytes in [*s, *e) move up by `grow`; *s and *e move with them. */
+    STRLEN t_offset = *t - SvPVX(sv);
+    STRLEN s_offset = *s - SvPVX(sv);
+    STRLEN e_offset = *e - SvPVX(sv);
+    /* offsets are saved because all three pointers are rebuilt from SvPVX(sv) after SvGROW */
+    SvGROW(sv, e_offset + grow + 1);
+    /* (the extra byte requested above is presumably for the trailing NUL -- confirm) */
+    *t = SvPVX(sv) + t_offset;
+    *s = SvPVX(sv) + s_offset;
+    *e = SvPVX(sv) + e_offset;
+    /* shift the tail up; *t keeps its offset, so the gap grows by `grow` */
+    Move(*s, *s+grow, *e - *s, char);
+    *s += grow;
+    *e += grow;
+}
+
+EXTERN SV*
+decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix)
+{
+    STRLEN len;
+    char *s = SvPV_force(sv, len);
+    char *t = s;
+    char *end = s + len;
+    char *ent_start;
+    /* Decode entity references in sv in place and return sv: s reads ahead, t writes behind it. */
+    char *repl;
+    STRLEN repl_len;
+#ifdef UNICODE_HTML_PARSER
+    char buf[UTF8_MAXLEN];
+    int repl_utf8;
+    int high_surrogate = 0;   /* value of a just-decoded high surrogate reference, 0 when none is pending */
+#else
+    char buf[1];
+#endif
+    /* entity2char maps names to replacements (NULL: numeric references only); expand_prefix also tries shorter name prefixes. */
+#if defined(__GNUC__) && defined(UNICODE_HTML_PARSER)
+    /* gcc -Wall reports this variable as possibly used uninitialized */
+    repl_utf8 = 0;
+#endif
+
+    while (s < end) {
+        assert(t <= s);
+
+        if ((*t++ = *s++) != '&')
+            continue;
+
+        ent_start = s;
+        repl = 0;
+
+        if (*s == '#') {
+            UV num = 0;
+            UV prev = 0;
+            int ok = 0;
+            s++;
+            if (*s == 'x' || *s == 'X') {
+                s++;
+                while (*s) {
+                    char *tmp = strchr(PL_hexdigit, *s);
+                    if (!tmp)
+                        break;
+                    num = num << 4 | ((tmp - PL_hexdigit) & 15);
+                    if (prev && num <= prev) {
+                        /* overflow */
+                        ok = 0;
+                        break;
+                    }
+                    prev = num;
+                    s++;
+                    ok = 1;
+                }
+            }
+            else {
+                while (isDIGIT(*s)) {
+                    num = num * 10 + (*s - '0');
+                    if (prev && num < prev) {
+                        /* overflow */
+                        ok = 0;
+                        break;
+                    }
+                    prev = num;
+                    s++;
+                    ok = 1;
+                }
+            }
+            if (ok) {
+#ifdef UNICODE_HTML_PARSER
+                if (!SvUTF8(sv) && num <= 255) {
+                    buf[0] = (char) num;
+                    repl = buf;
+                    repl_len = 1;
+                    repl_utf8 = 0;
+                }
+                else {
+                    char *tmp;   /* the surrogate tests mask at full UV width, so 0x10000DC00 is not taken for a surrogate */
+                    if ((num & ~(UV)0x3FF) == 0xDC00) { /* low-surrogate */
+                        if (high_surrogate != 0) {
+                            t -= 3; /* Back up past 0xFFFD */
+                            num = ((high_surrogate - 0xD800) << 10) +
+                                (num - 0xDC00) + 0x10000;
+                            high_surrogate = 0;
+                        } else {
+                            num = 0xFFFD;
+                        }
+                    }
+                    else if ((num & ~(UV)0x3FF) == 0xD800) { /* high-surrogate */
+                        high_surrogate = num;
+                        num = 0xFFFD;
+                    }
+                    else {
+                        high_surrogate = 0;
+                        /* otherwise invalid? */
+                        if ((num >= 0xFDD0 && num <= 0xFDEF) ||
+                            ((num & 0xFFFE) == 0xFFFE) ||
+                            num > 0x10FFFF)
+                        {
+                            num = 0xFFFD;
+                        }
+                    }
+
+                    tmp = (char*)uvuni_to_utf8((U8*)buf, num);
+                    repl = buf;
+                    repl_len = tmp - buf;
+                    repl_utf8 = 1;
+                }
+#else
+                if (num <= 255) {
+                    buf[0] = (char) num & 0xFF;
+                    repl = buf;
+                    repl_len = 1;
+                }
+#endif
+            }
+        }
+        else {
+            char *ent_name = s;
+            while (isALNUM(*s))
+                s++;
+            if (ent_name != s && entity2char) {
+                SV** svp;
+                if (              (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) ||
+                    (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0)))
+                   )
+                {
+                    repl = SvPV(*svp, repl_len);
+#ifdef UNICODE_HTML_PARSER
+                    repl_utf8 = SvUTF8(*svp);
+#endif
+                }
+                else if (expand_prefix) {
+                    char *ss = s - 1;
+                    while (ss > ent_name) {
+                        svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0);
+                        if (svp) {
+                            repl = SvPV(*svp, repl_len);
+#ifdef UNICODE_HTML_PARSER
+                            repl_utf8 = SvUTF8(*svp);
+#endif
+                            s = ss;
+                            break;
+                        }
+                        ss--;
+                    }
+                }
+            }
+#ifdef UNICODE_HTML_PARSER
+            high_surrogate = 0;
+#endif
+        }
+
+        if (repl) {
+            char *repl_allocated = 0;
+            if (*s == ';')
+                s++;
+            t--;  /* '&' already copied, undo it */
+
+#ifdef UNICODE_HTML_PARSER
+            if (*s != '&') {
+                high_surrogate = 0;
+            }
+
+            if (!SvUTF8(sv) && repl_utf8) {
+                /* need to upgrade sv before we continue */
+                STRLEN before_gap_len = t - SvPVX(sv);
+                char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len);
+                STRLEN after_gap_len = end - s;
+                char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len);
+
+                sv_setpvn(sv, before_gap, before_gap_len);
+                sv_catpvn(sv, after_gap, after_gap_len);
+                SvUTF8_on(sv);
+
+                Safefree(before_gap);
+                Safefree(after_gap);
+
+                s = t = SvPVX(sv) + before_gap_len;
+                end = SvPVX(sv) + before_gap_len + after_gap_len;
+            }
+            else if (SvUTF8(sv) && !repl_utf8) {
+                repl = (char*)bytes_to_utf8((U8*)repl, &repl_len);
+                repl_allocated = repl;
+            }
+#endif
+
+            if (t + repl_len > s) {
+                /* need to grow the string */
+                grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end);
+            }
+
+            /* copy replacement string into string */
+            while (repl_len--)
+                *t++ = *repl++;
+
+            if (repl_allocated)
+                Safefree(repl_allocated);
+        }
+        else {
+            while (ent_start < s)
+                *t++ = *ent_start++;
+        }
+    }
+
+    *t = '\0';
+    SvCUR_set(sv, t - SvPVX(sv));
+
+    return sv;
+}
+
+#ifdef UNICODE_HTML_PARSER
+static bool
+has_hibit(char *s, char *e)
+{
+    /* True when some byte in [s, e) fails UTF8_IS_INVARIANT(). */
+    char *p;
+    for (p = s; p < e; p++) {
+        if (!UTF8_IS_INVARIANT((U8)*p))
+            return 1;
+    }
+    return 0;
+}
+
+
+EXTERN bool
+probably_utf8_chunk(pTHX_ char *s, STRLEN len)
+{
+    char *e = s + len;
+    STRLEN clen;   /* number of bytes trimmed off the end below */
+    /* True when the chunk, minus an incomplete trailing character, has a high-bit byte and passes is_utf8_string(). */
+    /* ignore partial utf8 char at end of buffer */
+    while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1)))
+        e--;
+    if (s < e && UTF8_IS_START((U8)*(e - 1)))
+        e--;
+    clen = len - (e - s);
+    if (clen && UTF8SKIP(e) == clen) {
+        /* all promised continuation bytes are present */
+        e = s + len;
+    }
+    /* a chunk with no non-invariant byte is reported as not UTF-8 */
+    if (!has_hibit(s, e))
+        return 0;
+    /* validate only [s, e), so a cut-off final character does not count against the chunk */
+    return is_utf8_string((U8*)s, e - s);
+}
+#endif
diff --git a/lib/HTML/Tagset.pm b/lib/HTML/Tagset.pm
new file mode 100644
index 0000000..754137f
--- /dev/null
+++ b/lib/HTML/Tagset.pm
@@ -0,0 +1,471 @@
+package HTML::Tagset;
+
+use strict;
+
+=head1 NAME
+
+HTML::Tagset - data tables useful in parsing HTML
+
+=head1 VERSION
+
+Version 3.20
+
+=cut
+
+use vars qw( $VERSION );
+
+$VERSION = '3.20';
+
+=head1 SYNOPSIS
+
+ use HTML::Tagset;
+ # Then use any of the items in the HTML::Tagset package
+ # as need arises
+
+=head1 DESCRIPTION
+
+This module contains several data tables useful in various kinds of
+HTML parsing operations.
+
+Note that all tag names used are lowercase.
+
+In the following documentation, a "hashset" is a hash being used as a
+set -- the hash conveys that its keys are there, and the actual values
+associated with the keys are not significant. (But what values are
+there, are always true.)
+
+=cut
+
+use vars qw(
+ $VERSION
+ %emptyElement %optionalEndTag %linkElements %boolean_attr
+ %isHeadElement %isBodyElement %isPhraseMarkup
+ %is_Possible_Strict_P_Content
+ %isHeadOrBodyElement
+ %isList %isTableElement %isFormElement
+ %isKnown %canTighten
+ @p_closure_barriers
+ %isCDATA_Parent
+);
+
+=head1 VARIABLES
+
+Note that none of these variables are exported.
+
+=head2 hashset %HTML::Tagset::emptyElement
+
+This hashset has as keys the tag-names (GIs) of elements that cannot
+have content. (For example, "base", "br", "hr".) So
+C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
+C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
+
+=cut
+
+%emptyElement = map {; $_ => 1 } qw(base link meta isindex
+ img br hr wbr
+ input area param
+ embed bgsound spacer
+ basefont col frame
+ ~comment ~literal
+ ~declaration ~pi
+ );
+# The "~"-initial names are for pseudo-elements used by HTML::Entities
+# and TreeBuilder
+
+=head2 hashset %HTML::Tagset::optionalEndTag
+
+This hashset lists tag-names for elements that can have content, but whose
+end-tags are generally, "safely", omissible. Example:
+C<$HTML::Tagset::optionalEndTag{'li'}> exists and is true.
+
+=cut
+
+%optionalEndTag = map {; $_ => 1 } qw(p li dt dd); # option th tr td);
+
+=head2 hash %HTML::Tagset::linkElements
+
+Keys in this hash are tagnames for elements that might contain
+links, and the value for each is a reference to an array of the names
+of attributes whose values can be links.
+
+=cut
+
+%linkElements =
+(
+ 'a' => ['href'],
+ 'applet' => ['archive', 'codebase', 'code'],
+ 'area' => ['href'],
+ 'base' => ['href'],
+ 'bgsound' => ['src'],
+ 'blockquote' => ['cite'],
+ 'body' => ['background'],
+ 'del' => ['cite'],
+ 'embed' => ['pluginspage', 'src'],
+ 'form' => ['action'],
+ 'frame' => ['src', 'longdesc'],
+ 'iframe' => ['src', 'longdesc'],
+ 'ilayer' => ['background'],
+ 'img' => ['src', 'lowsrc', 'longdesc', 'usemap'],
+ 'input' => ['src', 'usemap'],
+ 'ins' => ['cite'],
+ 'isindex' => ['action'],
+ 'head' => ['profile'],
+ 'layer' => ['background', 'src'],
+ 'link' => ['href'],
+ 'object' => ['classid', 'codebase', 'data', 'archive', 'usemap'],
+ 'q' => ['cite'],
+ 'script' => ['src', 'for'],
+ 'table' => ['background'],
+ 'td' => ['background'],
+ 'th' => ['background'],
+ 'tr' => ['background'],
+ 'xmp' => ['href'],
+);
+
+=head2 hash %HTML::Tagset::boolean_attr
+
+This hash (not hashset) lists what attributes of what elements can be
+printed without showing the value (for example, the "noshade" attribute
+of "hr" elements). For elements with only one such attribute, its value
+is simply that attribute name. For elements with many such attributes,
+the value is a reference to a hashset containing all such attributes.
+
+=cut
+
+%boolean_attr = (
+# TODO: make these all hashes
+ 'area' => 'nohref',
+ 'dir' => 'compact',
+ 'dl' => 'compact',
+ 'hr' => 'noshade',
+ 'img' => 'ismap',
+ 'input' => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 },
+ 'menu' => 'compact',
+ 'ol' => 'compact',
+ 'option' => 'selected',
+ 'select' => 'multiple',
+ 'td' => 'nowrap',
+ 'th' => 'nowrap',
+ 'ul' => 'compact',
+);
+
+#==========================================================================
+# List of all elements from Extensible HTML version 1.0 Transitional DTD:
+#
+# a abbr acronym address applet area b base basefont bdo big
+# blockquote body br button caption center cite code col colgroup
+# dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
+# head hr html i iframe img input ins isindex kbd label legend li
+# link map menu meta noframes noscript object ol optgroup option p
+# param pre q s samp script select small span strike strong style
+# sub sup table tbody td textarea tfoot th thead title tr tt u ul
+# var
+#
+# Varia from Mozilla source internal table of tags:
+# Implemented:
+# xmp listing wbr nobr frame frameset noframes ilayer
+# layer nolayer spacer embed multicol
+# But these are unimplemented:
+# sound?? keygen?? server??
+# Also seen here and there:
+# marquee?? app?? (both unimplemented)
+#==========================================================================
+
+=head2 hashset %HTML::Tagset::isPhraseMarkup
+
+This hashset contains all phrasal-level elements.
+
+=cut
+
+%isPhraseMarkup = map {; $_ => 1 } qw(
+ span abbr acronym q sub sup
+ cite code em kbd samp strong var dfn strike
+ b i u s tt small big
+ a img br
+ wbr nobr blink
+ font basefont bdo
+ spacer embed noembed
+); # had: center, hr, table
+
+
+=head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content
+
+This hashset contains all phrasal-level elements that can be content of a
+P element, for a strict model of HTML.
+
+=cut
+
+%is_Possible_Strict_P_Content = (
+ %isPhraseMarkup,
+ # %isFormElement is only assigned further down, so its members go here:
+ map {; $_ => 1} qw( input select option optgroup textarea button label
+ object script map )
+ # No idea why object/script/map are exceptions; this follows the HTML4.01 DTD.
+);
+
+#from html4 strict:
+#<!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
+#
+#<!ENTITY % phrase "EM | STRONG | DFN | CODE |
+# SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
+#
+#<!ENTITY % special
+# "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
+#
+#<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
+#
+#
+#<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
+
+=head2 hashset %HTML::Tagset::isHeadElement
+
+This hashset contains all elements that should be
+present only in the 'head' element of an HTML document.
+
+=cut
+
+%isHeadElement = map {; $_ => 1 }
+ qw(title base link meta isindex script style object bgsound);
+
+=head2 hashset %HTML::Tagset::isList
+
+This hashset contains all elements that can contain "li" elements.
+
+=cut
+
+%isList = map {; $_ => 1 } qw(ul ol dir menu);
+
+=head2 hashset %HTML::Tagset::isTableElement
+
+This hashset contains all elements that are to be found only in/under
+a "table" element.
+
+=cut
+
+%isTableElement = map {; $_ => 1 }
+ qw(tr td th thead tbody tfoot caption col colgroup);
+
+=head2 hashset %HTML::Tagset::isFormElement
+
+This hashset contains all elements that are to be found only in/under
+a "form" element.
+
+=cut
+
+%isFormElement = map {; $_ => 1 }
+ qw(input select option optgroup textarea button label);
+
+=head2 hashset %HTML::Tagset::isBodyElement
+
+This hashset contains all elements that are to be found only in/under
+the "body" element of an HTML document.
+
+=cut
+
+%isBodyElement = map {; $_ => 1 } qw(
+ h1 h2 h3 h4 h5 h6
+ p div pre plaintext address blockquote
+ xmp listing
+ center
+
+ multicol
+ iframe ilayer nolayer
+ bgsound
+
+ hr
+ ol ul dir menu li
+ dl dt dd
+ ins del
+
+ fieldset legend
+
+ map area
+ applet param object
+ isindex script noscript
+ table
+ center
+ form
+ ),
+ keys %isFormElement,
+ keys %isPhraseMarkup, # And everything phrasal
+ keys %isTableElement,
+;
+
+
+=head2 hashset %HTML::Tagset::isHeadOrBodyElement
+
+This hashset includes all elements that I notice can fall either in
+the head or in the body.
+
+=cut
+
+%isHeadOrBodyElement = map {; $_ => 1 }
+ qw(script isindex style object map area param noscript bgsound);
+ # i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
+
+
+=head2 hashset %HTML::Tagset::isKnown
+
+This hashset lists all known HTML elements.
+
+=cut
+
+%isKnown = (%isHeadElement, %isBodyElement,
+ map{; $_=>1 }
+ qw( head body html
+ frame frameset noframes
+ ~comment ~pi ~directive ~literal
+));
+ # that should be all known tags ever ever
+
+
+=head2 hashset %HTML::Tagset::canTighten
+
+This hashset lists elements that might have ignorable whitespace as
+children or siblings.
+
+=cut
+
+%canTighten = %isKnown;
+delete @canTighten{
+ keys(%isPhraseMarkup), 'input', 'select',
+ 'xmp', 'listing', 'plaintext', 'pre',
+};
+ # xmp, listing, plaintext, and pre are untightenable, and
+ # in a really special way.
+@canTighten{'hr','br'} = (1,1);
+ # exceptional 'phrasal' things that ARE subject to tightening.
+
+# The one case where I can think of my tightening rules failing is:
+# <p>foo bar<center> <em>baz quux</em> ...
+# ^-- that would get deleted.
+# But that's pretty gruesome code anyhow. You gets what you pays for.
+
+#==========================================================================
+
+=head2 array @HTML::Tagset::p_closure_barriers
+
+This array has a meaning that I have only seen a need for in
+C<HTML::TreeBuilder>, but I include it here on the off chance that someone
+might find it of use:
+
+When we see a "E<lt>pE<gt>" token, we go looking up the lineage for a p
+element we might have to minimize. At first sight, we might say that
+if there's a p anywhere in the lineage of this new p, it should be
+closed. But that's wrong. Consider this document:
+
+ <html>
+ <head>
+ <title>foo</title>
+ </head>
+ <body>
+ <p>foo
+ <table>
+ <tr>
+ <td>
+ foo
+ <p>bar
+ </td>
+ </tr>
+ </table>
+ </p>
+ </body>
+ </html>
+
+The second p is quite legally inside a much higher p.
+
+My formalization of the reason why this is legal, but this:
+
+ <p>foo<p>bar</p></p>
+
+isn't, is that something about the table constitutes a "barrier" to
+the application of the rule about what p must minimize.
+
+So C<@HTML::Tagset::p_closure_barriers> is the list of all such
+barrier-tags.
+
+=cut
+
+@p_closure_barriers = qw(
+ li blockquote
+ ul ol menu dir
+ dl dt dd
+ td th tr table caption
+ div
+ );
+
+# In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this
+# monkey business of barriers to minimization!
+
+=head2 hashset %isCDATA_Parent
+
+This hashset includes all elements whose content is CDATA.
+
+=cut
+
+%isCDATA_Parent = map {; $_ => 1 }
+ qw(script style xmp listing plaintext);
+
+# TODO: there's nothing else that takes CDATA children, right?
+
+# As the HTML3 DTD (Raggett 1995-04-24) noted:
+# The XMP, LISTING and PLAINTEXT tags are incompatible with SGML
+# and derive from very early versions of HTML. They require non-
+# standard parsers and will cause problems for processing
+# documents with standard SGML tools.
+
+
+=head1 CAVEATS
+
+You may find it useful to alter the behavior of modules (like
+C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s
+data tables by altering the data tables themselves. You are welcome
+to try, but be careful; and be aware that different modules may or may not
+react differently to the data tables being changed.
+
+Note that it may be inappropriate to use these tables for I<producing>
+HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames
+for all elements that can appear either in the head or in the body,
+such as "script". That doesn't mean that I am saying your code that
+produces HTML should feel free to put script elements in either place!
+If you are producing programs that spit out HTML, you should be
+I<intimately> familiar with the DTDs for HTML or XHTML (available at
+C<http://www.w3.org/>), and you should slavishly obey them, not
+the data tables in this document.
+
+=head1 SEE ALSO
+
+L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor>
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 1995-2000 Gisle Aas.
+
+Copyright 2000-2005 Sean M. Burke.
+
+Copyright 2005-2008 Andy Lester.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=head1 ACKNOWLEDGEMENTS
+
+Most of the code/data in this module was adapted from code written
+by Gisle Aas for C<HTML::Element>, C<HTML::TreeBuilder>, and
+C<HTML::LinkExtor>. Then it was maintained by Sean M. Burke.
+
+=head1 AUTHOR
+
+Current maintainer: Andy Lester, C<< <andy at petdance.com> >>
+
+=head1 BUGS
+
+Please report any bugs or feature requests to
+C<bug-html-tagset at rt.cpan.org>, or through the web interface at
+L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Tagset>. I will
+be notified, and then you'll automatically be notified of progress on
+your bug as I make changes.
+
+=cut
+
+1;
diff --git a/lib/HTML/Tagset/t/00_about_verbose.t b/lib/HTML/Tagset/t/00_about_verbose.t
new file mode 100644
index 0000000..3278b58
--- /dev/null
+++ b/lib/HTML/Tagset/t/00_about_verbose.t
@@ -0,0 +1,85 @@
+
+require 5;
+# Time-stamp: "2004-12-29 20:55:15 AST"
+# Summary of, well, things.
+
+use Test;
+BEGIN {plan tests => 2};
+ok 1;
+
+use HTML::Tagset ();
+
+#chdir "t" if -e "t";
+
+{ # Print a '#'-prefixed report: perl version, OS, loaded modules, timestamps.
+ my @out;
+ push @out,
+ "\n\nPerl v",
+ defined($^V) ? sprintf('%vd', $^V) : $],
+ " under $^O ",
+ (defined(&Win32::BuildNumber) and defined &Win32::BuildNumber())
+ ? ("(Win32::BuildNumber ", &Win32::BuildNumber(), ")") : (),
+ (defined $MacPerl::Version)
+ ? ("(MacPerl version $MacPerl::Version)") : (),
+ "\n"
+ ;
+
+ # Ugly code to walk the symbol tables:
+ my %v;
+ my @stack = (''); # start out in %::
+ my $this;
+ my $count = 0;
+ my $pref;
+ while(@stack) {
+ $this = shift @stack;
+ die "Too many packages?" if ++$count > 1000;
+ next if exists $v{$this};
+ next if $this eq 'main'; # %main:: is %::
+
+ #print "Peeking at $this => ${$this . '::VERSION'}\n";
+
+ if(defined ${$this . '::VERSION'} ) {
+ $v{$this} = ${$this . '::VERSION'}
+ } elsif(
+ defined *{$this . '::ISA'} or defined &{$this . '::import'}
+ or ($this ne '' and grep defined *{$_}{'CODE'}, values %{$this . "::"})
+ # If it has an ISA, an import, or any subs...
+ ) {
+ # It's a class/module with no version.
+ $v{$this} = undef;
+ } else {
+ # It's probably an unpopulated package.
+ ## $v{$this} = '...';
+ }
+
+ $pref = length($this) ? "$this\::" : '';
+ push @stack, map m/^(.+)::$/ ? "$pref$1" : (), keys %{$this . '::'};
+ #print "Stack: @stack\n";
+ }
+ push @out, " Modules in memory:\n";
+ delete @v{'', '[none]'};
+ foreach my $p (sort {lc($a) cmp lc($b)} keys %v) {
+ my $indent = ' ' x (2 + ($p =~ tr/:/:/)); # deeper packages indent further
+ push @out, ' ', $indent, $p, defined($v{$p}) ? " v$v{$p};\n" : ";\n";
+ }
+ push @out, sprintf "[at %s (local) / %s (GMT)]\n",
+ scalar(localtime), scalar(gmtime); # argument order must match the labels
+ my $x = join '', @out;
+ $x =~ s/^/#/mg;
+ print $x;
+}
+
+print "# Running",
+ (chr(65) eq 'A') ? " in an ASCII world.\n" : " in a non-ASCII world.\n",
+ "#\n",
+;
+
+print "# \@INC:\n", map("# [$_]\n", @INC), "#\n#\n";
+
+print "# \%INC:\n";
+foreach my $x (sort {lc($a) cmp lc($b)} keys %INC) {
+ print "# [$x] = [", $INC{$x} || '', "]\n";
+}
+
+ok 1;
+
diff --git a/lib/HTML/Tagset/t/01_old_junk.t b/lib/HTML/Tagset/t/01_old_junk.t
new file mode 100644
index 0000000..a09080f
--- /dev/null
+++ b/lib/HTML/Tagset/t/01_old_junk.t
@@ -0,0 +1,8 @@
+
+# Time-stamp: "2004-12-29 18:49:45 AST"
+
+BEGIN { $| = 1; print "1..1\n"; } # unbuffered output; announce a one-test plan
+END {print "not ok 1\n" unless $loaded;} # reports failure if loading never completed
+use HTML::Tagset;
+$loaded = 1;
+print "ok 1\n";

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Next example

Test

" +9.6:79 text "Test" +9.10:83 end "