From: Steve Peters Date: Thu, 5 Jun 2008 01:58:26 +0000 (+0000) Subject: Assimilate HTML-Parser and HTML-Tagset. HTML-Parser is now a prereq X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=42e4baff3065f0219e40d48113b9180ea9333dbc;p=p5sagit%2Fp5-mst-13.2.git Assimilate HTML-Parser and HTML-Tagset. HTML-Parser is now a prereq for Pod-Simple and HTML-Tagset is a prereq for HTML-Parser. I also resorted the MANIFEST file. p4raw-id: //depot/perl@33998 --- diff --git a/MANIFEST b/MANIFEST index b4bba83..51c1d3d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -75,8 +75,8 @@ ext/attrs/t/attrs.t See if attrs works with C ext/B/B/Concise.pm Compiler Concise backend ext/B/B/Debug.pm Compiler Debug backend ext/B/B/Deparse.pm Compiler Deparse backend -ext/B/B/Lint.pm Compiler Lint backend ext/B/B/Lint/Debug.pm Adds debugging stringification to B:: +ext/B/B/Lint.pm Compiler Lint backend ext/B/B.pm Compiler backend support functions and methods ext/B/B/Showlex.pm Compiler Showlex backend ext/B/B/Terse.pm Compiler Terse backend @@ -350,6 +350,7 @@ ext/Devel/PPPort/t/MY_CXT.t Devel::PPPort test file ext/Devel/PPPort/t/newCONSTSUB.t Devel::PPPort test file ext/Devel/PPPort/t/newRV.t Devel::PPPort test file ext/Devel/PPPort/t/newSVpv.t Devel::PPPort test file +ext/Devel/PPPort/TODO Devel::PPPort Todo ext/Devel/PPPort/t/podtest.t Devel::PPPort test file ext/Devel/PPPort/t/ppphtest.t Devel::PPPort test file ext/Devel/PPPort/t/pvs.t Devel::PPPort test file @@ -365,7 +366,6 @@ ext/Devel/PPPort/t/threads.t Devel::PPPort test file ext/Devel/PPPort/t/uv.t Devel::PPPort test file ext/Devel/PPPort/t/variables.t Devel::PPPort test file ext/Devel/PPPort/t/warn.t Devel::PPPort test file -ext/Devel/PPPort/TODO Devel::PPPort Todo ext/Devel/PPPort/typemap Devel::PPPort Typemap ext/Digest/MD5/Changes Digest::MD5 extension changes ext/Digest/MD5/hints/dec_osf.pl Hints for named architecture @@ -693,6 +693,71 @@ ext/Hash/Util/lib/Hash/Util.pm Hash::Util ext/Hash/Util/Makefile.PL 
Makefile for Hash::Util ext/Hash/Util/t/Util.t See if Hash::Util works ext/Hash/Util/Util.xs XS bits of Hash::Util +ext/HTML/Parser/hints/solaris.pl files for HTML::Parser +ext/HTML/Parser/hparser.c files for HTML::Parser +ext/HTML/Parser/hparser.h files for HTML::Parser +ext/HTML/Parser/lib/HTML/Entities.pm file for HTML::Entities +ext/HTML/Parser/lib/HTML/Filter.pm file for HTML::Filter +ext/HTML/Parser/lib/HTML/HeadParser.pm file for HTML::HeadParser +ext/HTML/Parser/lib/HTML/LinkExtor.pm file for HTML::LinkExtor +ext/HTML/Parser/lib/HTML/PullParser.pm file for HTML::PullParser +ext/HTML/Parser/lib/HTML/TokeParser.pm file for HTML::TokeParser +ext/HTML/Parser/Makefile.PL files for HTML::Parser +ext/HTML/Parser/mkhctype files for HTML::Parser +ext/HTML/Parser/mkpfunc files for HTML::Parser +ext/HTML/Parser/Parser.pm files for HTML::Parser +ext/HTML/Parser/Parser.xs files for HTML::Parser +ext/HTML/Parser/t/api_version.t test for HTML::Parser +ext/HTML/Parser/t/argspec2.t test for HTML::Parser +ext/HTML/Parser/t/argspec-bad.t test for HTML::Parser +ext/HTML/Parser/t/argspec.t test for HTML::Parser +ext/HTML/Parser/t/attr-encoded.t test for HTML::Parser +ext/HTML/Parser/t/callback.t test for HTML::Parser +ext/HTML/Parser/t/case-sensitive.t test for HTML::Parser +ext/HTML/Parser/t/cases.t test for HTML::Parser +ext/HTML/Parser/t/comment.t test for HTML::Parser +ext/HTML/Parser/t/crashme.t test for HTML::Parser +ext/HTML/Parser/t/declaration.t test for HTML::Parser +ext/HTML/Parser/t/default.t test for HTML::Parser +ext/HTML/Parser/t/document.t test for HTML::Parser +ext/HTML/Parser/t/dtext.t test for HTML::Parser +ext/HTML/Parser/t/entities2.t test for HTML::Parser +ext/HTML/Parser/t/entities.t test for HTML::Parser +ext/HTML/Parser/t/filter-methods.t test for HTML::Parser +ext/HTML/Parser/t/filter.t test for HTML::Parser +ext/HTML/Parser/t/handler-eof.t test for HTML::Parser +ext/HTML/Parser/t/handler.t test for HTML::Parser +ext/HTML/Parser/t/headparser-http.t 
test for HTML::Parser +ext/HTML/Parser/t/headparser.t test for HTML::Parser +ext/HTML/Parser/t/ignore.t test for HTML::Parser +ext/HTML/Parser/t/largetags.t test for HTML::Parser +ext/HTML/Parser/t/linkextor-base.t test for HTML::Parser +ext/HTML/Parser/t/linkextor-rel.t test for HTML::Parser +ext/HTML/Parser/t/magic.t test for HTML::Parser +ext/HTML/Parser/t/marked-sect.t test for HTML::Parser +ext/HTML/Parser/t/msie-compat.t test for HTML::Parser +ext/HTML/Parser/t/offset.t test for HTML::Parser +ext/HTML/Parser/tokenpos.h files for HTML::Parser +ext/HTML/Parser/t/options.t test for HTML::Parser +ext/HTML/Parser/t/parsefile.t test for HTML::Parser +ext/HTML/Parser/t/parser.t test for HTML::Parser +ext/HTML/Parser/t/plaintext.t test for HTML::Parser +ext/HTML/Parser/t/pod.t test for HTML::Parser +ext/HTML/Parser/t/process.t test for HTML::Parser +ext/HTML/Parser/t/pullparser.t test for HTML::Parser +ext/HTML/Parser/t/script.t test for HTML::Parser +ext/HTML/Parser/t/skipped-text.t test for HTML::Parser +ext/HTML/Parser/t/stack-realloc.t test for HTML::Parser +ext/HTML/Parser/t/textarea.t test for HTML::Parser +ext/HTML/Parser/t/threads.t test for HTML::Parser +ext/HTML/Parser/t/tokeparser.t test for HTML::Parser +ext/HTML/Parser/t/uentities.t test for HTML::Parser +ext/HTML/Parser/t/unbroken-text.t test for HTML::Parser +ext/HTML/Parser/t/unicode-bom.t test for HTML::Parser +ext/HTML/Parser/t/unicode.t test for HTML::Parser +ext/HTML/Parser/t/xml-mode.t test for HTML::Parser +ext/HTML/Parser/typemap files for HTML::Parser +ext/HTML/Parser/util.c files for HTML::Parser ext/I18N/Langinfo/fallback/const-c.inc I18N::Langinfo ext/I18N/Langinfo/fallback/const-xs.inc I18N::Langinfo ext/I18N/Langinfo/Langinfo.pm I18N::Langinfo @@ -848,12 +913,12 @@ ext/IPC/SysV/README IPC::SysV README ext/IPC/SysV/regen.pl IPC::SysV file regeneration script ext/IPC/SysV/SysV.xs IPC::SysV extension Perl module ext/IPC/SysV/t/ipcsysv.t IPC::SysV test file -ext/IPC/SysV/t/pod.t IPC::SysV 
test file -ext/IPC/SysV/t/podcov.t IPC::SysV test file ext/IPC/SysV/t/msg.t IPC::SysV test file +ext/IPC/SysV/TODO IPC::SysV todo file +ext/IPC/SysV/t/podcov.t IPC::SysV test file +ext/IPC/SysV/t/pod.t IPC::SysV test file ext/IPC/SysV/t/sem.t IPC::SysV test file ext/IPC/SysV/t/shm.t IPC::SysV test file -ext/IPC/SysV/TODO IPC::SysV todo file ext/IPC/SysV/typemap IPC::SysV typemap ext/List/Util/Changes Util extension ext/List/Util/lib/List/Util.pm List::Util @@ -1478,9 +1543,9 @@ lib/Attribute/Handlers/t/data_convert.t Test attribute data conversion lib/Attribute/Handlers/t/linerep.t See if Attribute::Handlers works lib/Attribute/Handlers/t/multi.t See if Attribute::Handlers works lib/attributes.pm For "sub foo : attrlist" +lib/AutoLoader.pm Autoloader base class lib/AutoLoader/t/01AutoLoader.t See if AutoLoader works lib/AutoLoader/t/02AutoSplit.t See if AutoSplit works -lib/AutoLoader.pm Autoloader base class lib/AutoSplit.pm Split up autoload functions lib/autouse.pm Load and call a function only when it's used lib/autouse.t See if autouse works @@ -1582,8 +1647,8 @@ lib/CGI/t/start_end_end.t See if CGI.pm works lib/CGI/t/start_end_start.t See if CGI.pm works lib/CGI/t/switch.t See if CGI::Switch still loads lib/CGI/t/uploadInfo.t See if CGI.pm works -lib/CGI/t/upload.t See if CGI.pm works lib/CGI/t/upload_post_text.txt.packed Test data for CGI.pm +lib/CGI/t/upload.t See if CGI.pm works lib/CGI/t/util-58.t See if 5.8-dependent features work lib/CGI/t/util.t See if CGI.pm works lib/CGI/Util.pm Utility functions @@ -1839,10 +1904,10 @@ lib/ExtUtils/t/eu_command.t See if ExtUtils::Command works lib/ExtUtils/t/FIRST_MAKEFILE.t See if FIRST_MAKEFILE works lib/ExtUtils/t/fixin.t See if ExtUtils::MakeMaker works lib/ExtUtils/t/hints.t See if hint files are honored. 
+lib/ExtUtils/t/Installapi2.t See if new api for ExtUtils::Install::install() works lib/ExtUtils/t/INSTALL_BASE.t Test INSTALL_BASE in MakeMaker lib/ExtUtils/t/Installed.t See if ExtUtils::Installed works lib/ExtUtils/t/Install.t See if ExtUtils::Install works -lib/ExtUtils/t/Installapi2.t See if new api for ExtUtils::Install::install() works lib/ExtUtils/t/INST_PREFIX.t See if MakeMaker can apply PREFIXs lib/ExtUtils/t/INST.t Check MakeMaker INST_* macros lib/ExtUtils/t/Liblist.t See if ExtUtils::Liblist works @@ -1972,6 +2037,9 @@ lib/Getopt/Std.t See if Getopt::Std and Getopt::Long work lib/h2ph.t See if h2ph works like it should lib/h2xs.t See if h2xs produces expected lists of files lib/hostname.pl Old hostname code +lib/HTML/Tagset.pm HTML::Tagset +lib/HTML/Tagset/t/00_about_verbose.t HTML::Tagset +lib/HTML/Tagset/t/01_old_junk.t HTML::Tagset lib/I18N/Collate.pm Routines to do strxfrm-based collation lib/I18N/Collate.t See if I18N::Collate works lib/I18N/LangTags/ChangeLog I18N::LangTags @@ -2347,8 +2415,8 @@ lib/parent.pm Establish an ISA relationship with base classes at compile time lib/parent/t/compile-time-file.t tests for parent.pm lib/parent/t/compile-time.t tests for parent.pm lib/parent/t/lib/Dummy2.plugin test files for parent.pm -lib/parent/t/lib/Dummy.pm test files for parent.pm lib/parent/t/lib/Dummy/Outside.pm test files for parent.pm +lib/parent/t/lib/Dummy.pm test files for parent.pm lib/parent/t/lib/FileThatOnlyExistsAsPMC.pmc test files for parent.pm lib/parent/t/lib/ReturnsFalse.pm test files for parent.pm lib/parent/t/parent-classfromclassfile.t tests for parent.pm @@ -2565,9 +2633,9 @@ lib/Pod/Simple/t/testlib3/squaa/Vliff.pm Pod::Simple test file lib/Pod/Simple/t/tiedfh.t Pod::Simple test file lib/Pod/Simple/t/verbatim.t Pod::Simple test file lib/Pod/Simple/t/verb_fmt.t Pod::Simple test file -lib/Pod/Simple/t/x_nixer.t Pod::Simple test file lib/Pod/Simple/t/xhtml01.t Pod::Simple test file lib/Pod/Simple/t/xhtml05.t Pod::Simple test file 
+lib/Pod/Simple/t/x_nixer.t Pod::Simple test file lib/Pod/Simple/XHTML.pm turn Pod into XHTML lib/Pod/Simple/XMLOutStream.pm turn Pod into XML lib/Pod/t/basic.cap podlators test @@ -2593,8 +2661,8 @@ lib/Pod/t/htmllink.t pod2html link test lib/Pod/t/htmlview.pod pod2html render test input data lib/Pod/t/htmlview.t pod2html render test lib/Pod/t/InputObjects.t See if Pod::InputObjects works -lib/Pod/t/man.t podlators test lib/Pod/t/man-options.t podlators test +lib/Pod/t/man.t podlators test lib/Pod/t/parselink.t podlators test lib/Pod/t/pod2html-lib.pl pod2html testing library lib/Pod/t/pod2latex.t See if Pod::LaTeX works @@ -2612,9 +2680,9 @@ lib/Search/Dict.pm Perform binary search on dictionaries lib/Search/Dict.t See if Search::Dict works lib/SelectSaver.pm Enforce proper select scoping lib/SelectSaver.t See if SelectSaver works -lib/SelfLoader/t/02SelfLoader-buggy.t See if SelfLoader works lib/SelfLoader.pm Load functions only on demand lib/SelfLoader/t/01SelfLoader.t See if SelfLoader works +lib/SelfLoader/t/02SelfLoader-buggy.t See if SelfLoader works lib/Shell.pm Make AUTOLOADed system() calls lib/Shell.t Tests for above lib/shellwords.pl Perl library to split into words with shell quoting @@ -2637,29 +2705,29 @@ lib/syslog.pl Perl library supporting syslogging lib/tainted.pl Old code for tainting lib/TAP/Base.pm A parser for Test Anything Protocol lib/TAP/Formatter/Color.pm A parser for Test Anything Protocol -lib/TAP/Formatter/Console.pm A parser for Test Anything Protocol lib/TAP/Formatter/Console/ParallelSession.pm A parser for Test Anything Protocol +lib/TAP/Formatter/Console.pm A parser for Test Anything Protocol lib/TAP/Formatter/Console/Session.pm A parser for Test Anything Protocol lib/TAP/Harness.pm A parser for Test Anything Protocol -lib/TAP/Parser.pm A parser for Test Anything Protocol lib/TAP/Parser/Aggregator.pm A parser for Test Anything Protocol lib/TAP/Parser/Grammar.pm A parser for Test Anything Protocol -lib/TAP/Parser/Iterator.pm A 
parser for Test Anything Protocol lib/TAP/Parser/Iterator/Array.pm A parser for Test Anything Protocol +lib/TAP/Parser/Iterator.pm A parser for Test Anything Protocol lib/TAP/Parser/Iterator/Process.pm A parser for Test Anything Protocol lib/TAP/Parser/Iterator/Stream.pm A parser for Test Anything Protocol lib/TAP/Parser/Multiplexer.pm A parser for Test Anything Protocol -lib/TAP/Parser/Result.pm A parser for Test Anything Protocol +lib/TAP/Parser.pm A parser for Test Anything Protocol lib/TAP/Parser/Result/Bailout.pm A parser for Test Anything Protocol lib/TAP/Parser/Result/Comment.pm A parser for Test Anything Protocol lib/TAP/Parser/Result/Plan.pm A parser for Test Anything Protocol +lib/TAP/Parser/Result.pm A parser for Test Anything Protocol lib/TAP/Parser/Result/Pragma.pm A parser for Test Anything Protocol lib/TAP/Parser/Result/Test.pm A parser for Test Anything Protocol lib/TAP/Parser/Result/Unknown.pm A parser for Test Anything Protocol lib/TAP/Parser/Result/Version.pm A parser for Test Anything Protocol lib/TAP/Parser/Result/YAML.pm A parser for Test Anything Protocol -lib/TAP/Parser/Source.pm A parser for Test Anything Protocol lib/TAP/Parser/Source/Perl.pm A parser for Test Anything Protocol +lib/TAP/Parser/Source.pm A parser for Test Anything Protocol lib/TAP/Parser/Utils.pm A parser for Test Anything Protocol lib/TAP/Parser/YAMLish/Reader.pm A parser for Test Anything Protocol lib/TAP/Parser/YAMLish/Writer.pm A parser for Test Anything Protocol @@ -2710,9 +2778,9 @@ lib/Test/Harness/t/nofork.t Test::Harness test lib/Test/Harness/t/parse.t Test::Harness test lib/Test/Harness/t/premature-bailout.t Test::Harness test lib/Test/Harness/t/process.t Test::Harness test -lib/Test/Harness/t/prove.t Test::Harness test lib/Test/Harness/t/proverc.t Test::Harness test lib/Test/Harness/t/proverun.t Test::Harness test +lib/Test/Harness/t/prove.t Test::Harness test lib/Test/Harness/t/regression.t Test::Harness test lib/Test/Harness/t/results.t Test::Harness test 
lib/Test/Harness/t/source.t Test::Harness test @@ -2724,8 +2792,8 @@ lib/Test/Harness/t/testargs.t Test::Harness test lib/Test/Harness/t/unicode.t Test::Harness test lib/Test/Harness/t/utils.t Test::Harness test lib/Test/Harness/t/yamlish-output.t Test::Harness test -lib/Test/Harness/t/yamlish-writer.t Test::Harness test lib/Test/Harness/t/yamlish.t Test::Harness test +lib/Test/Harness/t/yamlish-writer.t Test::Harness test lib/Test/More.pm More utilities for writing tests lib/Test.pm A simple framework for writing test scripts lib/Test/Simple/Changes Test::Simple changes @@ -3173,8 +3241,8 @@ parser.h parser object header patchlevel.h The current patch level of perl perlapi.c Perl API functions perlapi.h Perl API function declarations -perldtrace.d D script for Perl probes perl.c main() +perldtrace.d D script for Perl probes perl.h Global declarations perlio.c C code for PerlIO abstraction perlio.h PerlIO abstraction @@ -3563,6 +3631,7 @@ t/io/through.t See if pipe passes data intact t/io/utf8.t See if file seeking works t/japh/abigail.t Obscure tests t/lib/1_compile.t See if the various libraries and extensions compile +t/lib/App/Prove/Plugin/Dummy.pm Module for testing Test::Harness t/lib/Cname.pm Test charnames in regexes (op/pat.t) t/lib/common.pl Helper for lib/{warnings,feature}.t t/lib/commonsense.t See if configuration meets basic needs @@ -3583,7 +3652,9 @@ t/lib/compress/truncate.pl Compress::Zlib t/lib/compress/zlib-generic.pl Compress::Zlib t/lib/contains_pod.xr Pod-Parser test file t/lib/cygwin.t Builtin cygwin function tests -t/lib/App/Prove/Plugin/Dummy.pm Module for testing Test::Harness +t/lib/data/catme.1 Test data for Test::Harness +t/lib/data/proverc Test data for Test::Harness +t/lib/data/sample.yml Test data for Test::Harness t/lib/Devel/switchd.pm Module for t/run/switchd.t t/lib/Dev/Null.pm Module for testing Test::Harness t/lib/dprof/test1_t Perl code profiler tests @@ -3635,9 +3706,6 @@ t/lib/mypragma.t Test the example user pragma 
t/lib/NoFork.pm Module for testing Test::Harness t/lib/no_load.t Test that some modules don't load others t/lib/proxy_constant_subs.t Test that Proxy Constant Subs behave correctly -t/lib/data/catme.1 Test data for Test::Harness -t/lib/data/proverc Test data for Test::Harness -t/lib/data/sample.yml Test data for Test::Harness t/lib/sample-tests/bailout Test data for Test::Harness t/lib/sample-tests/bignum Test data for Test::Harness t/lib/sample-tests/bignum_many Test data for Test::Harness @@ -3673,10 +3741,10 @@ t/lib/sample-tests/simple Test data for Test::Harness t/lib/sample-tests/simple_fail Test data for Test::Harness t/lib/sample-tests/simple_yaml Test data for Test::Harness t/lib/sample-tests/skip Test data for Test::Harness -t/lib/sample-tests/skip_nomsg Test data for Test::Harness t/lib/sample-tests/skipall Test data for Test::Harness t/lib/sample-tests/skipall_nomsg Test data for Test::Harness t/lib/sample-tests/skipall_v13 Test data for Test::Harness +t/lib/sample-tests/skip_nomsg Test data for Test::Harness t/lib/sample-tests/space_after_plan Test data for Test::Harness t/lib/sample-tests/stdout_stderr Test data for Test::Harness t/lib/sample-tests/strict Test data for Test::Harness @@ -3704,8 +3772,8 @@ t/lib/strict/subs Tests of "use strict 'subs'" for strict.t t/lib/strict/vars Tests of "use strict 'vars'" for strict.t t/lib/Test/Simple/Catch.pm Utility module for testing Test::Simple t/lib/Test/Simple/sample_tests/death_in_eval.plx for exit.t -t/lib/Test/Simple/sample_tests/death_with_handler.plx for exit.t t/lib/Test/Simple/sample_tests/death.plx for exit.t +t/lib/Test/Simple/sample_tests/death_with_handler.plx for exit.t t/lib/Test/Simple/sample_tests/exit.plx for exit.t t/lib/Test/Simple/sample_tests/extras.plx for exit.t t/lib/Test/Simple/sample_tests/five_fail.plx for exit.t @@ -3791,11 +3859,11 @@ t/Module_Pluggable/20dodgy_files.t Module::Pluggable tests t/Module_Pluggable/21editor_junk.t Module::Pluggable tests 
t/Module_Pluggable/acme/Acme/MyTest/Plugin/Foo.pm Module::Pluggable tests t/Module_Pluggable/lib/Acme/MyTest/Plugin/Foo.pm Module::Pluggable tests -t/Module_Pluggable/lib/EditorJunk/Plugin/Foo.pm Module::Pluggable tests -t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm~ Module::Pluggable tests -t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm.swp Module::Pluggable tests t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm Module::Pluggable tests +t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm~ Module::Pluggable tests t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm.swo Module::Pluggable tests +t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm.swp Module::Pluggable tests +t/Module_Pluggable/lib/EditorJunk/Plugin/Foo.pm Module::Pluggable tests t/Module_Pluggable/lib/ExtTest/Plugin/Bar.plugin Module::Pluggable tests t/Module_Pluggable/lib/ExtTest/Plugin/Foo.plugin Module::Pluggable tests t/Module_Pluggable/lib/ExtTest/Plugin/Quux/Foo.plugin Module::Pluggable tests @@ -3880,8 +3948,8 @@ t/op/die_exit.t See if die and exit status interaction works t/op/die.t See if die works t/op/dor.t See if defined-or (//) works t/op/do.t See if subroutines work -t/op/each.t See if hash iterators work t/op/each_array.t See if array iterators work +t/op/each.t See if hash iterators work t/op/eval.t See if eval operator works t/op/exec.t See if exec, system and qx work t/op/exists_sub.t See if exists(&sub) works @@ -4095,6 +4163,7 @@ t/uni/latin2.t See if Unicode in latin2 works t/uni/lower.t See if Unicode casing works t/uni/overload.t See if Unicode overloading works t/uni/sprintf.t See if Unicode sprintf works +t/uni/tie.t See if Unicode tie works t/uni/title.t See if Unicode casing works t/uni/tr_7jis.t See if Unicode tr/// in 7jis works t/uni/tr_eucjp.t See if Unicode tr/// in eucjp works @@ -4104,7 +4173,6 @@ t/uni/upper.t See if Unicode casing works t/uni/write.t See if Unicode formats work t/win32/system.t See if system works in Win* t/win32/system_tests Test runner for system.t 
-t/uni/tie.t See if Unicode tie works t/x2p/s2p.t See if s2p/psed work uconfig.h Configuration header for microperl uconfig.sh Configuration script for microperl diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl index 5b7486f..5cae67d 100644 --- a/Porting/Maintainers.pl +++ b/Porting/Maintainers.pl @@ -418,6 +418,20 @@ package Maintainers; 'CPAN' => 1, }, + 'HTML::Parser' => + { + 'MAINTAINER' => 'gaas', + 'FILES' => q[ext/HTML/Parser], + 'CPAN' => 1, + }, + + 'HTML::Tagset' => + { + 'MAINTAINER' => 'petdance', + 'FILES' => q[lib/HTML/Tagset.pm lib/HTML/Tagset], + 'CPAN' => 1, + }, + 'I18N::LangTags' => { 'MAINTAINER' => 'sburke', diff --git a/ext/HTML/Parser/Makefile.PL b/ext/HTML/Parser/Makefile.PL new file mode 100644 index 0000000..79081f7 --- /dev/null +++ b/ext/HTML/Parser/Makefile.PL @@ -0,0 +1,30 @@ +require 5.006; +use strict; +use ExtUtils::MakeMaker; + +WriteMakefile( + NAME => 'HTML::Parser', + VERSION_FROM => 'Parser.pm', + H => [ "hparser.h", "hctype.h", "tokenpos.h", "pfunc.h", + "hparser.c", "util.c", + ], + PREREQ_PM => { + 'HTML::Tagset' => 3, + 'Test::More' => 0, # only needed to run 'make test' + }, + DEFINE => "-DMARKED_SECTION", + dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, + clean => { FILES => 'hctype.h pfunc.h' }, +); + + +sub MY::postamble +{ + ' +pfunc.h : mkpfunc + $(PERL) mkpfunc >pfunc.h + +hctype.h : mkhctype + $(PERL) mkhctype >hctype.h +' +} diff --git a/ext/HTML/Parser/Parser.pm b/ext/HTML/Parser/Parser.pm new file mode 100644 index 0000000..72d5a98 --- /dev/null +++ b/ext/HTML/Parser/Parser.pm @@ -0,0 +1,1233 @@ +package HTML::Parser; + +# Copyright 1996-2007, Gisle Aas. +# Copyright 1999-2000, Michael A. Chase. +# +# This library is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. 
+ +use strict; +use vars qw($VERSION @ISA); + +$VERSION = '3.56'; # $Date: 2007/01/12 09:18:31 $ + +require HTML::Entities; + +require XSLoader; +XSLoader::load('HTML::Parser', $VERSION); + +sub new +{ + my $class = shift; + my $self = bless {}, $class; + return $self->init(@_); +} + + +sub init +{ + my $self = shift; + $self->_alloc_pstate; + + my %arg = @_; + my $api_version = delete $arg{api_version} || (@_ ? 3 : 2); + if ($api_version >= 4) { + require Carp; + Carp::croak("API version $api_version not supported " . + "by HTML::Parser $VERSION"); + } + + if ($api_version < 3) { + # Set up method callbacks compatible with HTML-Parser-2.xx + $self->handler(text => "text", "self,text,is_cdata"); + $self->handler(end => "end", "self,tagname,text"); + $self->handler(process => "process", "self,token0,text"); + $self->handler(start => "start", + "self,tagname,attr,attrseq,text"); + + $self->handler(comment => + sub { + my($self, $tokens) = @_; + for (@$tokens) { + $self->comment($_); + } + }, "self,tokens"); + + $self->handler(declaration => + sub { + my $self = shift; + $self->declaration(substr($_[0], 2, -1)); + }, "self,text"); + } + + if (my $h = delete $arg{handlers}) { + $h = {@$h} if ref($h) eq "ARRAY"; + while (my($event, $cb) = each %$h) { + $self->handler($event => @$cb); + } + } + + # In the end we try to assume plain attribute or handler + while (my($option, $val) = each %arg) { + if ($option =~ /^(\w+)_h$/) { + $self->handler($1 => @$val); + } + elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) { + require Carp; + Carp::croak("Bad constructor option '$option'"); + } + else { + $self->$option($val); + } + } + + return $self; +} + + +sub parse_file +{ + my($self, $file) = @_; + my $opened; + if (!ref($file) && ref(\$file) ne "GLOB") { + # Assume $file is a filename + local(*F); + open(F, $file) || return undef; + binmode(F); # should we? 
good for byte counts + $opened++; + $file = *F; + } + my $chunk = ''; + while (read($file, $chunk, 512)) { + $self->parse($chunk) || last; + } + close($file) if $opened; + $self->eof; +} + + +sub netscape_buggy_comment # legacy +{ + my $self = shift; + require Carp; + Carp::carp("netscape_buggy_comment() is deprecated. " . + "Please use the strict_comment() method instead"); + my $old = !$self->strict_comment; + $self->strict_comment(!shift) if @_; + return $old; +} + +# set up method stubs +sub text { } +*start = \&text; +*end = \&text; +*comment = \&text; +*declaration = \&text; +*process = \&text; + +1; + +__END__ + + +=head1 NAME + +HTML::Parser - HTML parser class + +=head1 SYNOPSIS + + use HTML::Parser (); + + # Create parser object + $p = HTML::Parser->new( api_version => 3, + start_h => [\&start, "tagname, attr"], + end_h => [\&end, "tagname"], + marked_sections => 1, + ); + + # Parse document text chunk by chunk + $p->parse($chunk1); + $p->parse($chunk2); + #... + $p->eof; # signal end of document + + # Parse directly from file + $p->parse_file("foo.html"); + # or + open(my $fh, "<:utf8", "foo.html") || die; + $p->parse_file($fh); + +=head1 DESCRIPTION + +Objects of the C class will recognize markup and +separate it from plain text (alias data content) in HTML +documents. As different kinds of markup and text are recognized, the +corresponding event handlers are invoked. + +C is not a generic SGML parser. We have tried to +make it able to deal with the HTML that is actually "out there", and +it normally parses as closely as possible to the way the popular web +browsers do it instead of strictly following one of the many HTML +specifications from W3C. Where there is disagreement, there is often +an option that you can enable to get the official behaviour. + +The document to be parsed may be supplied in arbitrary chunks. This +makes on-the-fly parsing as documents are received from the network +possible. 
+ +If event driven parsing does not feel right for your application, you +might want to use C. This is an C +subclass that allows a more conventional program structure. + + +=head1 METHODS + +The following method is used to construct a new C object: + +=over + +=item $p = HTML::Parser->new( %options_and_handlers ) + +This class method creates a new C object and +returns it. Key/value argument pairs may be provided to assign event +handlers or initialize parser options. The handlers and parser +options can also be set or modified later by the method calls described below. + +If a top level key is in the form "_h" (e.g., "text_h") then it +assigns a handler to that event, otherwise it initializes a parser +option. The event handler specification value must be an array +reference. Multiple handlers may also be assigned with the 'handlers +=> [%handlers]' option. See examples below. + +If new() is called without any arguments, it will create a parser that +uses callback methods compatible with version 2 of C. +See the section on "version 2 compatibility" below for details. + +The special constructor option 'api_version => 2' can be used to +initialize version 2 callbacks while still setting other options and +handlers. The 'api_version => 3' option can be used if you don't want +to set any options and don't want to fall back to v2 compatible +mode. + +Examples: + + $p = HTML::Parser->new(api_version => 3, + text_h => [ sub {...}, "dtext" ]); + +This creates a new parser object with a text event handler subroutine +that receives the original text with general entities decoded. + + $p = HTML::Parser->new(api_version => 3, + start_h => [ 'my_start', "self,tokens" ]); + +This creates a new parser object with a start event handler method +that receives the $p and the tokens array. 
+ + $p = HTML::Parser->new(api_version => 3, + handlers => { text => [\@array, "event,text"], + comment => [\@array, "event,text"], + }); + +This creates a new parser object that stores the event type and the +original text in @array for text and comment events. + +=back + +The following methods feed the HTML document +to the C object: + +=over + +=item $p->parse( $string ) + +Parse $string as the next chunk of the HTML document. The return +value is normally a reference to the parser object (i.e. $p). +Handlers invoked should not attempt to modify the $string in-place until +$p->parse returns. + +If an invoked event handler aborts parsing by calling $p->eof, then +$p->parse() will return a FALSE value. + +=item $p->parse( $code_ref ) + +If a code reference is passed as the argument to be parsed, then the +chunks to be parsed are obtained by invoking this function repeatedly. +Parsing continues until the function returns an empty (or undefined) +result. When this happens $p->eof is automatically signaled. + +Parsing will also abort if one of the event handlers calls $p->eof. + +The effect of this is the same as: + + while (1) { + my $chunk = &$code_ref(); + if (!defined($chunk) || !length($chunk)) { + $p->eof; + return $p; + } + $p->parse($chunk) || return undef; + } + +But it is more efficient as this loop runs internally in XS code. + +=item $p->parse_file( $file ) + +Parse text directly from a file. The $file argument can be a +filename, an open file handle, or a reference to an open file +handle. + +If $file contains a filename and the file can't be opened, then the +method returns an undefined value and $! tells why it failed. +Otherwise the return value is a reference to the parser object. + +If a file handle is passed as the $file argument, then the file will +normally be read until EOF, but not closed. + +If an invoked event handler aborts parsing by calling $p->eof, +then $p->parse_file() may not have read the entire file. 
+ +On systems with multi-byte line terminators, the values passed for the +offset and length argspecs may be too low if parse_file() is called on +a file handle that is not in binary mode. + +If a filename is passed in, then parse_file() will open the file in +binary mode. + +=item $p->eof + +Signals the end of the HTML document. Calling the $p->eof method +outside a handler callback will flush any remaining buffered text +(which triggers the C event if there is any remaining text). + +Calling $p->eof inside a handler will terminate parsing at that point +and cause $p->parse to return a FALSE value. This also terminates +parsing by $p->parse_file(). + +After $p->eof has been called, the parse() and parse_file() methods +can be invoked to feed new documents with the parser object. + +The return value from eof() is a reference to the parser object. + +=back + + +Most parser options are controlled by boolean attributes. +Each boolean attribute is enabled by calling the corresponding method +with a TRUE argument and disabled with a FALSE argument. The +attribute value is left unchanged if no argument is given. The return +value from each method is the old attribute value. + +Methods that can be used to get and/or set parser options are: + +=over + +=item $p->attr_encoded + +=item $p->attr_encoded( $bool ) + +By default, the C and C<@attr> argspecs will have general +entities for attribute values decoded. Enabling this attribute leaves +entities alone. + +=item $p->boolean_attribute_value( $val ) + +This method sets the value reported for boolean attributes inside HTML +start tags. By default, the name of the attribute is also used as its +value. This affects the values reported for C and C +argspecs. + +=item $p->case_sensitive + +=item $p->case_sensitive( $bool ) + +By default, tagnames and attribute names are down-cased. Enabling this +attribute leaves them as found in the HTML source document. 
+ +=item $p->closing_plaintext + +=item $p->closing_plaintext( $bool ) + +By default, "plaintext" element can never be closed. Everything up to +the end of the document is parsed in CDATA mode. This historical +behaviour is what at least MSIE does. Enabling this attribute makes +closing "</plaintext>" tag effective and the parsing process will resume +after seeing this tag. This emulates gecko-based browsers. + +=item $p->empty_element_tags + +=item $p->empty_element_tags( $bool ) + +By default, empty element tags are not recognized as such and the "/" +before ">" is just treated like a normal name character (unless +C<strict_names> is enabled). Enabling this attribute makes +C<HTML::Parser> recognize these tags. + +Empty element tags look like start tags, but end with the character +sequence "/>" instead of ">". When recognized by C<HTML::Parser> they +cause an artificial end event in addition to the start event. The +C<text> for the artificial end event will be empty and the C<tokenpos> +array will be undefined even though the token array will have one +element containing the tag name. + +=item $p->marked_sections + +=item $p->marked_sections( $bool ) + +By default, section markings like <![CDATA[...]]> are treated like +ordinary text. When this attribute is enabled section markings are +honoured. + +There are currently no events associated with the marked section +markup, but the text can be returned as C<skipped_text>. + +=item $p->strict_comment + +=item $p->strict_comment( $bool ) + +By default, comments are terminated by the first occurrence of "-->". +This is the behaviour of most popular browsers (like Mozilla, Opera and +MSIE), but it is not correct according to the official HTML +standard. Officially, you need an even number of "--" tokens before +the closing ">" is recognized and there may not be anything but +whitespace between an even and an odd "--". + +The official behaviour is enabled by enabling this attribute.
+ +Enabling of 'strict_comment' also disables recognizing these forms as +comments: + + + + + +=item $p->strict_end + +=item $p->strict_end( $bool ) + +By default, attributes and other junk are allowed to be present on end tags in a +manner that emulates MSIE's behaviour. + +The official behaviour is enabled with this attribute. If enabled, +only whitespace is allowed between the tagname and the final ">". + +=item $p->strict_names + +=item $p->strict_names( $bool ) + +By default, almost anything is allowed in tag and attribute names. +This is the behaviour of most popular browsers and allows us to parse +some broken tags with invalid attribute values like: + + [PREV + +By default, "LIST]" is parsed as a boolean attribute, not as +part of the ALT value as was clearly intended. This is also what +Mozilla sees. + +The official behaviour is enabled by enabling this attribute. If +enabled, it will cause the tag above to be reported as text +since "LIST]" is not a legal attribute name. + +=item $p->unbroken_text + +=item $p->unbroken_text( $bool ) + +By default, blocks of text are given to the text handler as soon as +possible (but the parser takes care always to break text at a +boundary between whitespace and non-whitespace so single words and +entities can always be decoded safely). This might create breaks that +make it hard to do transformations on the text. When this attribute is +enabled, blocks of text are always reported in one piece. This will +delay the text event until the following (non-text) event has been +recognized by the parser. + +Note that the C argspec will give you the offset of the first +segment of text and C is the combined length of the segments. +Since there might be ignored tags in between, these numbers can't be +used to directly index in the original document file. + +=item $p->utf8_mode + +=item $p->utf8_mode( $bool ) + +Enable this option when parsing raw undecoded UTF-8. 
This tells the +parser that the entities expanded for strings reported by C<attr>, +C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end +up compatible with the surrounding text. + +If C<utf8_mode> is enabled then it is an error to pass strings +containing characters with code above 255 to the parse() method, and +the parse() method will croak if you try. + +Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 +encoded. The character can also be represented by the entity +"&hearts;" or "&#x2665;". If we feed the parser: + + $p->parse("\xE2\x99\xA5&hearts;"); + +then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without +C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. +The latter string is what you want. + +This option is only available with perl-5.8 or better. + +=item $p->xml_mode + +=item $p->xml_mode( $bool ) + +Enabling this attribute changes the parser to allow some XML +constructs. This enables the behaviour controlled individually by +the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and +C<xml_pic> attributes and also suppresses special treatment of +elements that are parsed as CDATA for HTML. + +=item $p->xml_pic + +=item $p->xml_pic( $bool ) + +By default, I<processing instructions> are terminated by ">". When +this attribute is enabled, processing instructions are terminated by +"?>" instead. + +=back + +As markup and text is recognized, handlers are invoked. The following +method is used to set up handlers for different events: + +=over + +=item $p->handler( event => \&subroutine, $argspec ) + +=item $p->handler( event => $method_name, $argspec ) + +=item $p->handler( event => \@accum, $argspec ) + +=item $p->handler( event => "" ); + +=item $p->handler( event => undef ); + +=item $p->handler( event ); + +This method assigns a subroutine, method, or array to handle an event. + +Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, +C<process>, C<start_document>, C<end_document> or C<default>. + +The C<\&subroutine> is a reference to a subroutine which is called to handle +the event. + +The C<$method_name> is the name of a method of $p which is called to handle +the event. 
+ +The C<@accum> is an array that will hold the event information as +sub-arrays. + +If the second argument is "", the event is ignored. +If it is undef, the default handler is invoked for the event. + +The C<$argspec> is a string that describes the information to be reported +for the event. Any requested information that does not apply to a +specific event is passed as C. If argspec is omitted, then it +is left unchanged. + +The return value from $p->handler is the old callback routine or a +reference to the accumulator array. + +Any return values from handler callback routines/methods are always +ignored. A handler callback can request parsing to be aborted by +invoking the $p->eof method. A handler callback is not allowed to +invoke the $p->parse() or $p->parse_file() method. An exception will +be raised if it tries. + +Examples: + + $p->handler(start => "start", 'self, attr, attrseq, text' ); + +This causes the "start" method of object $p to be called for 'start' events. +The callback signature is $p->start(\%attr, \@attr_seq, $text). + + $p->handler(start => \&start, 'attr, attrseq, text' ); + +This causes subroutine start() to be called for 'start' events. +The callback signature is start(\%attr, \@attr_seq, $text). + + $p->handler(start => \@accum, '"S", attr, attrseq, text' ); + +This causes 'start' event information to be saved in @accum. +The array elements will be ['S', \%attr, \@attr_seq, $text]. + + $p->handler(start => ""); + +This causes 'start' events to be ignored. It also suppresses +invocations of any default handler for start events. It is in most +cases equivalent to $p->handler(start => sub {}), but is more +efficient. It is different from the empty-sub-handler in that +C is not reset by it. + + $p->handler(start => undef); + +This causes no handler to be associated with start events. +If there is a default handler it will be invoked. + +=back + +Filters based on tags can be set up to limit the number of events +reported. 
The main bottleneck during parsing is often the huge number +of callbacks made from the parser. Applying filters can improve +performance significantly. + +The following methods control filters: + +=over + +=item $p->ignore_elements( @tags ) + +Both the C event and the C event as well as any events that +would be reported in between are suppressed. The ignored elements can +contain nested occurrences of itself. Example: + + $p->ignore_elements(qw(script style)); + +The C + +å +EOT + +$p->parse($doc)->eof; + +is($text, $doc); +is($dtext, <<"EOT"); +å +ååAAAA + +foo\240bar +foo\240bar +&xyzzy +&xyzzy; + +\1 +\377 +\377 +\377G + +� +� +& +&# +&#x +&aring + + +å +EOT diff --git a/ext/HTML/Parser/t/entities.t b/ext/HTML/Parser/t/entities.t new file mode 100644 index 0000000..b8342f5 --- /dev/null +++ b/ext/HTML/Parser/t/entities.t @@ -0,0 +1,193 @@ +use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric); + +use Test::More tests => 12; + +$a = "Våre norske tegn bør æres"; + +decode_entities($a); + +is($a, "Våre norske tegn bør æres"); + +encode_entities($a); + +is($a, "Våre norske tegn bør æres"); + +decode_entities($a); +encode_entities_numeric($a); + +is($a, "Våre norske tegn bør æres"); + +$a = "<&>\"'"; +is(encode_entities($a), "<&>"'"); +is(encode_entities_numeric($a), "<&>"'"); + +$a = "abcdef"; +is(encode_entities($a, 'a-c'), "abcdef"); + + +# See how well it does against rfc1866... +$ent = $plain = ""; +while () { + next unless /^\s* +# Subject: HTML entities problem with 5.11 +# To: libwww-perl@ics.uci.edu +# Date: Fri, 05 Sep 1997 16:56:55 +1000 +# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU> +# +# Hi. I've got a problem that has surfaced with the changes to +# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening +# in the process of encoding then decoding special entities. Eg, what goes +# in as "abc&def&ghi" comes out as "abc&def;&ghi;". 
+ +is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;"); + +# Decoding of ' +is(decode_entities("'"), "'"); +is(encode_entities("'", "'"), "'"); + + +__END__ +# Quoted from rfc1866.txt + +14. Proposed Entities + + The HTML DTD references the "Added Latin 1" entity set, which only + supplies named entities for a subset of the non-ASCII characters in + [ISO-8859-1], namely the accented characters. The following entities + should be supported so that all ISO 8859-1 characters may only be + referenced symbolically. The names for these entities are taken from + the appendixes of [SGML]. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Berners-Lee & Connolly Standards Track [Page 75] + +RFC 1866 Hypertext Markup Language - 2.0 November 1995 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Berners-Lee & Connolly Standards Track [Page 76] + +RFC 1866 Hypertext Markup Language - 2.0 November 1995 + + + + + + + + + + + + + + + diff --git a/ext/HTML/Parser/t/entities2.t b/ext/HTML/Parser/t/entities2.t new file mode 100644 index 0000000..7840c71 --- /dev/null +++ b/ext/HTML/Parser/t/entities2.t @@ -0,0 +1,57 @@ +#!perl -w + +use strict; +use Test::More tests => 9; + +use HTML::Entities qw(_decode_entities); + +eval { + _decode_entities("<", undef); +}; +like($@, qr/^Can't inline decode readonly string/); + +eval { + my $a = ""; + _decode_entities($a, $a); +}; +like($@, qr/^2nd argument must be hash reference/); + +eval { + my $a = ""; + _decode_entities($a, []); +}; +like($@, qr/^2nd argument must be hash reference/); + +$a = "<"; +_decode_entities($a, undef); +is($a, "<"); + +_decode_entities($a, { "lt" => "<" }); +is($a, "<"); + +my $x = "x" x 20; + +my $err; +for (":", ":a", "a:", "a:a", "a:a:a", "a:::a") { + my $a = $_; + $a =~ s/:/&a;/g; + my $b = $_; + $b =~ s/:/$x/g; + _decode_entities($a, { "a" => $x }); + if ($a ne $b) { + diag "Something went wrong with '$_'"; + 
$err++; + } +} +ok(!$err); + +$a = "foo bar"; +_decode_entities($a, \%HTML::Entities::entity2char); +is($a, "foo\xA0bar"); + +$a = "foo bar"; +_decode_entities($a, \%HTML::Entities::entity2char); +is($a, "foo bar"); + +_decode_entities($a, \%HTML::Entities::entity2char, 1); +is($a, "foo\xA0bar"); diff --git a/ext/HTML/Parser/t/filter-methods.t b/ext/HTML/Parser/t/filter-methods.t new file mode 100644 index 0000000..9eccaf1 --- /dev/null +++ b/ext/HTML/Parser/t/filter-methods.t @@ -0,0 +1,205 @@ +#!/usr/bin/perl -w + +use Test::More tests => 12; +use strict; + +use HTML::Parser; + +my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)]); +$p->ignore_elements("script"); +$p->unbroken_text(1); + +$p->handler(default => [], "event, text"); +$p->parse(<<"EOT")->eof; +foo +This is an italic and bold text. + + +EOT + +my $t = join("||", map join("|", @$_), @{$p->handler("default")}); +#diag $t; + +is($t, "start_document|||start|||start|||start|||text|foo||end|||start|||text| +This is an italic and bold text. +||end|||text| +||end|||text| +||end_document|", 'ignore_elements'); + + +#------------------------------------------------------ + +$p = HTML::Parser->new(api_version => 3); +$p->report_tags("a"); +$p->handler(start => sub { + my($tagname, %attr) = @_; + ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start'); + }, 'tagname, @attr'); +$p->handler(end => sub { + my $tagname = shift; + is($tagname, "a", 'report_tags end'); + }, 'tagname'); + +$p->parse(<eof; + +

Next example

+ +This is very nice example. + +EOT + + +#------------------------------------------------------ + +my @tags; +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(a em)); +$p->ignore_tags(qw(em)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<eof; + +

Next example

+ +This is yet another very nice example. + +EOT +is(join('|', @tags), 'a', 'report_tags followed by ignore_tags'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1)); +$p->report_tags(); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<eof; + +

Next example

+

Next example

+ +EOT +is(join('|', @tags), 'h1|h2', 'reset report_tags filter'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1 h2)); +$p->ignore_tags(qw(h2)); +$p->report_tags(qw(h1 h2)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<eof; + +

Next example

+

Next example

+ +EOT +is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1 h2)); +$p->ignore_tags(qw(h2)); +$p->report_tags(); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<eof; + +

Next example

+

Next example

+ +EOT +is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1 h2)); +$p->report_tags(qw(h3)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<eof; + +

Next example

+

Next example

+

Next example

+ +EOT +is(join('|', @tags), 'h3', 'report_tags replaces filter'); + + +#------------------------------------------------------ + + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->ignore_tags(qw(h1 h2)); +$p->ignore_tags(qw(h3)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<eof; + +

Next example

+

Next example

+

Next example

+ +EOT +is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->ignore_tags(qw(h2)); +$p->ignore_tags(); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<eof; + +

Next example

+

Next example

+ +EOT +is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->ignore_tags(qw(h2)); +$p->report_tags(qw(h1 h2)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<eof; + +

Next example

+

Next example

+ +EOT +is(join('|', @tags), 'h1', 'ignore_tags before report_tags'); +#------------------------------------------------------ + +$p = HTML::Parser->new(api_version => 3); +$p->ignore_elements("script"); +my $res=""; +$p->handler(default=> sub {$res.=$_[0];}, 'text'); +$p->parse(<<'EOT')->eof; +A C D F +EOT +is($res,"A C D F\n","ignore without " + ignore this + + + + + + + +Dette er vanlig tekst. Denne teksten definerer også slutten på +<head> delen av dokumentet. + +" + ignore this too + + + + + +Dette er også vanlig tekst som ikke skal blir parset i det hele tatt. + +EOT + +$| = 1; + +#$HTML::HeadParser::DEBUG = 1; +require HTML::HeadParser; +my $p = HTML::HeadParser->new( H->new ); + +if ($p->parse($HTML)) { + fail("Need more data which should not happen"); +} else { + #diag $p->as_string; + pass(); +} + +like($p->header('Title'), qr/Å være eller å ikke være/); +is($p->header('Expires'), 'Soon'); +is($p->header('Content-Base'), 'http://www.sn.no'); +like($p->header('Link'), qr//); + +# This header should not be present because the head ended +ok(!$p->header('Isindex')); + + +# Try feeding one char at a time +my $expected = $p->as_string; +my $nl = 1; +$p = HTML::HeadParser->new(H->new); +while ($HTML =~ /(.)/sg) { + #print STDERR '#' if $nl; + #print STDERR $1; + $nl = $1 eq "\n"; + $p->parse($1) or last; +} +is($p->as_string, $expected); + + +# Try reading it from a file +my $file = "hptest$$.html"; +die "$file already exists" if -e $file; + +open(FILE, ">$file") or die "Can't create $file: $!"; +binmode(FILE); +print FILE $HTML; +print FILE "

This is more content...

\n" x 2000; +print FILE "Buuuh!\n" x 200; +close FILE or die "Can't close $file: $!"; + +$p = HTML::HeadParser->new(H->new); +$p->parse_file($file); +unlink($file) or warn "Can't unlink $file: $!"; + +is($p->header("Title"), "Å være eller å ikke være"); + + +# We got into an infinite loop on data without tags and no EOL. +# This was actually a HTML::Parser bug. +open(FILE, ">$file") or die "Can't create $file: $!"; +print FILE "Foo"; +close(FILE); + +$p = HTML::HeadParser->new(H->new); +$p->parse_file($file); +unlink($file) or warn "Can't unlink $file: $!"; + +ok(!$p->as_string); + +SKIP: { + skip "Need Unicode support", 2 if $] < 5.008; + + # Test that the Unicode BOM does not confuse us? + $p = HTML::HeadParser->new(H->new); + ok($p->parse("\x{FEFF}\nHi <foo>")); + $p->eof; + + is($p->header("title"), "Hi "); +} diff --git a/ext/HTML/Parser/t/ignore.t b/ext/HTML/Parser/t/ignore.t new file mode 100644 index 0000000..008739e --- /dev/null +++ b/ext/HTML/Parser/t/ignore.t @@ -0,0 +1,27 @@ + +use Test::More tests => 4; + +use strict; +use HTML::Parser (); + +my $html = 'text'; + +my $text = ''; +my $p = HTML::Parser->new(default_h => [sub {$text .= shift;}, 'text']); +$p->parse($html)->eof; +is($text, $html); + +$text = ''; +$p->handler(start => ""); +$p->parse($html)->eof; +is($text, 'text'); + +$text = ''; +$p->handler(end => 0); +$p->parse($html)->eof; +is($text, 'text'); + +$text = ''; +$p->handler(start => undef); +$p->parse($html)->eof; +is($text, 'text'); diff --git a/ext/HTML/Parser/t/largetags.t b/ext/HTML/Parser/t/largetags.t new file mode 100644 index 0000000..a9ed3ff --- /dev/null +++ b/ext/HTML/Parser/t/largetags.t @@ -0,0 +1,38 @@ +# Exercise the tokenpos buffer allocation routines by feeding it +# very large tags. 
+ +use Test::More tests => 2; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3); + +$p->handler("start" => + sub { + my $tp = shift; + #diag int(@$tp), " - ", join(", ", @$tp); + is(@$tp, 2 + 26 * 6 * 4); + }, "tokenpos"); + +$p->handler("declaration" => + sub { + my $t = shift; + #diag int(@$t), " - @$t"; + is(@$t, 26 * 6 * 2 + 1); + }, "tokens"); + +$p->parse("parse("$_=1 "); +} +$p->parse(">"); + +$p->parse("parse("$_ -- $_ -- "); +} +$p->parse(">"); +$p->eof; +exit; + diff --git a/ext/HTML/Parser/t/linkextor-base.t b/ext/HTML/Parser/t/linkextor-base.t new file mode 100644 index 0000000..7ef8f02 --- /dev/null +++ b/ext/HTML/Parser/t/linkextor-base.t @@ -0,0 +1,41 @@ +# This test that HTML::LinkExtor really absolutize links correctly +# when a base URL is given to the constructor. + +use Test::More tests => 5; +require HTML::LinkExtor; + +SKIP: { +eval { + require URI; +}; +skip $@, 5 if $@; + +# Try with base URL and the $p->links interface. +$p = HTML::LinkExtor->new(undef, "http://www.sn.no/foo/foo.html"); +$p->parse(<eof; + + + + + +This is link and an Image. +HTML + +@p = $p->links; + +# There should be 4 links in the document +is(@p, 4); + +for (@p) { + ($t, %attr) = @$_ if $_->[0] eq 'img'; +} + +is($t, 'img'); + +is(delete $attr{src}, "http://www.sn.no/foo/img.jpg"); + +is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif"); + +ok(!scalar(keys %attr)); # there should be no more attributes +} diff --git a/ext/HTML/Parser/t/linkextor-rel.t b/ext/HTML/Parser/t/linkextor-rel.t new file mode 100644 index 0000000..1190a96 --- /dev/null +++ b/ext/HTML/Parser/t/linkextor-rel.t @@ -0,0 +1,36 @@ +use Test::More tests => 4; + +require HTML::LinkExtor; + +$HTML = < + + + + +This is link and an Image. 
+HTML + + +# Try the callback interface +$links = ""; +$p = HTML::LinkExtor->new( + sub { + my($tag, %links) = @_; + #diag "$tag @{[%links]}"; + $links .= "$tag @{[%links]}\n"; + }); + +$p->parse($HTML); $p->eof; + +ok($links =~ m|^base href http://www\.sn\.no/$|m); +ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m); +ok($links =~ m|^a href link\.html$|m); + +# Used to be problems when using the links method on a document with +# no links it it. This is a test to prove that it works. +$p = new HTML::LinkExtor; +$p->parse("this is a document with no links"); $p->eof; +@a = $p->links; +is(@a, 0); diff --git a/ext/HTML/Parser/t/magic.t b/ext/HTML/Parser/t/magic.t new file mode 100644 index 0000000..366f275 --- /dev/null +++ b/ext/HTML/Parser/t/magic.t @@ -0,0 +1,41 @@ +# Check that the magic signature at the top of struct p_state works and that we +# catch modifications to _hparser_xs_state gracefully + +use Test::More tests => 5; + +use HTML::Parser; + +$p = HTML::Parser->new(api_version => 3); + +$p->xml_mode(1); + +# We should not be able to simply modify this stuff +eval { + ${$p->{_hparser_xs_state}} += 4; +}; +like($@, qr/^Modification of a read-only value attempted/); + + +my $x = delete $p->{_hparser_xs_state}; + +eval { + $p->xml_mode(1); +}; +like($@, qr/^Can't find '_hparser_xs_state'/); + +$p->{_hparser_xs_state} = \($$x + 16); + +eval { + $p->xml_mode(1); +}; +like($@, $] >= 5.008 ? 
qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/); + +$p->{_hparser_xs_state} = 33; +eval { + $p->xml_mode(1); +}; +like($@, qr/^_hparser_xs_state element is not a reference/); + +$p->{_hparser_xs_state} = $x; + +ok($p->xml_mode(0)); diff --git a/ext/HTML/Parser/t/marked-sect.t b/ext/HTML/Parser/t/marked-sect.t new file mode 100644 index 0000000..6a63478 --- /dev/null +++ b/ext/HTML/Parser/t/marked-sect.t @@ -0,0 +1,121 @@ +#!/usr/bin/perl -w + +use strict; +my $tag; +my $text; + +use HTML::Parser (); +my $p = HTML::Parser->new(start_h => [sub { $tag = shift }, "tagname"], + text_h => [sub { $text .= shift }, "dtext"], + ); + + +use Test::More tests => 14; + +SKIP: { +eval { + $p->marked_sections(1); +}; +skip $@, 14 if $@; + +$p->parse(""); +is($text, "foo"); + +$p->parse(""); +is($text, "foobar"); + +$p->parse("]]>\n
"); +is($text, "foobarfoo\n"); + +$text = ""; +$p->parse("parse(",bar>]]>
"); +is($text, "<foo]]>"); + +$text = ""; +$p->parse("]]>]]>å
"); +is($text, "å
åå"); +is($tag, "br"); + +$text = ""; +$p->parse("]]>
"); +is($text, ""); + +$text = ""; +$p->parse("]]>
"); +is($text, "fooå
"); + +$text = ""; +$p->parse("]]>
"); +is($text, "fooå
"); + +$text = ""; +$p->parse("]]>
"); +is($text, "fooå"); + +$text = ""; +$p->parse("]]>
"); +is($text, "fooå"); + +# offsets/line/column numbers +$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"], + marked_sections => 1, + ); +$p->parse(<<'EOT')->eof; +Test + +]]> + +

Test

+EOT + +my @x; +sub x { + my($line, $col, $offset, $event, $text) = @_; + $text =~ s/\n/\\n/g; + $text =~ s/ /./g; + push(@x, "$line.$col:$offset $event \"$text\"\n"); +} + +#diag @x; +is(join("", @x), <<'EOT'); +1.0:0 start_document "" +1.0:0 start "" +1.7:7 text "Test" +1.11:11 end "" +1.19:19 text "\n" +3.3:32 text "fooå
\n" +4.3:49 text "\n" +5.4:54 text "\nINCLUDE\nSTUFF\n" +8.3:72 text "\n.." +9.2:75 start "

" +9.6:79 text "Test" +9.10:83 end "

" +9.15:88 text "\n" +10.0:89 end_document "" +EOT + +my $doc = ""; +my $result = ""; +$p = HTML::Parser->new( + marked_sections => 1, + handlers => { + default => [ sub { $result .= join("",@_); }, "skipped_text,text" ] + } +)->parse($doc)->eof; +is($doc, $result); + +$text = ""; +$p = HTML::Parser->new( + text_h => [sub { $text .= shift }, "dtext"], + marked_sections => 1, +); + +$p->parse(""); +is($text, "foo [1]", "CDATA text ending in square bracket"); + +} # SKIP diff --git a/ext/HTML/Parser/t/msie-compat.t b/ext/HTML/Parser/t/msie-compat.t new file mode 100644 index 0000000..90d4b7e --- /dev/null +++ b/ext/HTML/Parser/t/msie-compat.t @@ -0,0 +1,58 @@ +#!perl -w + +use strict; +use HTML::Parser; + +use Test::More tests => 2; + +my $TEXT = ""; +sub h +{ + my($event, $tagname, $text) = @_; + for ($event, $tagname, $text) { + if (defined) { + s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge; + } + else { + $_ = ""; + } + } + + $TEXT .= "[$event,$tagname,$text]\n"; +} + +my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"]); +$p->parse("
"); +$p->parse(""); +$p->parse("' 'bar>' x>"); +$p->parse("\""); +$p->parse(" \"bar>\" x>"); +$p->parse(""); +$p->parse("\" >"); +$p->parse(" +xmp + +EOT + +my $p = HTML::Parser->new(api_version => 3); + +my $sum_len = 0; +my $count = 0; +my $err; + +$p->handler(default => + sub { + my($offset, $length, $offset_end, $line, $col, $text) = @_; + my $copy = $text; + $copy =~ s/\n/\\n/g; + substr($copy, 30) = "..." if length($copy) > 32; + #diag sprintf ">>> %d.%d %s", $line, $col, $copy; + if ($offset != $sum_len) { + diag "offset mismatch $offset vs $sum_len"; + $err++; + } + if ($offset_end != $offset + $length) { + diag "offset_end $offset_end wrong"; + $err++; + } + if ($length != length($text)) { + diag "length mismatch"; + $err++; + } + if (substr($HTML, $offset, $length) ne $text) { + diag "content mismatch"; + $err++; + } + $sum_len += $length; + $count++; + }, + 'offset,length,offset_end,line,column,text'); + +for (split(//, $HTML)) { + $p->parse($_); +} +$p->eof; + +ok($count > 5 && !$err); + + diff --git a/ext/HTML/Parser/t/options.t b/ext/HTML/Parser/t/options.t new file mode 100644 index 0000000..ff5f7db --- /dev/null +++ b/ext/HTML/Parser/t/options.t @@ -0,0 +1,36 @@ +# Test option setting methods + +use Test::More tests => 10; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3, + xml_mode => 1); +my $old; + +$old = $p->boolean_attribute_value("foo"); +ok(!defined $old); + +$old = $p->boolean_attribute_value(); +is($old, "foo"); + +$old = $p->boolean_attribute_value(undef); +is($old, "foo"); +ok(!defined($p->boolean_attribute_value)); + +ok($p->xml_mode(0)); +ok(!$p->xml_mode); + +my $seen_buggy_comment_warning; +$SIG{__WARN__} = + sub { + local $_ = shift; + $seen_buggy_comment_warning++ + if /^netscape_buggy_comment\(\) is deprecated/; + }; + +ok(!$p->strict_comment(1)); +ok($p->strict_comment); +ok(!$p->netscape_buggy_comment); +ok($seen_buggy_comment_warning); diff --git a/ext/HTML/Parser/t/parsefile.t 
b/ext/HTML/Parser/t/parsefile.t new file mode 100644 index 0000000..f373f06 --- /dev/null +++ b/ext/HTML/Parser/t/parsefile.t @@ -0,0 +1,45 @@ +use Test::More tests => 6; + +my $filename = "file$$.htm"; +die "$filename is already there" if -e $filename; +open(FILE, ">$filename") || die "Can't create $filename: $!"; +print FILE <<'EOT'; close(FILE); +Heisan +EOT + +{ + package MyParser; + require HTML::Parser; + @ISA=qw(HTML::Parser); + + sub start + { + my($self, $tag, $attr) = @_; + Test::More::is($tag, "title"); + } +} + +MyParser->new->parse_file($filename); +open(FILE, $filename) || die; +MyParser->new->parse_file(*FILE); +seek(FILE, 0, 0) || die; +MyParser->new->parse_file(\*FILE); +close(FILE); + +require IO::File; +my $io = IO::File->new($filename) || die; +MyParser->new->parse_file($io); +$io->seek(0, 0) || die; +MyParser->new->parse_file(*$io); + +my $text = ''; +$io->seek(0, 0) || die; +MyParser->new( + start_h => [ sub{ shift->eof; }, "self" ], + text_h => [ sub{ $text = shift; }, "text" ])->parse_file(*$io); +ok(!$text); + +close($io); # needed because of bug in perl +undef($io); + +unlink($filename) or warn "Can't unlink $filename: $!"; diff --git a/ext/HTML/Parser/t/parser.t b/ext/HTML/Parser/t/parser.t new file mode 100644 index 0000000..0ce4d95 --- /dev/null +++ b/ext/HTML/Parser/t/parser.t @@ -0,0 +1,184 @@ +use Test::More tests => 7; + +$HTML = <<'HTML'; + + + + + +Various entities. The parser must never break them in the middle: + +/ +/ +È +௖ +￿ +å-Å + + + +

+ + and this is not. + + that Netscape hates --> + +< this > was not a tag. + + + +HTML + +#------------------------------------------------------------------- + +{ + package P; + require HTML::Parser; + @ISA=qw(HTML::Parser); + $OUT=''; + $COUNT=0; + + sub new + { + my $class = shift; + my $self = $class->SUPER::new; + $OUT = ''; + die "Can only have one" if $COUNT++; + $self; + } + + sub DESTROY + { + my $self = shift; + eval { $self->SUPER::DESTROY; }; + $COUNT--; + } + + sub declaration + { + my($self, $decl) = @_; + $OUT .= "[[$decl]]|"; + } + + sub start + { + my($self, $tag, $attr) = @_; + $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr); + $attr = "/$attr" if length $attr; + $OUT .= "<<$tag$attr>>|"; + } + + sub end + { + my($self, $tag) = @_; + $OUT .= ">>$tag<<|"; + } + + sub comment + { + my($self, $comment) = @_; + $OUT .= "##$comment##|"; + } + + sub text + { + my($self, $text) = @_; + #$text =~ s/\n/\\n/g; + #$text =~ s/\t/\\t/g; + #$text =~ s/ /·/g; + $OUT .= "$text|"; + } + + sub result + { + $OUT; + } +} + +for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") { +#for $chunksize (1) { + if ($chunksize =~ /^file/) { + #print "Parsing from $chunksize"; + } else { + #print "Parsing using $chunksize byte chunks"; + } + my $p = P->new; + + if ($chunksize =~ /^file/) { + # First we must create the file + my $tmpfile = "tmp-$$.html"; + my $file = $tmpfile; + die "$file already exists" if -e $file; + open(FILE, ">$file") or die "Can't create $file: $!"; + binmode FILE; + print FILE $HTML; + close(FILE); + + if ($chunksize eq "filehandle") { + require FileHandle; + my $fh = FileHandle->new($file) || die "Can't open $file: $!"; + $file = $fh; + } + + # then we can parse it. 
+ $p->parse_file($file); + close $file if $chunksize eq "filehandle"; + unlink($tmpfile) || warn "Can't unlink $tmpfile: $!"; + } else { + my $copy = $HTML; + while (length $copy) { + my $chunk = substr($copy, 0, $chunksize); + substr($copy, 0, $chunksize) = ''; + $p->parse($chunk); + } + $p->eof; + } + + my $res = $p->result; + my $bad; + + # Then we start looking for things that should not happen + if ($res =~ /\s\|\s/) { + diag "broken space"; + $bad++; + } + for ( + # Make sure entities are not broken + '/', '/', 'È', '௖', '￿', 'å', 'Å', + + # Some elements that should be produced + "|[[DOCTYPE HTML]]|", + "|## this is\na comment ##|", + "|<