ext/B/B/Concise.pm Compiler Concise backend
ext/B/B/Debug.pm Compiler Debug backend
ext/B/B/Deparse.pm Compiler Deparse backend
-ext/B/B/Lint.pm Compiler Lint backend
ext/B/B/Lint/Debug.pm Adds debugging stringification to B::
+ext/B/B/Lint.pm Compiler Lint backend
ext/B/B.pm Compiler backend support functions and methods
ext/B/B/Showlex.pm Compiler Showlex backend
ext/B/B/Terse.pm Compiler Terse backend
ext/Devel/PPPort/t/newCONSTSUB.t Devel::PPPort test file
ext/Devel/PPPort/t/newRV.t Devel::PPPort test file
ext/Devel/PPPort/t/newSVpv.t Devel::PPPort test file
+ext/Devel/PPPort/TODO Devel::PPPort Todo
ext/Devel/PPPort/t/podtest.t Devel::PPPort test file
ext/Devel/PPPort/t/ppphtest.t Devel::PPPort test file
ext/Devel/PPPort/t/pvs.t Devel::PPPort test file
ext/Devel/PPPort/t/uv.t Devel::PPPort test file
ext/Devel/PPPort/t/variables.t Devel::PPPort test file
ext/Devel/PPPort/t/warn.t Devel::PPPort test file
-ext/Devel/PPPort/TODO Devel::PPPort Todo
ext/Devel/PPPort/typemap Devel::PPPort Typemap
ext/Digest/MD5/Changes Digest::MD5 extension changes
ext/Digest/MD5/hints/dec_osf.pl Hints for named architecture
ext/Hash/Util/Makefile.PL Makefile for Hash::Util
ext/Hash/Util/t/Util.t See if Hash::Util works
ext/Hash/Util/Util.xs XS bits of Hash::Util
+ext/HTML/Parser/hints/solaris.pl files for HTML::Parser
+ext/HTML/Parser/hparser.c files for HTML::Parser
+ext/HTML/Parser/hparser.h files for HTML::Parser
+ext/HTML/Parser/lib/HTML/Entities.pm file for HTML::Entities
+ext/HTML/Parser/lib/HTML/Filter.pm file for HTML::Filter
+ext/HTML/Parser/lib/HTML/HeadParser.pm file for HTML::HeadParser
+ext/HTML/Parser/lib/HTML/LinkExtor.pm file for HTML::LinkExtor
+ext/HTML/Parser/lib/HTML/PullParser.pm file for HTML::PullParser
+ext/HTML/Parser/lib/HTML/TokeParser.pm file for HTML::TokeParser
+ext/HTML/Parser/Makefile.PL files for HTML::Parser
+ext/HTML/Parser/mkhctype files for HTML::Parser
+ext/HTML/Parser/mkpfunc files for HTML::Parser
+ext/HTML/Parser/Parser.pm files for HTML::Parser
+ext/HTML/Parser/Parser.xs files for HTML::Parser
+ext/HTML/Parser/t/api_version.t test for HTML::Parser
+ext/HTML/Parser/t/argspec2.t test for HTML::Parser
+ext/HTML/Parser/t/argspec-bad.t test for HTML::Parser
+ext/HTML/Parser/t/argspec.t test for HTML::Parser
+ext/HTML/Parser/t/attr-encoded.t test for HTML::Parser
+ext/HTML/Parser/t/callback.t test for HTML::Parser
+ext/HTML/Parser/t/case-sensitive.t test for HTML::Parser
+ext/HTML/Parser/t/cases.t test for HTML::Parser
+ext/HTML/Parser/t/comment.t test for HTML::Parser
+ext/HTML/Parser/t/crashme.t test for HTML::Parser
+ext/HTML/Parser/t/declaration.t test for HTML::Parser
+ext/HTML/Parser/t/default.t test for HTML::Parser
+ext/HTML/Parser/t/document.t test for HTML::Parser
+ext/HTML/Parser/t/dtext.t test for HTML::Parser
+ext/HTML/Parser/t/entities2.t test for HTML::Parser
+ext/HTML/Parser/t/entities.t test for HTML::Parser
+ext/HTML/Parser/t/filter-methods.t test for HTML::Parser
+ext/HTML/Parser/t/filter.t test for HTML::Parser
+ext/HTML/Parser/t/handler-eof.t test for HTML::Parser
+ext/HTML/Parser/t/handler.t test for HTML::Parser
+ext/HTML/Parser/t/headparser-http.t test for HTML::Parser
+ext/HTML/Parser/t/headparser.t test for HTML::Parser
+ext/HTML/Parser/t/ignore.t test for HTML::Parser
+ext/HTML/Parser/t/largetags.t test for HTML::Parser
+ext/HTML/Parser/t/linkextor-base.t test for HTML::Parser
+ext/HTML/Parser/t/linkextor-rel.t test for HTML::Parser
+ext/HTML/Parser/t/magic.t test for HTML::Parser
+ext/HTML/Parser/t/marked-sect.t test for HTML::Parser
+ext/HTML/Parser/t/msie-compat.t test for HTML::Parser
+ext/HTML/Parser/t/offset.t test for HTML::Parser
+ext/HTML/Parser/tokenpos.h files for HTML::Parser
+ext/HTML/Parser/t/options.t test for HTML::Parser
+ext/HTML/Parser/t/parsefile.t test for HTML::Parser
+ext/HTML/Parser/t/parser.t test for HTML::Parser
+ext/HTML/Parser/t/plaintext.t test for HTML::Parser
+ext/HTML/Parser/t/pod.t test for HTML::Parser
+ext/HTML/Parser/t/process.t test for HTML::Parser
+ext/HTML/Parser/t/pullparser.t test for HTML::Parser
+ext/HTML/Parser/t/script.t test for HTML::Parser
+ext/HTML/Parser/t/skipped-text.t test for HTML::Parser
+ext/HTML/Parser/t/stack-realloc.t test for HTML::Parser
+ext/HTML/Parser/t/textarea.t test for HTML::Parser
+ext/HTML/Parser/t/threads.t test for HTML::Parser
+ext/HTML/Parser/t/tokeparser.t test for HTML::Parser
+ext/HTML/Parser/t/uentities.t test for HTML::Parser
+ext/HTML/Parser/t/unbroken-text.t test for HTML::Parser
+ext/HTML/Parser/t/unicode-bom.t test for HTML::Parser
+ext/HTML/Parser/t/unicode.t test for HTML::Parser
+ext/HTML/Parser/t/xml-mode.t test for HTML::Parser
+ext/HTML/Parser/typemap files for HTML::Parser
+ext/HTML/Parser/util.c files for HTML::Parser
ext/I18N/Langinfo/fallback/const-c.inc I18N::Langinfo
ext/I18N/Langinfo/fallback/const-xs.inc I18N::Langinfo
ext/I18N/Langinfo/Langinfo.pm I18N::Langinfo
ext/IPC/SysV/regen.pl IPC::SysV file regeneration script
ext/IPC/SysV/SysV.xs IPC::SysV extension Perl module
ext/IPC/SysV/t/ipcsysv.t IPC::SysV test file
-ext/IPC/SysV/t/pod.t IPC::SysV test file
-ext/IPC/SysV/t/podcov.t IPC::SysV test file
ext/IPC/SysV/t/msg.t IPC::SysV test file
+ext/IPC/SysV/TODO IPC::SysV todo file
+ext/IPC/SysV/t/podcov.t IPC::SysV test file
+ext/IPC/SysV/t/pod.t IPC::SysV test file
ext/IPC/SysV/t/sem.t IPC::SysV test file
ext/IPC/SysV/t/shm.t IPC::SysV test file
-ext/IPC/SysV/TODO IPC::SysV todo file
ext/IPC/SysV/typemap IPC::SysV typemap
ext/List/Util/Changes Util extension
ext/List/Util/lib/List/Util.pm List::Util
lib/Attribute/Handlers/t/linerep.t See if Attribute::Handlers works
lib/Attribute/Handlers/t/multi.t See if Attribute::Handlers works
lib/attributes.pm For "sub foo : attrlist"
+lib/AutoLoader.pm Autoloader base class
lib/AutoLoader/t/01AutoLoader.t See if AutoLoader works
lib/AutoLoader/t/02AutoSplit.t See if AutoSplit works
-lib/AutoLoader.pm Autoloader base class
lib/AutoSplit.pm Split up autoload functions
lib/autouse.pm Load and call a function only when it's used
lib/autouse.t See if autouse works
lib/CGI/t/start_end_start.t See if CGI.pm works
lib/CGI/t/switch.t See if CGI::Switch still loads
lib/CGI/t/uploadInfo.t See if CGI.pm works
-lib/CGI/t/upload.t See if CGI.pm works
lib/CGI/t/upload_post_text.txt.packed Test data for CGI.pm
+lib/CGI/t/upload.t See if CGI.pm works
lib/CGI/t/util-58.t See if 5.8-dependent features work
lib/CGI/t/util.t See if CGI.pm works
lib/CGI/Util.pm Utility functions
lib/ExtUtils/t/FIRST_MAKEFILE.t See if FIRST_MAKEFILE works
lib/ExtUtils/t/fixin.t See if ExtUtils::MakeMaker works
lib/ExtUtils/t/hints.t See if hint files are honored.
+lib/ExtUtils/t/Installapi2.t See if new api for ExtUtils::Install::install() works
lib/ExtUtils/t/INSTALL_BASE.t Test INSTALL_BASE in MakeMaker
lib/ExtUtils/t/Installed.t See if ExtUtils::Installed works
lib/ExtUtils/t/Install.t See if ExtUtils::Install works
-lib/ExtUtils/t/Installapi2.t See if new api for ExtUtils::Install::install() works
lib/ExtUtils/t/INST_PREFIX.t See if MakeMaker can apply PREFIXs
lib/ExtUtils/t/INST.t Check MakeMaker INST_* macros
lib/ExtUtils/t/Liblist.t See if ExtUtils::Liblist works
lib/h2ph.t See if h2ph works like it should
lib/h2xs.t See if h2xs produces expected lists of files
lib/hostname.pl Old hostname code
+lib/HTML/Tagset.pm HTML::Tagset
+lib/HTML/Tagset/t/00_about_verbose.t HTML::Tagset
+lib/HTML/Tagset/t/01_old_junk.t HTML::Tagset
lib/I18N/Collate.pm Routines to do strxfrm-based collation
lib/I18N/Collate.t See if I18N::Collate works
lib/I18N/LangTags/ChangeLog I18N::LangTags
lib/parent/t/compile-time-file.t tests for parent.pm
lib/parent/t/compile-time.t tests for parent.pm
lib/parent/t/lib/Dummy2.plugin test files for parent.pm
-lib/parent/t/lib/Dummy.pm test files for parent.pm
lib/parent/t/lib/Dummy/Outside.pm test files for parent.pm
+lib/parent/t/lib/Dummy.pm test files for parent.pm
lib/parent/t/lib/FileThatOnlyExistsAsPMC.pmc test files for parent.pm
lib/parent/t/lib/ReturnsFalse.pm test files for parent.pm
lib/parent/t/parent-classfromclassfile.t tests for parent.pm
lib/Pod/Simple/t/tiedfh.t Pod::Simple test file
lib/Pod/Simple/t/verbatim.t Pod::Simple test file
lib/Pod/Simple/t/verb_fmt.t Pod::Simple test file
-lib/Pod/Simple/t/x_nixer.t Pod::Simple test file
lib/Pod/Simple/t/xhtml01.t Pod::Simple test file
lib/Pod/Simple/t/xhtml05.t Pod::Simple test file
+lib/Pod/Simple/t/x_nixer.t Pod::Simple test file
lib/Pod/Simple/XHTML.pm turn Pod into XHTML
lib/Pod/Simple/XMLOutStream.pm turn Pod into XML
lib/Pod/t/basic.cap podlators test
lib/Pod/t/htmlview.pod pod2html render test input data
lib/Pod/t/htmlview.t pod2html render test
lib/Pod/t/InputObjects.t See if Pod::InputObjects works
-lib/Pod/t/man.t podlators test
lib/Pod/t/man-options.t podlators test
+lib/Pod/t/man.t podlators test
lib/Pod/t/parselink.t podlators test
lib/Pod/t/pod2html-lib.pl pod2html testing library
lib/Pod/t/pod2latex.t See if Pod::LaTeX works
lib/Search/Dict.t See if Search::Dict works
lib/SelectSaver.pm Enforce proper select scoping
lib/SelectSaver.t See if SelectSaver works
-lib/SelfLoader/t/02SelfLoader-buggy.t See if SelfLoader works
lib/SelfLoader.pm Load functions only on demand
lib/SelfLoader/t/01SelfLoader.t See if SelfLoader works
+lib/SelfLoader/t/02SelfLoader-buggy.t See if SelfLoader works
lib/Shell.pm Make AUTOLOADed system() calls
lib/Shell.t Tests for above
lib/shellwords.pl Perl library to split into words with shell quoting
lib/tainted.pl Old code for tainting
lib/TAP/Base.pm A parser for Test Anything Protocol
lib/TAP/Formatter/Color.pm A parser for Test Anything Protocol
-lib/TAP/Formatter/Console.pm A parser for Test Anything Protocol
lib/TAP/Formatter/Console/ParallelSession.pm A parser for Test Anything Protocol
+lib/TAP/Formatter/Console.pm A parser for Test Anything Protocol
lib/TAP/Formatter/Console/Session.pm A parser for Test Anything Protocol
lib/TAP/Harness.pm A parser for Test Anything Protocol
-lib/TAP/Parser.pm A parser for Test Anything Protocol
lib/TAP/Parser/Aggregator.pm A parser for Test Anything Protocol
lib/TAP/Parser/Grammar.pm A parser for Test Anything Protocol
-lib/TAP/Parser/Iterator.pm A parser for Test Anything Protocol
lib/TAP/Parser/Iterator/Array.pm A parser for Test Anything Protocol
+lib/TAP/Parser/Iterator.pm A parser for Test Anything Protocol
lib/TAP/Parser/Iterator/Process.pm A parser for Test Anything Protocol
lib/TAP/Parser/Iterator/Stream.pm A parser for Test Anything Protocol
lib/TAP/Parser/Multiplexer.pm A parser for Test Anything Protocol
-lib/TAP/Parser/Result.pm A parser for Test Anything Protocol
+lib/TAP/Parser.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Bailout.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Comment.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Plan.pm A parser for Test Anything Protocol
+lib/TAP/Parser/Result.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Pragma.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Test.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Unknown.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/Version.pm A parser for Test Anything Protocol
lib/TAP/Parser/Result/YAML.pm A parser for Test Anything Protocol
-lib/TAP/Parser/Source.pm A parser for Test Anything Protocol
lib/TAP/Parser/Source/Perl.pm A parser for Test Anything Protocol
+lib/TAP/Parser/Source.pm A parser for Test Anything Protocol
lib/TAP/Parser/Utils.pm A parser for Test Anything Protocol
lib/TAP/Parser/YAMLish/Reader.pm A parser for Test Anything Protocol
lib/TAP/Parser/YAMLish/Writer.pm A parser for Test Anything Protocol
lib/Test/Harness/t/parse.t Test::Harness test
lib/Test/Harness/t/premature-bailout.t Test::Harness test
lib/Test/Harness/t/process.t Test::Harness test
-lib/Test/Harness/t/prove.t Test::Harness test
lib/Test/Harness/t/proverc.t Test::Harness test
lib/Test/Harness/t/proverun.t Test::Harness test
+lib/Test/Harness/t/prove.t Test::Harness test
lib/Test/Harness/t/regression.t Test::Harness test
lib/Test/Harness/t/results.t Test::Harness test
lib/Test/Harness/t/source.t Test::Harness test
lib/Test/Harness/t/unicode.t Test::Harness test
lib/Test/Harness/t/utils.t Test::Harness test
lib/Test/Harness/t/yamlish-output.t Test::Harness test
-lib/Test/Harness/t/yamlish-writer.t Test::Harness test
lib/Test/Harness/t/yamlish.t Test::Harness test
+lib/Test/Harness/t/yamlish-writer.t Test::Harness test
lib/Test/More.pm More utilities for writing tests
lib/Test.pm A simple framework for writing test scripts
lib/Test/Simple/Changes Test::Simple changes
patchlevel.h The current patch level of perl
perlapi.c Perl API functions
perlapi.h Perl API function declarations
-perldtrace.d D script for Perl probes
perl.c main()
+perldtrace.d D script for Perl probes
perl.h Global declarations
perlio.c C code for PerlIO abstraction
perlio.h PerlIO abstraction
t/io/utf8.t See if file seeking works
t/japh/abigail.t Obscure tests
t/lib/1_compile.t See if the various libraries and extensions compile
+t/lib/App/Prove/Plugin/Dummy.pm Module for testing Test::Harness
t/lib/Cname.pm Test charnames in regexes (op/pat.t)
t/lib/common.pl Helper for lib/{warnings,feature}.t
t/lib/commonsense.t See if configuration meets basic needs
t/lib/compress/zlib-generic.pl Compress::Zlib
t/lib/contains_pod.xr Pod-Parser test file
t/lib/cygwin.t Builtin cygwin function tests
-t/lib/App/Prove/Plugin/Dummy.pm Module for testing Test::Harness
+t/lib/data/catme.1 Test data for Test::Harness
+t/lib/data/proverc Test data for Test::Harness
+t/lib/data/sample.yml Test data for Test::Harness
t/lib/Devel/switchd.pm Module for t/run/switchd.t
t/lib/Dev/Null.pm Module for testing Test::Harness
t/lib/dprof/test1_t Perl code profiler tests
t/lib/NoFork.pm Module for testing Test::Harness
t/lib/no_load.t Test that some modules don't load others
t/lib/proxy_constant_subs.t Test that Proxy Constant Subs behave correctly
-t/lib/data/catme.1 Test data for Test::Harness
-t/lib/data/proverc Test data for Test::Harness
-t/lib/data/sample.yml Test data for Test::Harness
t/lib/sample-tests/bailout Test data for Test::Harness
t/lib/sample-tests/bignum Test data for Test::Harness
t/lib/sample-tests/bignum_many Test data for Test::Harness
t/lib/sample-tests/simple_fail Test data for Test::Harness
t/lib/sample-tests/simple_yaml Test data for Test::Harness
t/lib/sample-tests/skip Test data for Test::Harness
-t/lib/sample-tests/skip_nomsg Test data for Test::Harness
t/lib/sample-tests/skipall Test data for Test::Harness
t/lib/sample-tests/skipall_nomsg Test data for Test::Harness
t/lib/sample-tests/skipall_v13 Test data for Test::Harness
+t/lib/sample-tests/skip_nomsg Test data for Test::Harness
t/lib/sample-tests/space_after_plan Test data for Test::Harness
t/lib/sample-tests/stdout_stderr Test data for Test::Harness
t/lib/sample-tests/strict Test data for Test::Harness
t/lib/strict/vars Tests of "use strict 'vars'" for strict.t
t/lib/Test/Simple/Catch.pm Utility module for testing Test::Simple
t/lib/Test/Simple/sample_tests/death_in_eval.plx for exit.t
-t/lib/Test/Simple/sample_tests/death_with_handler.plx for exit.t
t/lib/Test/Simple/sample_tests/death.plx for exit.t
+t/lib/Test/Simple/sample_tests/death_with_handler.plx for exit.t
t/lib/Test/Simple/sample_tests/exit.plx for exit.t
t/lib/Test/Simple/sample_tests/extras.plx for exit.t
t/lib/Test/Simple/sample_tests/five_fail.plx for exit.t
t/Module_Pluggable/21editor_junk.t Module::Pluggable tests
t/Module_Pluggable/acme/Acme/MyTest/Plugin/Foo.pm Module::Pluggable tests
t/Module_Pluggable/lib/Acme/MyTest/Plugin/Foo.pm Module::Pluggable tests
-t/Module_Pluggable/lib/EditorJunk/Plugin/Foo.pm Module::Pluggable tests
-t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm~ Module::Pluggable tests
-t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm.swp Module::Pluggable tests
t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm Module::Pluggable tests
+t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm~ Module::Pluggable tests
t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm.swo Module::Pluggable tests
+t/Module_Pluggable/lib/EditorJunk/Plugin/Bar.pm.swp Module::Pluggable tests
+t/Module_Pluggable/lib/EditorJunk/Plugin/Foo.pm Module::Pluggable tests
t/Module_Pluggable/lib/ExtTest/Plugin/Bar.plugin Module::Pluggable tests
t/Module_Pluggable/lib/ExtTest/Plugin/Foo.plugin Module::Pluggable tests
t/Module_Pluggable/lib/ExtTest/Plugin/Quux/Foo.plugin Module::Pluggable tests
t/op/die.t See if die works
t/op/dor.t See if defined-or (//) works
t/op/do.t See if subroutines work
-t/op/each.t See if hash iterators work
t/op/each_array.t See if array iterators work
+t/op/each.t See if hash iterators work
t/op/eval.t See if eval operator works
t/op/exec.t See if exec, system and qx work
t/op/exists_sub.t See if exists(&sub) works
t/uni/lower.t See if Unicode casing works
t/uni/overload.t See if Unicode overloading works
t/uni/sprintf.t See if Unicode sprintf works
+t/uni/tie.t See if Unicode tie works
t/uni/title.t See if Unicode casing works
t/uni/tr_7jis.t See if Unicode tr/// in 7jis works
t/uni/tr_eucjp.t See if Unicode tr/// in eucjp works
t/uni/write.t See if Unicode formats work
t/win32/system.t See if system works in Win*
t/win32/system_tests Test runner for system.t
-t/uni/tie.t See if Unicode tie works
t/x2p/s2p.t See if s2p/psed work
uconfig.h Configuration header for microperl
uconfig.sh Configuration script for microperl
'CPAN' => 1,
},
+ 'HTML::Parser' =>
+ {
+ 'MAINTAINER' => 'gaas',
+ 'FILES' => q[ext/HTML/Parser],
+ 'CPAN' => 1,
+ },
+
+ 'HTML::Tagset' =>
+ {
+ 'MAINTAINER' => 'petdance',
+ 'FILES' => q[lib/HTML/Tagset.pm lib/HTML/Tagset],
+ 'CPAN' => 1,
+ },
+
'I18N::LangTags' =>
{
'MAINTAINER' => 'sburke',
--- /dev/null
+require 5.006;
+use strict;
+use ExtUtils::MakeMaker;
+
+WriteMakefile(
+ NAME => 'HTML::Parser',
+ VERSION_FROM => 'Parser.pm',
+ H => [ "hparser.h", "hctype.h", "tokenpos.h", "pfunc.h",
+ "hparser.c", "util.c",
+ ],
+ PREREQ_PM => {
+ 'HTML::Tagset' => 3,
+ 'Test::More' => 0, # only needed to run 'make test'
+ },
+ DEFINE => "-DMARKED_SECTION",
+ dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', },
+ clean => { FILES => 'hctype.h pfunc.h' },
+);
+
+
+sub MY::postamble
+{
+ '
+pfunc.h : mkpfunc
+ $(PERL) mkpfunc >pfunc.h
+
+hctype.h : mkhctype
+ $(PERL) mkhctype >hctype.h
+'
+}
--- /dev/null
+package HTML::Parser;
+
+# Copyright 1996-2007, Gisle Aas.
+# Copyright 1999-2000, Michael A. Chase.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+
+use strict;
+use vars qw($VERSION @ISA);
+
+$VERSION = '3.56'; # $Date: 2007/01/12 09:18:31 $
+
+require HTML::Entities;
+
+require XSLoader;
+XSLoader::load('HTML::Parser', $VERSION);
+
+sub new
+{
+ my $class = shift;
+ my $self = bless {}, $class;
+ return $self->init(@_);
+}
+
+
+sub init
+{
+ my $self = shift;
+ $self->_alloc_pstate;
+
+ my %arg = @_;
+ my $api_version = delete $arg{api_version} || (@_ ? 3 : 2);
+ if ($api_version >= 4) {
+ require Carp;
+ Carp::croak("API version $api_version not supported " .
+ "by HTML::Parser $VERSION");
+ }
+
+ if ($api_version < 3) {
+ # Set up method callbacks compatible with HTML-Parser-2.xx
+ $self->handler(text => "text", "self,text,is_cdata");
+ $self->handler(end => "end", "self,tagname,text");
+ $self->handler(process => "process", "self,token0,text");
+ $self->handler(start => "start",
+ "self,tagname,attr,attrseq,text");
+
+ $self->handler(comment =>
+ sub {
+ my($self, $tokens) = @_;
+ for (@$tokens) {
+ $self->comment($_);
+ }
+ }, "self,tokens");
+
+ $self->handler(declaration =>
+ sub {
+ my $self = shift;
+ $self->declaration(substr($_[0], 2, -1));
+ }, "self,text");
+ }
+
+ if (my $h = delete $arg{handlers}) {
+ $h = {@$h} if ref($h) eq "ARRAY";
+ while (my($event, $cb) = each %$h) {
+ $self->handler($event => @$cb);
+ }
+ }
+
+ # In the end we try to assume plain attribute or handler
+ while (my($option, $val) = each %arg) {
+ if ($option =~ /^(\w+)_h$/) {
+ $self->handler($1 => @$val);
+ }
+ elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) {
+ require Carp;
+ Carp::croak("Bad constructor option '$option'");
+ }
+ else {
+ $self->$option($val);
+ }
+ }
+
+ return $self;
+}
+
+
+sub parse_file
+{
+ my($self, $file) = @_;
+ my $opened;
+ if (!ref($file) && ref(\$file) ne "GLOB") {
+ # Assume $file is a filename
+ local(*F);
+ open(F, $file) || return undef;
+ binmode(F); # should we? good for byte counts
+ $opened++;
+ $file = *F;
+ }
+ my $chunk = '';
+ while (read($file, $chunk, 512)) {
+ $self->parse($chunk) || last;
+ }
+ close($file) if $opened;
+ $self->eof;
+}
+
+
+sub netscape_buggy_comment # legacy
+{
+ my $self = shift;
+ require Carp;
+ Carp::carp("netscape_buggy_comment() is deprecated. " .
+ "Please use the strict_comment() method instead");
+ my $old = !$self->strict_comment;
+ $self->strict_comment(!shift) if @_;
+ return $old;
+}
+
+# set up method stubs
+sub text { }
+*start = \&text;
+*end = \&text;
+*comment = \&text;
+*declaration = \&text;
+*process = \&text;
+
+1;
+
+__END__
+
+
+=head1 NAME
+
+HTML::Parser - HTML parser class
+
+=head1 SYNOPSIS
+
+ use HTML::Parser ();
+
+ # Create parser object
+ $p = HTML::Parser->new( api_version => 3,
+ start_h => [\&start, "tagname, attr"],
+ end_h => [\&end, "tagname"],
+ marked_sections => 1,
+ );
+
+ # Parse document text chunk by chunk
+ $p->parse($chunk1);
+ $p->parse($chunk2);
+ #...
+ $p->eof; # signal end of document
+
+ # Parse directly from file
+ $p->parse_file("foo.html");
+ # or
+ open(my $fh, "<:utf8", "foo.html") || die;
+ $p->parse_file($fh);
+
+=head1 DESCRIPTION
+
+Objects of the C<HTML::Parser> class will recognize markup and
+separate it from plain text (alias data content) in HTML
+documents. As different kinds of markup and text are recognized, the
+corresponding event handlers are invoked.
+
+C<HTML::Parser> is not a generic SGML parser. We have tried to
+make it able to deal with the HTML that is actually "out there", and
+it normally parses as closely as possible to the way the popular web
+browsers do it instead of strictly following one of the many HTML
+specifications from W3C. Where there is disagreement, there is often
+an option that you can enable to get the official behaviour.
+
+The document to be parsed may be supplied in arbitrary chunks. This
+makes on-the-fly parsing as documents are received from the network
+possible.
+
+If event driven parsing does not feel right for your application, you
+might want to use C<HTML::PullParser>. This is an C<HTML::Parser>
+subclass that allows a more conventional program structure.
+
+
+=head1 METHODS
+
+The following method is used to construct a new C<HTML::Parser> object:
+
+=over
+
+=item $p = HTML::Parser->new( %options_and_handlers )
+
+This class method creates a new C<HTML::Parser> object and
+returns it. Key/value argument pairs may be provided to assign event
+handlers or initialize parser options. The handlers and parser
+options can also be set or modified later by the method calls described below.
+
+If a top level key is in the form "<event>_h" (e.g., "text_h") then it
+assigns a handler to that event, otherwise it initializes a parser
+option. The event handler specification value must be an array
+reference. Multiple handlers may also be assigned with the 'handlers
+=> [%handlers]' option. See examples below.
+
+If new() is called without any arguments, it will create a parser that
+uses callback methods compatible with version 2 of C<HTML::Parser>.
+See the section on "version 2 compatibility" below for details.
+
+The special constructor option 'api_version => 2' can be used to
+initialize version 2 callbacks while still setting other options and
+handlers. The 'api_version => 3' option can be used if you don't want
+to set any options and don't want to fall back to v2 compatible
+mode.
+
+Examples:
+
+ $p = HTML::Parser->new(api_version => 3,
+ text_h => [ sub {...}, "dtext" ]);
+
+This creates a new parser object with a text event handler subroutine
+that receives the original text with general entities decoded.
+
+ $p = HTML::Parser->new(api_version => 3,
+ start_h => [ 'my_start', "self,tokens" ]);
+
+This creates a new parser object with a start event handler method
+that receives the $p and the tokens array.
+
+ $p = HTML::Parser->new(api_version => 3,
+ handlers => { text => [\@array, "event,text"],
+ comment => [\@array, "event,text"],
+ });
+
+This creates a new parser object that stores the event type and the
+original text in @array for text and comment events.
+
+=back
+
+The following methods feed the HTML document
+to the C<HTML::Parser> object:
+
+=over
+
+=item $p->parse( $string )
+
+Parse $string as the next chunk of the HTML document. The return
+value is normally a reference to the parser object (i.e. $p).
+Handlers invoked should not attempt to modify the $string in-place until
+$p->parse returns.
+
+If an invoked event handler aborts parsing by calling $p->eof, then
+$p->parse() will return a FALSE value.
+
+=item $p->parse( $code_ref )
+
+If a code reference is passed as the argument to be parsed, then the
+chunks to be parsed are obtained by invoking this function repeatedly.
+Parsing continues until the function returns an empty (or undefined)
+result. When this happens $p->eof is automatically signaled.
+
+Parsing will also abort if one of the event handlers calls $p->eof.
+
+The effect of this is the same as:
+
+ while (1) {
+ my $chunk = &$code_ref();
+ if (!defined($chunk) || !length($chunk)) {
+ $p->eof;
+ return $p;
+ }
+ $p->parse($chunk) || return undef;
+ }
+
+But it is more efficient as this loop runs internally in XS code.
+
+=item $p->parse_file( $file )
+
+Parse text directly from a file. The $file argument can be a
+filename, an open file handle, or a reference to an open file
+handle.
+
+If $file contains a filename and the file can't be opened, then the
+method returns an undefined value and $! tells why it failed.
+Otherwise the return value is a reference to the parser object.
+
+If a file handle is passed as the $file argument, then the file will
+normally be read until EOF, but not closed.
+
+If an invoked event handler aborts parsing by calling $p->eof,
+then $p->parse_file() may not have read the entire file.
+
+On systems with multi-byte line terminators, the values passed for the
+offset and length argspecs may be too low if parse_file() is called on
+a file handle that is not in binary mode.
+
+If a filename is passed in, then parse_file() will open the file in
+binary mode.
+
+=item $p->eof
+
+Signals the end of the HTML document. Calling the $p->eof method
+outside a handler callback will flush any remaining buffered text
+(which triggers the C<text> event if there is any remaining text).
+
+Calling $p->eof inside a handler will terminate parsing at that point
+and cause $p->parse to return a FALSE value. This also terminates
+parsing by $p->parse_file().
+
+After $p->eof has been called, the parse() and parse_file() methods
+can be invoked to feed new documents with the parser object.
+
+The return value from eof() is a reference to the parser object.
+
+=back
+
+
+Most parser options are controlled by boolean attributes.
+Each boolean attribute is enabled by calling the corresponding method
+with a TRUE argument and disabled with a FALSE argument. The
+attribute value is left unchanged if no argument is given. The return
+value from each method is the old attribute value.
+
+Methods that can be used to get and/or set parser options are:
+
+=over
+
+=item $p->attr_encoded
+
+=item $p->attr_encoded( $bool )
+
+By default, the C<attr> and C<@attr> argspecs will have general
+entities for attribute values decoded. Enabling this attribute leaves
+entities alone.
+
+=item $p->boolean_attribute_value( $val )
+
+This method sets the value reported for boolean attributes inside HTML
+start tags. By default, the name of the attribute is also used as its
+value. This affects the values reported for C<tokens> and C<attr>
+argspecs.
+
+=item $p->case_sensitive
+
+=item $p->case_sensitive( $bool )
+
+By default, tagnames and attribute names are down-cased. Enabling this
+attribute leaves them as found in the HTML source document.
+
+=item $p->closing_plaintext
+
+=item $p->closing_plaintext( $bool )
+
+By default, "plaintext" element can never be closed. Everything up to
+the end of the document is parsed in CDATA mode. This historical
+behaviour is what at least MSIE does. Enabling this attribute makes
+closing "</plaintext>" tag effective and the parsing process will resume
+after seeing this tag. This emulates gecko-based browsers.
+
+=item $p->empty_element_tags
+
+=item $p->empty_element_tags( $bool )
+
+By default, empty element tags are not recognized as such and the "/"
+before ">" is just treated like a normal name character (unless
+C<strict_names> is enabled). Enabling this attribute make
+C<HTML::Parser> recognize these tags.
+
+Empty element tags look like start tags, but end with the character
+sequence "/>" instead of ">". When recognized by C<HTML::Parser> they
+cause an artificial end event in addition to the start event. The
+C<text> for the artificial end event will be empty and the C<tokenpos>
+array will be undefined even though the the token array will have one
+element containing the tag name.
+
+=item $p->marked_sections
+
+=item $p->marked_sections( $bool )
+
+By default, section markings like <![CDATA[...]]> are treated like
+ordinary text. When this attribute is enabled section markings are
+honoured.
+
+There are currently no events associated with the marked section
+markup, but the text can be returned as C<skipped_text>.
+
+=item $p->strict_comment
+
+=item $p->strict_comment( $bool )
+
+By default, comments are terminated by the first occurrence of "-->".
+This is the behaviour of most popular browsers (like Mozilla, Opera and
+MSIE), but it is not correct according to the official HTML
+standard. Officially, you need an even number of "--" tokens before
+the closing ">" is recognized and there may not be anything but
+whitespace between an even and an odd "--".
+
+The official behaviour is enabled by enabling this attribute.
+
+Enabling of 'strict_comment' also disables recognizing these forms as
+comments:
+
+ </ comment>
+ <! comment>
+
+
+=item $p->strict_end
+
+=item $p->strict_end( $bool )
+
+By default, attributes and other junk are allowed to be present on end tags in a
+manner that emulates MSIE's behaviour.
+
+The official behaviour is enabled with this attribute. If enabled,
+only whitespace is allowed between the tagname and the final ">".
+
+=item $p->strict_names
+
+=item $p->strict_names( $bool )
+
+By default, almost anything is allowed in tag and attribute names.
+This is the behaviour of most popular browsers and allows us to parse
+some broken tags with invalid attribute values like:
+
+ <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0>
+
+By default, "LIST]" is parsed as a boolean attribute, not as
+part of the ALT value as was clearly intended. This is also what
+Mozilla sees.
+
+The official behaviour is enabled by enabling this attribute. If
+enabled, it will cause the tag above to be reported as text
+since "LIST]" is not a legal attribute name.
+
+=item $p->unbroken_text
+
+=item $p->unbroken_text( $bool )
+
+By default, blocks of text are given to the text handler as soon as
+possible (but the parser takes care always to break text at a
+boundary between whitespace and non-whitespace so single words and
+entities can always be decoded safely). This might create breaks that
+make it hard to do transformations on the text. When this attribute is
+enabled, blocks of text are always reported in one piece. This will
+delay the text event until the following (non-text) event has been
+recognized by the parser.
+
+Note that the C<offset> argspec will give you the offset of the first
+segment of text and C<length> is the combined length of the segments.
+Since there might be ignored tags in between, these numbers can't be
+used to directly index in the original document file.
+
+=item $p->utf8_mode
+
+=item $p->utf8_mode( $bool )
+
+Enable this option when parsing raw undecoded UTF-8. This tells the
+parser that the entities expanded for strings reported by C<attr>,
+C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end
+up compatible with the surrounding text.
+
+If C<utf8_mode> is enabled then it is an error to pass strings
+containing characters with code above 255 to the parse() method, and
+the parse() method will croak if you try.
+
+Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8
+encoded. The character can also be represented by the entity
+"♥" or "♥". If we feed the parser:
+
+ $p->parse("\xE2\x99\xA5♥");
+
+then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without
+C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled.
+The later string is what you want.
+
+This option is only available with perl-5.8 or better.
+
+=item $p->xml_mode
+
+=item $p->xml_mode( $bool )
+
+Enabling this attribute changes the parser to allow some XML
+constructs. This enables the behaviour controlled by individually by
+the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and
+C<xml_pic> attributes and also suppresses special treatment of
+elements that are parsed as CDATA for HTML.
+
+=item $p->xml_pic
+
+=item $p->xml_pic( $bool )
+
+By default, I<processing instructions> are terminated by ">". When
+this attribute is enabled, processing instructions are terminated by
+"?>" instead.
+
+=back
+
+As markup and text is recognized, handlers are invoked. The following
+method is used to set up handlers for different events:
+
+=over
+
+=item $p->handler( event => \&subroutine, $argspec )
+
+=item $p->handler( event => $method_name, $argspec )
+
+=item $p->handler( event => \@accum, $argspec )
+
+=item $p->handler( event => "" );
+
+=item $p->handler( event => undef );
+
+=item $p->handler( event );
+
+This method assigns a subroutine, method, or array to handle an event.
+
+Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>,
+C<process>, C<start_document>, C<end_document> or C<default>.
+
+The C<\&subroutine> is a reference to a subroutine which is called to handle
+the event.
+
+The C<$method_name> is the name of a method of $p which is called to handle
+the event.
+
+The C<@accum> is an array that will hold the event information as
+sub-arrays.
+
+If the second argument is "", the event is ignored.
+If it is undef, the default handler is invoked for the event.
+
+The C<$argspec> is a string that describes the information to be reported
+for the event. Any requested information that does not apply to a
+specific event is passed as C<undef>. If argspec is omitted, then it
+is left unchanged.
+
+The return value from $p->handler is the old callback routine or a
+reference to the accumulator array.
+
+Any return values from handler callback routines/methods are always
+ignored. A handler callback can request parsing to be aborted by
+invoking the $p->eof method. A handler callback is not allowed to
+invoke the $p->parse() or $p->parse_file() method. An exception will
+be raised if it tries.
+
+Examples:
+
+ $p->handler(start => "start", 'self, attr, attrseq, text' );
+
+This causes the "start" method of object $p to be called for 'start' events.
+The callback signature is $p->start(\%attr, \@attr_seq, $text).
+
+ $p->handler(start => \&start, 'attr, attrseq, text' );
+
+This causes subroutine start() to be called for 'start' events.
+The callback signature is start(\%attr, \@attr_seq, $text).
+
+ $p->handler(start => \@accum, '"S", attr, attrseq, text' );
+
+This causes 'start' event information to be saved in @accum.
+The array elements will be ['S', \%attr, \@attr_seq, $text].
+
+ $p->handler(start => "");
+
+This causes 'start' events to be ignored. It also suppresses
+invocations of any default handler for start events. It is in most
+cases equivalent to $p->handler(start => sub {}), but is more
+efficient. It is different from the empty-sub-handler in that
+C<skipped_text> is not reset by it.
+
+ $p->handler(start => undef);
+
+This causes no handler to be associated with start events.
+If there is a default handler it will be invoked.
+
+=back
+
+Filters based on tags can be set up to limit the number of events
+reported. The main bottleneck during parsing is often the huge number
+of callbacks made from the parser. Applying filters can improve
+performance significantly.
+
+The following methods control filters:
+
+=over
+
+=item $p->ignore_elements( @tags )
+
+Both the C<start> event and the C<end> event as well as any events that
+would be reported in between are suppressed. The ignored elements can
+contain nested occurrences of itself. Example:
+
+ $p->ignore_elements(qw(script style));
+
+The C<script> and C<style> tags will always nest properly since their
+content is parsed in CDATA mode. For most other tags
+C<ignore_elements> must be used with caution since HTML is often not
+I<well formed>.
+
+=item $p->ignore_tags( @tags )
+
+Any C<start> and C<end> events involving any of the tags given are
+suppressed. To reset the filter (i.e. don't suppress any C<start> and
+C<end> events), call C<ignore_tags> without an argument.
+
+=item $p->report_tags( @tags )
+
+Any C<start> and C<end> events involving any of the tags I<not> given
+are suppressed. To reset the filter (i.e. report all C<start> and
+C<end> events), call C<report_tags> without an argument.
+
+=back
+
+Internally, the system has two filter lists, one for C<report_tags>
+and one for C<ignore_tags>, and both filters are applied. This
+effectively gives C<ignore_tags> precedence over C<report_tags>.
+
+Examples:
+
+ $p->ignore_tags(qw(style));
+ $p->report_tags(qw(script style));
+
+results in only C<script> events being reported.
+
+=head2 Argspec
+
+Argspec is a string containing a comma-separated list that describes
+the information reported by the event. The following argspec
+identifier names can be used:
+
+=over
+
+=item C<attr>
+
+Attr causes a reference to a hash of attribute name/value pairs to be
+passed.
+
+Boolean attributes' values are either the value set by
+$p->boolean_attribute_value, or the attribute name if no value has been
+set by $p->boolean_attribute_value.
+
+This passes undef except for C<start> events.
+
+Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute
+names are forced to lower case.
+
+General entities are decoded in the attribute values and
+one layer of matching quotes enclosing the attribute values is removed.
+
+The Unicode character set is assumed for entity decoding. With Perl
+version 5.6 or earlier only the Latin-1 range is supported, and
+entities for characters outside the range 0..255 are left unchanged.
+
+=item C<@attr>
+
+Basically the same as C<attr>, but keys and values are passed as
+individual arguments and the original sequence of the attributes is
+kept. The parameters passed will be the same as the @attr calculated
+here:
+
+ @attr = map { $_ => $attr->{$_} } @$attrseq;
+
+assuming $attr and $attrseq here are the hash and array passed as the
+result of C<attr> and C<attrseq> argspecs.
+
+This passes no values for events besides C<start>.
+
+=item C<attrseq>
+
+Attrseq causes a reference to an array of attribute names to be
+passed. This can be useful if you want to walk the C<attr> hash in
+the original sequence.
+
+This passes undef except for C<start> events.
+
+Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute
+names are forced to lower case.
+
+=item C<column>
+
+Column causes the column number of the start of the event to be passed.
+The first column on a line is 0.
+
+=item C<dtext>
+
+Dtext causes the decoded text to be passed. General entities are
+automatically decoded unless the event was inside a CDATA section or
+was between literal start and end tags (C<script>, C<style>,
+C<xmp>, and C<plaintext>).
+
+The Unicode character set is assumed for entity decoding. With Perl
+version 5.6 or earlier only the Latin-1 range is supported, and
+entities for characters outside the range 0..255 are left unchanged.
+
+This passes undef except for C<text> events.
+
+=item C<event>
+
+Event causes the event name to be passed.
+
+The event name is one of C<text>, C<start>, C<end>, C<declaration>,
+C<comment>, C<process>, C<start_document> or C<end_document>.
+
+=item C<is_cdata>
+
+Is_cdata causes a TRUE value to be passed if the event is inside a CDATA
+section or between literal start and end tags (C<script>,
+C<style>, C<xmp>, and C<plaintext>).
+
+if the flag is FALSE for a text event, then you should normally
+either use C<dtext> or decode the entities yourself before the text is
+processed further.
+
+=item C<length>
+
+Length causes the number of bytes of the source text of the event to
+be passed.
+
+=item C<line>
+
+Line causes the line number of the start of the event to be passed.
+The first line in the document is 1. Line counting doesn't start
+until at least one handler requests this value to be reported.
+
+=item C<offset>
+
+Offset causes the byte position in the HTML document of the start of
+the event to be passed. The first byte in the document has offset 0.
+
+=item C<offset_end>
+
+Offset_end causes the byte position in the HTML document of the end of
+the event to be passed. This is the same as C<offset> + C<length>.
+
+=item C<self>
+
+Self causes the current object to be passed to the handler. If the
+handler is a method, this must be the first element in the argspec.
+
+An alternative to passing self as an argspec is to register closures
+that capture $self by themselves as handlers. Unfortunately this
+creates circular references which prevent the HTML::Parser object
+from being garbage collected. Using the C<self> argspec avoids this
+problem.
+
+=item C<skipped_text>
+
+Skipped_text returns the concatenated text of all the events that have
+been skipped since the last time an event was reported. Events might
+be skipped because no handler is registered for them or because some
+filter applies. Skipped text also includes marked section markup,
+since there are no events that can catch it.
+
+If an C<"">-handler is registered for an event, then the text for this
+event is not included in C<skipped_text>. Skipped text both before
+and after the C<"">-event is included in the next reported
+C<skipped_text>.
+
+=item C<tag>
+
+Same as C<tagname>, but prefixed with "/" if it belongs to an C<end>
+event and "!" for a declaration. The C<tag> does not have any prefix
+for C<start> events, and is in this case identical to C<tagname>.
+
+=item C<tagname>
+
+This is the element name (or I<generic identifier> in SGML jargon) for
+start and end tags. Since HTML is case insensitive, this name is
+forced to lower case to ease string matching.
+
+Since XML is case sensitive, the tagname case is not changed when
+C<xml_mode> is enabled. The same happens if the C<case_sensitive> attribute
+is set.
+
+The declaration type of declaration elements is also passed as a tagname,
+even if that is a bit strange.
+In fact, in the current implementation tagname is
+identical to C<token0> except that the name may be forced to lower case.
+
+=item C<token0>
+
+Token0 causes the original text of the first token string to be
+passed. This should always be the same as $tokens->[0].
+
+For C<declaration> events, this is the declaration type.
+
+For C<start> and C<end> events, this is the tag name.
+
+For C<process> and non-strict C<comment> events, this is everything
+inside the tag.
+
+This passes undef if there are no tokens in the event.
+
+=item C<tokenpos>
+
+Tokenpos causes a reference to an array of token positions to be
+passed. For each string that appears in C<tokens>, this array
+contains two numbers. The first number is the offset of the start of
+the token in the original C<text> and the second number is the length
+of the token.
+
+Boolean attributes in a C<start> event will have (0,0) for the
+attribute value offset and length.
+
+This passes undef if there are no tokens in the event (e.g., C<text>)
+and for artificial C<end> events triggered by empty element tags.
+
+If you are using these offsets and lengths to modify C<text>, you
+should either work from right to left, or be very careful to calculate
+the changes to the offsets.
+
+=item C<tokens>
+
+Tokens causes a reference to an array of token strings to be passed.
+The strings are exactly as they were found in the original text,
+no decoding or case changes are applied.
+
+For C<declaration> events, the array contains each word, comment, and
+delimited string starting with the declaration type.
+
+For C<comment> events, this contains each sub-comment. If
+$p->strict_comments is disabled, there will be only one sub-comment.
+
+For C<start> events, this contains the original tag name followed by
+the attribute name/value pairs. The values of boolean attributes will
+be either the value set by $p->boolean_attribute_value, or the
+attribute name if no value has been set by
+$p->boolean_attribute_value.
+
+For C<end> events, this contains the original tag name (always one token).
+
+For C<process> events, this contains the process instructions (always one
+token).
+
+This passes C<undef> for C<text> events.
+
+=item C<text>
+
+Text causes the source text (including markup element delimiters) to be
+passed.
+
+=item C<undef>
+
+Pass an undefined value. Useful as padding where the same handler
+routine is registered for multiple events.
+
+=item C<'...'>
+
+A literal string of 0 to 255 characters enclosed
+in single (') or double (") quotes is passed as entered.
+
+=back
+
+The whole argspec string can be wrapped up in C<'@{...}'> to signal
+that the resulting event array should be flattened. This only makes a
+difference if an array reference is used as the handler target.
+Consider this example:
+
+ $p->handler(text => [], 'text');
+ $p->handler(text => [], '@{text}']);
+
+With two text events; C<"foo">, C<"bar">; then the first example will end
+up with [["foo"], ["bar"]] and the second with ["foo", "bar"] in
+the handler target array.
+
+
+=head2 Events
+
+Handlers for the following events can be registered:
+
+=over
+
+=item C<comment>
+
+This event is triggered when a markup comment is recognized.
+
+Example:
+
+ <!-- This is a comment -- -- So is this -->
+
+=item C<declaration>
+
+This event is triggered when a I<markup declaration> is recognized.
+
+For typical HTML documents, the only declaration you are
+likely to find is <!DOCTYPE ...>.
+
+Example:
+
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html40/strict.dtd">
+
+DTDs inside <!DOCTYPE ...> will confuse HTML::Parser.
+
+=item C<default>
+
+This event is triggered for events that do not have a specific
+handler. You can set up a handler for this event to catch stuff you
+did not want to catch explicitly.
+
+=item C<end>
+
+This event is triggered when an end tag is recognized.
+
+Example:
+
+ </A>
+
+=item C<end_document>
+
+This event is triggered when $p->eof is called and after any remaining
+text is flushed. There is no document text associated with this event.
+
+=item C<process>
+
+This event is triggered when a processing instructions markup is
+recognized.
+
+The format and content of processing instructions are system and
+application dependent.
+
+Examples:
+
+ <? HTML processing instructions >
+ <? XML processing instructions ?>
+
+=item C<start>
+
+This event is triggered when a start tag is recognized.
+
+Example:
+
+ <A HREF="http://www.perl.com/">
+
+=item C<start_document>
+
+This event is triggered before any other events for a new document. A
+handler for it can be used to initialize stuff. There is no document
+text associated with this event.
+
+=item C<text>
+
+This event is triggered when plain text (characters) is recognized.
+The text may contain multiple lines. A sequence of text may be broken
+between several text events unless $p->unbroken_text is enabled.
+
+The parser will make sure that it does not break a word or a sequence
+of whitespace between two text events.
+
+=back
+
+=head2 Unicode
+
+The C<HTML::Parser> can parse Unicode strings when running under
+perl-5.8 or better. If Unicode is passed to $p->parse() then chunks
+of Unicode will be reported to the handlers. The offset and length
+argspecs will also report their position in terms of characters.
+
+It is safe to parse raw undecoded UTF-8 if you either avoid decoding
+entities and make sure to not use I<argspecs> that do, or enable the
+C<utf8_mode> for the parser. Parsing of undecoded UTF-8 might be
+useful when parsing from a file where you need the reported offsets
+and lengths to match the byte offsets in the file.
+
+If a filename is passed to $p->parse_file() then the file will be read
+in binary mode. This will be fine if the file contains only ASCII or
+Latin-1 characters. If the file contains UTF-8 encoded text then care
+must be taken when decoding entities as described in the previous
+paragraph, but better is to open the file with the UTF-8 layer so that
+it is decoded properly:
+
+ open(my $fh, "<:utf8", "index.html") || die "...: $!";
+ $p->parse_file($fh);
+
+If the file contains text encoded in a charset besides ASCII, Latin-1
+or UTF-8 then decoding will always be needed.
+
+=head1 VERSION 2 COMPATIBILITY
+
+When an C<HTML::Parser> object is constructed with no arguments, a set
+of handlers is automatically provided that is compatible with the old
+HTML::Parser version 2 callback methods.
+
+This is equivalent to the following method calls:
+
+ $p->handler(start => "start", "self, tagname, attr, attrseq, text");
+ $p->handler(end => "end", "self, tagname, text");
+ $p->handler(text => "text", "self, text, is_cdata");
+ $p->handler(process => "process", "self, token0, text");
+ $p->handler(comment =>
+ sub {
+ my($self, $tokens) = @_;
+ for (@$tokens) {$self->comment($_);}},
+ "self, tokens");
+ $p->handler(declaration =>
+ sub {
+ my $self = shift;
+ $self->declaration(substr($_[0], 2, -1));},
+ "self, text");
+
+Setting up these handlers can also be requested with the "api_version =>
+2" constructor option.
+
+=head1 SUBCLASSING
+
+The C<HTML::Parser> class is subclassable. Parser objects are plain
+hashes and C<HTML::Parser> reserves only hash keys that start with
+"_hparser". The parser state can be set up by invoking the init()
+method, which takes the same arguments as new().
+
+=head1 EXAMPLES
+
+The first simple example shows how you might strip out comments from
+an HTML document. We achieve this by setting up a comment handler that
+does nothing and a default handler that will print out anything else:
+
+ use HTML::Parser;
+ HTML::Parser->new(default_h => [sub { print shift }, 'text'],
+ comment_h => [""],
+ )->parse_file(shift || die) || die $!;
+
+An alternative implementation is:
+
+ use HTML::Parser;
+ HTML::Parser->new(end_document_h => [sub { print shift },
+ 'skipped_text'],
+ comment_h => [""],
+ )->parse_file(shift || die) || die $!;
+
+This will in most cases be much more efficient since only a single
+callback will be made.
+
+The next example prints out the text that is inside the <title>
+element of an HTML document. Here we start by setting up a start
+handler. When it sees the title start tag it enables a text handler
+that prints any text found and an end handler that will terminate
+parsing as soon as the title end tag is seen:
+
+ use HTML::Parser ();
+
+ sub start_handler
+ {
+ return if shift ne "title";
+ my $self = shift;
+ $self->handler(text => sub { print shift }, "dtext");
+ $self->handler(end => sub { shift->eof if shift eq "title"; },
+ "tagname,self");
+ }
+
+ my $p = HTML::Parser->new(api_version => 3);
+ $p->handler( start => \&start_handler, "tagname,self");
+ $p->parse_file(shift || die) || die $!;
+ print "\n";
+
+More examples are found in the F<eg/> directory of the C<HTML-Parser>
+distribution: the program C<hrefsub> shows how you can edit all links
+found in a document; the program C<htextsub> shows how to edit the text only; the
+program C<hstrip> shows how you can strip out certain tags/elements
+and/or attributes; and the program C<htext> show how to obtain the
+plain text, but not any script/style content.
+
+You can browse the F<eg/> directory online from the I<[Browse]> link on
+the http://search.cpan.org/~gaas/HTML-Parser/ page.
+
+=head1 BUGS
+
+The <style> and <script> sections do not end with the first "</", but
+need the complete corresponding end tag. The standard behaviour is
+not really practical.
+
+When the I<strict_comment> option is enabled, we still recognize
+comments where there is something other than whitespace between even
+and odd "--" markers.
+
+Once $p->boolean_attribute_value has been set, there is no way to
+restore the default behaviour.
+
+There is currently no way to get both quote characters
+into the same literal argspec.
+
+Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them
+to repeat the previous start tag or close the previous start tag
+respectively.
+
+NET tags, e.g. "code/.../" are not recognized. This is SGML
+shorthand for "<code>...</code>".
+
+Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not
+recognized.
+
+=head1 DIAGNOSTICS
+
+The following messages may be produced by HTML::Parser. The notation
+in this listing is the same as used in L<perldiag>:
+
+=over
+
+=item Not a reference to a hash
+
+(F) The object blessed into or subclassed from HTML::Parser is not a
+hash as required by the HTML::Parser methods.
+
+=item Bad signature in parser state object at %p
+
+(F) The _hparser_xs_state element does not refer to a valid state structure.
+Something must have changed the internal value
+stored in this hash element, or the memory has been overwritten.
+
+=item _hparser_xs_state element is not a reference
+
+(F) The _hparser_xs_state element has been destroyed.
+
+=item Can't find '_hparser_xs_state' element in HTML::Parser hash
+
+(F) The _hparser_xs_state element is missing from the parser hash.
+It was either deleted, or not created when the object was created.
+
+=item API version %s not supported by HTML::Parser %s
+
+(F) The constructor option 'api_version' with an argument greater than
+or equal to 4 is reserved for future extensions.
+
+=item Bad constructor option '%s'
+
+(F) An unknown constructor option key was passed to the new() or
+init() methods.
+
+=item Parse loop not allowed
+
+(F) A handler invoked the parse() or parse_file() method.
+This is not permitted.
+
+=item marked sections not supported
+
+(F) The $p->marked_sections() method was invoked in a HTML::Parser
+module that was compiled without support for marked sections.
+
+=item Unknown boolean attribute (%d)
+
+(F) Something is wrong with the internal logic that set up aliases for
+boolean attributes.
+
+=item Only code or array references allowed as handler
+
+(F) The second argument for $p->handler must be either a subroutine
+reference, then name of a subroutine or method, or a reference to an
+array.
+
+=item No handler for %s events
+
+(F) The first argument to $p->handler must be a valid event name; i.e. one
+of "start", "end", "text", "process", "declaration" or "comment".
+
+=item Unrecognized identifier %s in argspec
+
+(F) The identifier is not a known argspec name.
+Use one of the names mentioned in the argspec section above.
+
+=item Literal string is longer than 255 chars in argspec
+
+(F) The current implementation limits the length of literals in
+an argspec to 255 characters. Make the literal shorter.
+
+=item Backslash reserved for literal string in argspec
+
+(F) The backslash character "\" is not allowed in argspec literals.
+It is reserved to permit quoting inside a literal in a later version.
+
+=item Unterminated literal string in argspec
+
+(F) The terminating quote character for a literal was not found.
+
+=item Bad argspec (%s)
+
+(F) Only identifier names, literals, spaces and commas
+are allowed in argspecs.
+
+=item Missing comma separator in argspec
+
+(F) Identifiers in an argspec must be separated with ",".
+
+=item Parsing of undecoded UTF-8 will give garbage when decoding entities
+
+(W) The first chunk parsed appears to contain undecoded UTF-8 and one
+or more argspecs that decode entities are used for the callback
+handlers.
+
+The result of decoding will be a mix of encoded and decoded characters
+for any entities that expand to characters with code above 127. This
+is not a good thing.
+
+The solution is to use the Encode::encode_utf8() on the data before
+feeding it to the $p->parse(). For $p->parse_file() pass a file that
+has been opened in ":utf8" mode.
+
+The parser can process raw undecoded UTF-8 sanely if the C<utf8_mode>
+is enabled or if the "attr", "@attr" or "dtext" argspecs is avoided.
+
+=item Parsing string decoded with wrong endianess
+
+(W) The first character in the document is U+FFFE. This is not a
+legal Unicode character but a byte swapped BOM. The result of parsing
+will likely be garbage.
+
+=item Parsing of undecoded UTF-32
+
+(W) The parser found the Unicode UTF-32 BOM signature at the start
+of the document. The result of parsing will likely be garbage.
+
+=item Parsing of undecoded UTF-16
+
+(W) The parser found the Unicode UTF-16 BOM signature at the start of
+the document. The result of parsing will likely be garbage.
+
+=back
+
+=head1 SEE ALSO
+
+L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>,
+L<HTML::LinkExtor>, L<HTML::Form>
+
+L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution)
+
+http://www.w3.org/TR/html4
+
+More information about marked sections and processing instructions may
+be found at C<http://www.sgml.u-net.com/book/sgml-8.htm>.
+
+=head1 COPYRIGHT
+
+ Copyright 1996-2007 Gisle Aas. All rights reserved.
+ Copyright 1999-2000 Michael A. Chase. All rights reserved.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
--- /dev/null
+/* $Id: Parser.xs,v 2.137 2007/01/12 10:18:39 gisle Exp $
+ *
+ * Copyright 1999-2005, Gisle Aas.
+ * Copyright 1999-2000, Michael A. Chase.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the same terms as Perl itself.
+ */
+
+
+/*
+ * Standard XS greeting.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+#define PERL_NO_GET_CONTEXT /* we want efficiency */
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+#ifdef __cplusplus
+}
+#endif
+
+
+
+/*
+ * Some perl version compatibility gruff.
+ */
+#include "patchlevel.h"
+#if PATCHLEVEL <= 4 /* perl5.004_XX */
+
+#ifndef PL_sv_undef
+ #define PL_sv_undef sv_undef
+ #define PL_sv_yes sv_yes
+#endif
+
+#ifndef PL_hexdigit
+ #define PL_hexdigit hexdigit
+#endif
+
+#ifndef ERRSV
+ #define ERRSV GvSV(errgv)
+#endif
+
+#if (PATCHLEVEL == 4 && SUBVERSION <= 4)
+/* The newSVpvn function was introduced in perl5.004_05 */
+static SV *
+newSVpvn(char *s, STRLEN len)
+{
+ register SV *sv = newSV(0);
+ sv_setpvn(sv,s,len);
+ return sv;
+}
+#endif /* not perl5.004_05 */
+#endif /* perl5.004_XX */
+
+#ifndef dNOOP
+ #define dNOOP extern int errno
+#endif
+#ifndef dTHX
+ #define dTHX dNOOP
+ #define pTHX_
+ #define aTHX_
+#endif
+
+#ifndef MEMBER_TO_FPTR
+ #define MEMBER_TO_FPTR(x) (x)
+#endif
+
+#ifndef INT2PTR
+ #define INT2PTR(any,d) (any)(d)
+ #define PTR2IV(p) (IV)(p)
+#endif
+
+
+#if PATCHLEVEL > 6 || (PATCHLEVEL == 6 && SUBVERSION > 0)
+ #define RETHROW croak(Nullch)
+#else
+ #define RETHROW { STRLEN my_na; croak("%s", SvPV(ERRSV, my_na)); }
+#endif
+
+#if PATCHLEVEL < 8
+ /* No useable Unicode support */
+ /* Make these harmless if present */
+ #undef SvUTF8
+ #undef SvUTF8_on
+ #undef SvUTF8_off
+ #define SvUTF8(sv) 0
+ #define SvUTF8_on(sv) 0
+ #define SvUTF8_off(sv) 0
+#else
+ #define UNICODE_HTML_PARSER
+#endif
+
+#ifdef G_WARN_ON
+ #define DOWARN (PL_dowarn & G_WARN_ON)
+#else
+ #define DOWARN PL_dowarn
+#endif
+
+/*
+ * Include stuff. We include .c files instead of linking them,
+ * so that they don't have to pollute the external dll name space.
+ */
+
+#ifdef EXTERN
+ #undef EXTERN
+#endif
+
+#define EXTERN static /* Don't pollute */
+
+#include "hparser.h"
+#include "util.c"
+#include "hparser.c"
+
+
+/*
+ * Support functions for the XS glue
+ */
+
+static SV*
+check_handler(pTHX_ SV* h)
+{
+ if (SvROK(h)) {
+ SV* myref = SvRV(h);
+ if (SvTYPE(myref) == SVt_PVCV)
+ return newSVsv(h);
+ if (SvTYPE(myref) == SVt_PVAV)
+ return SvREFCNT_inc(myref);
+ croak("Only code or array references allowed as handler");
+ }
+ return SvOK(h) ? newSVsv(h) : 0;
+}
+
+
+static PSTATE*
+get_pstate_iv(pTHX_ SV* sv)
+{
+ PSTATE *p;
+#if PATCHLEVEL < 8
+ p = INT2PTR(PSTATE*, SvIV(sv));
+#else
+ MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, '~') : NULL;
+
+ if (!mg)
+ croak("Lost parser state magic");
+ p = (PSTATE *)mg->mg_ptr;
+ if (!p)
+ croak("Lost parser state magic");
+#endif
+ if (p->signature != P_SIGNATURE)
+ croak("Bad signature in parser state object at %p", p);
+ return p;
+}
+
+
+static PSTATE*
+get_pstate_hv(pTHX_ SV* sv) /* used by XS typemap */
+{
+ HV* hv;
+ SV** svp;
+
+ sv = SvRV(sv);
+ if (!sv || SvTYPE(sv) != SVt_PVHV)
+ croak("Not a reference to a hash");
+ hv = (HV*)sv;
+ svp = hv_fetch(hv, "_hparser_xs_state", 17, 0);
+ if (svp) {
+ if (SvROK(*svp))
+ return get_pstate_iv(aTHX_ SvRV(*svp));
+ else
+ croak("_hparser_xs_state element is not a reference");
+ }
+ croak("Can't find '_hparser_xs_state' element in HTML::Parser hash");
+ return 0;
+}
+
+
+static void
+free_pstate(pTHX_ PSTATE* pstate)
+{
+ int i;
+ SvREFCNT_dec(pstate->buf);
+ SvREFCNT_dec(pstate->pend_text);
+ SvREFCNT_dec(pstate->skipped_text);
+#ifdef MARKED_SECTION
+ SvREFCNT_dec(pstate->ms_stack);
+#endif
+ SvREFCNT_dec(pstate->bool_attr_val);
+ for (i = 0; i < EVENT_COUNT; i++) {
+ SvREFCNT_dec(pstate->handlers[i].cb);
+ SvREFCNT_dec(pstate->handlers[i].argspec);
+ }
+
+ SvREFCNT_dec(pstate->report_tags);
+ SvREFCNT_dec(pstate->ignore_tags);
+ SvREFCNT_dec(pstate->ignore_elements);
+ SvREFCNT_dec(pstate->ignoring_element);
+
+ SvREFCNT_dec(pstate->tmp);
+
+ pstate->signature = 0;
+ Safefree(pstate);
+}
+
+static int
+magic_free_pstate(pTHX_ SV *sv, MAGIC *mg)
+{
+#if PATCHLEVEL < 8
+ free_pstate(aTHX_ get_pstate_iv(aTHX_ sv));
+#else
+ free_pstate(aTHX_ (PSTATE *)mg->mg_ptr);
+#endif
+ return 0;
+}
+
+#if defined(USE_ITHREADS) && PATCHLEVEL >= 8
+
+static PSTATE *
+dup_pstate(pTHX_ PSTATE *pstate, CLONE_PARAMS *params)
+{
+ PSTATE *pstate2;
+ int i;
+
+ Newz(56, pstate2, 1, PSTATE);
+ pstate2->signature = pstate->signature;
+
+ pstate2->buf = SvREFCNT_inc(sv_dup(pstate->buf, params));
+ pstate2->offset = pstate->offset;
+ pstate2->line = pstate->line;
+ pstate2->column = pstate->column;
+ pstate2->start_document = pstate->start_document;
+ pstate2->parsing = pstate->parsing;
+ pstate2->eof = pstate->eof;
+
+ pstate2->literal_mode = pstate->literal_mode;
+ pstate2->is_cdata = pstate->is_cdata;
+ pstate2->no_dash_dash_comment_end = pstate->no_dash_dash_comment_end;
+ pstate2->pending_end_tag = pstate->pending_end_tag;
+
+ pstate2->pend_text = SvREFCNT_inc(sv_dup(pstate->pend_text, params));
+ pstate2->pend_text_is_cdata = pstate->pend_text_is_cdata;
+ pstate2->pend_text_offset = pstate->pend_text_offset;
+ pstate2->pend_text_line = pstate->pend_text_offset;
+ pstate2->pend_text_column = pstate->pend_text_column;
+
+ pstate2->skipped_text = SvREFCNT_inc(sv_dup(pstate->skipped_text, params));
+
+#ifdef MARKED_SECTION
+ pstate2->ms = pstate->ms;
+ pstate2->ms_stack =
+ (AV *)SvREFCNT_inc(sv_dup((SV *)pstate->ms_stack, params));
+ pstate2->marked_sections = pstate->marked_sections;
+#endif
+
+ pstate2->strict_comment = pstate->strict_comment;
+ pstate2->strict_names = pstate->strict_names;
+ pstate2->strict_end = pstate->strict_end;
+ pstate2->xml_mode = pstate->xml_mode;
+ pstate2->unbroken_text = pstate->unbroken_text;
+ pstate2->attr_encoded = pstate->attr_encoded;
+ pstate2->case_sensitive = pstate->case_sensitive;
+ pstate2->closing_plaintext = pstate->closing_plaintext;
+ pstate2->utf8_mode = pstate->utf8_mode;
+ pstate2->empty_element_tags = pstate->empty_element_tags;
+ pstate2->xml_pic = pstate->xml_pic;
+
+ pstate2->bool_attr_val =
+ SvREFCNT_inc(sv_dup(pstate->bool_attr_val, params));
+ for (i = 0; i < EVENT_COUNT; i++) {
+ pstate2->handlers[i].cb =
+ SvREFCNT_inc(sv_dup(pstate->handlers[i].cb, params));
+ pstate2->handlers[i].argspec =
+ SvREFCNT_inc(sv_dup(pstate->handlers[i].argspec, params));
+ }
+ pstate2->argspec_entity_decode = pstate->argspec_entity_decode;
+
+ pstate2->report_tags =
+ (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->report_tags, params));
+ pstate2->ignore_tags =
+ (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_tags, params));
+ pstate2->ignore_elements =
+ (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_elements, params));
+
+ pstate2->ignoring_element =
+ SvREFCNT_inc(sv_dup(pstate->ignoring_element, params));
+ pstate2->ignore_depth = pstate->ignore_depth;
+
+ if (params->flags & CLONEf_JOIN_IN) {
+ pstate2->entity2char =
+ perl_get_hv("HTML::Entities::entity2char", TRUE);
+ } else {
+ pstate2->entity2char = (HV *)sv_dup((SV *)pstate->entity2char, params);
+ }
+ pstate2->tmp = SvREFCNT_inc(sv_dup(pstate->tmp, params));
+
+ return pstate2;
+}
+
+static int
+magic_dup_pstate(pTHX_ MAGIC *mg, CLONE_PARAMS *params)
+{
+ mg->mg_ptr = (char *)dup_pstate(aTHX_ (PSTATE *)mg->mg_ptr, params);
+ return 0;
+}
+
+#endif
+
+MGVTBL vtbl_pstate =
+{
+ 0,
+ 0,
+ 0,
+ 0,
+ MEMBER_TO_FPTR(magic_free_pstate),
+#if defined(USE_ITHREADS) && PATCHLEVEL >= 8
+ 0,
+ MEMBER_TO_FPTR(magic_dup_pstate),
+#endif
+};
+
+
+/*
+ * XS interface definition.
+ */
+
+MODULE = HTML::Parser PACKAGE = HTML::Parser
+
+PROTOTYPES: DISABLE
+
+void
+_alloc_pstate(self)
+ SV* self;
+ PREINIT:
+ PSTATE* pstate;
+ SV* sv;
+ HV* hv;
+ MAGIC* mg;
+
+ CODE:
+ sv = SvRV(self);
+ if (!sv || SvTYPE(sv) != SVt_PVHV)
+ croak("Not a reference to a hash");
+ hv = (HV*)sv;
+
+ Newz(56, pstate, 1, PSTATE);
+ pstate->signature = P_SIGNATURE;
+ pstate->entity2char = perl_get_hv("HTML::Entities::entity2char", TRUE);
+ pstate->tmp = NEWSV(0, 20);
+
+ sv = newSViv(PTR2IV(pstate));
+#if PATCHLEVEL < 8
+ sv_magic(sv, 0, '~', 0, 0);
+#else
+ sv_magic(sv, 0, '~', (char *)pstate, 0);
+#endif
+ mg = mg_find(sv, '~');
+ assert(mg);
+ mg->mg_virtual = &vtbl_pstate;
+#if defined(USE_ITHREADS) && PATCHLEVEL >= 8
+ mg->mg_flags |= MGf_DUP;
+#endif
+ SvREADONLY_on(sv);
+
+ hv_store(hv, "_hparser_xs_state", 17, newRV_noinc(sv), 0);
+
+void
+parse(self, chunk)
+ SV* self;
+ SV* chunk
+ PREINIT:
+ PSTATE* p_state = get_pstate_hv(aTHX_ self);
+ PPCODE:
+ if (p_state->parsing)
+ croak("Parse loop not allowed");
+ p_state->parsing = 1;
+ if (SvROK(chunk) && SvTYPE(SvRV(chunk)) == SVt_PVCV) {
+ SV* generator = chunk;
+ STRLEN len;
+ do {
+ int count;
+ PUSHMARK(SP);
+ count = perl_call_sv(generator, G_SCALAR|G_EVAL);
+ SPAGAIN;
+ chunk = count ? POPs : 0;
+ PUTBACK;
+
+ if (SvTRUE(ERRSV)) {
+ p_state->parsing = 0;
+ p_state->eof = 0;
+ RETHROW;
+ }
+
+ if (chunk && SvOK(chunk)) {
+ (void)SvPV(chunk, len); /* get length */
+ }
+ else {
+ len = 0;
+ }
+ parse(aTHX_ p_state, len ? chunk : 0, self);
+ SPAGAIN;
+
+ } while (len && !p_state->eof);
+ }
+ else {
+ parse(aTHX_ p_state, chunk, self);
+ SPAGAIN;
+ }
+ p_state->parsing = 0;
+ if (p_state->eof) {
+ p_state->eof = 0;
+ PUSHs(sv_newmortal());
+ }
+ else {
+ PUSHs(self);
+ }
+
+void
+eof(self)
+ SV* self;
+ PREINIT:
+ PSTATE* p_state = get_pstate_hv(aTHX_ self);
+ PPCODE:
+ if (p_state->parsing)
+ p_state->eof = 1;
+ else {
+ p_state->parsing = 1;
+ parse(aTHX_ p_state, 0, self); /* flush */
+ p_state->parsing = 0;
+ }
+ PUSHs(self);
+
+SV*
+strict_comment(pstate,...)
+ PSTATE* pstate
+ ALIAS:
+ HTML::Parser::strict_comment = 1
+ HTML::Parser::strict_names = 2
+ HTML::Parser::xml_mode = 3
+ HTML::Parser::unbroken_text = 4
+ HTML::Parser::marked_sections = 5
+ HTML::Parser::attr_encoded = 6
+ HTML::Parser::case_sensitive = 7
+ HTML::Parser::strict_end = 8
+ HTML::Parser::closing_plaintext = 9
+ HTML::Parser::utf8_mode = 10
+ HTML::Parser::empty_element_tags = 11
+ HTML::Parser::xml_pic = 12
+ PREINIT:
+ bool *attr;
+ CODE:
+ switch (ix) {
+ case 1: attr = &pstate->strict_comment; break;
+ case 2: attr = &pstate->strict_names; break;
+ case 3: attr = &pstate->xml_mode; break;
+ case 4: attr = &pstate->unbroken_text; break;
+ case 5:
+#ifdef MARKED_SECTION
+ attr = &pstate->marked_sections; break;
+#else
+ croak("marked sections not supported"); break;
+#endif
+ case 6: attr = &pstate->attr_encoded; break;
+ case 7: attr = &pstate->case_sensitive; break;
+ case 8: attr = &pstate->strict_end; break;
+ case 9: attr = &pstate->closing_plaintext; break;
+#ifdef UNICODE_HTML_PARSER
+ case 10: attr = &pstate->utf8_mode; break;
+#else
+ case 10: croak("The utf8_mode does not work with this perl; perl-5.8 or better required");
+#endif
+ case 11: attr = &pstate->empty_element_tags; break;
+ case 12: attr = &pstate->xml_pic; break;
+ default:
+ croak("Unknown boolean attribute (%d)", ix);
+ }
+ RETVAL = boolSV(*attr);
+ if (items > 1)
+ *attr = SvTRUE(ST(1));
+ OUTPUT:
+ RETVAL
+
+SV*
+boolean_attribute_value(pstate,...)
+ PSTATE* pstate
+ CODE:
+ RETVAL = pstate->bool_attr_val ? newSVsv(pstate->bool_attr_val)
+ : &PL_sv_undef;
+ if (items > 1) {
+ SvREFCNT_dec(pstate->bool_attr_val);
+ pstate->bool_attr_val = newSVsv(ST(1));
+ }
+ OUTPUT:
+ RETVAL
+
+void
+ignore_tags(pstate,...)
+ PSTATE* pstate
+ ALIAS:
+ HTML::Parser::report_tags = 1
+ HTML::Parser::ignore_tags = 2
+ HTML::Parser::ignore_elements = 3
+ PREINIT:
+ HV** attr;
+ int i;
+ CODE:
+ switch (ix) {
+ case 1: attr = &pstate->report_tags; break;
+ case 2: attr = &pstate->ignore_tags; break;
+ case 3: attr = &pstate->ignore_elements; break;
+ default:
+ croak("Unknown tag-list attribute (%d)", ix);
+ }
+ if (GIMME_V != G_VOID)
+ croak("Can't report tag lists yet");
+
+ items--; /* pstate */
+ if (items) {
+ if (*attr)
+ hv_clear(*attr);
+ else
+ *attr = newHV();
+
+ for (i = 0; i < items; i++) {
+ SV* sv = ST(i+1);
+ if (SvROK(sv)) {
+ sv = SvRV(sv);
+ if (SvTYPE(sv) == SVt_PVAV) {
+ AV* av = (AV*)sv;
+ STRLEN j;
+ STRLEN len = av_len(av) + 1;
+ for (j = 0; j < len; j++) {
+ SV**svp = av_fetch(av, j, 0);
+ if (svp) {
+ hv_store_ent(*attr, *svp, newSViv(0), 0);
+ }
+ }
+ }
+ else
+ croak("Tag list must be plain scalars and arrays");
+ }
+ else {
+ hv_store_ent(*attr, sv, newSViv(0), 0);
+ }
+ }
+ }
+ else if (*attr) {
+ SvREFCNT_dec(*attr);
+ *attr = 0;
+ }
+
+void
+handler(pstate, eventname,...)
+ PSTATE* pstate
+ SV* eventname
+ PREINIT:
+ STRLEN name_len;
+ char *name = SvPV(eventname, name_len);
+ int event = -1;
+ int i;
+ struct p_handler *h;
+ PPCODE:
+ /* map event name string to event_id */
+ for (i = 0; i < EVENT_COUNT; i++) {
+ if (strEQ(name, event_id_str[i])) {
+ event = i;
+ break;
+ }
+ }
+ if (event < 0)
+ croak("No handler for %s events", name);
+
+ h = &pstate->handlers[event];
+
+ /* set up return value */
+ if (h->cb) {
+ PUSHs((SvTYPE(h->cb) == SVt_PVAV)
+ ? sv_2mortal(newRV_inc(h->cb))
+ : sv_2mortal(newSVsv(h->cb)));
+ }
+ else {
+ PUSHs(&PL_sv_undef);
+ }
+
+ /* update */
+ if (items > 3) {
+ SvREFCNT_dec(h->argspec);
+ h->argspec = 0;
+ h->argspec = argspec_compile(ST(3), pstate);
+ }
+ if (items > 2) {
+ SvREFCNT_dec(h->cb);
+ h->cb = 0;
+ h->cb = check_handler(aTHX_ ST(2));
+ }
+
+
+MODULE = HTML::Parser PACKAGE = HTML::Entities
+
+void
+decode_entities(...)
+ PREINIT:
+ int i;
+ HV *entity2char = perl_get_hv("HTML::Entities::entity2char", FALSE);
+ PPCODE:
+ if (GIMME_V == G_SCALAR && items > 1)
+ items = 1;
+ for (i = 0; i < items; i++) {
+ if (GIMME_V != G_VOID)
+ ST(i) = sv_2mortal(newSVsv(ST(i)));
+ else if (SvREADONLY(ST(i)))
+ croak("Can't inline decode readonly string");
+ decode_entities(aTHX_ ST(i), entity2char, 0);
+ }
+ SP += items;
+
+void
+_decode_entities(string, entities, ...)
+ SV* string
+ SV* entities
+ PREINIT:
+ HV* entities_hv;
+ bool expand_prefix = (items > 2) ? SvTRUE(ST(2)) : 0;
+ CODE:
+ if (SvOK(entities)) {
+ if (SvROK(entities) && SvTYPE(SvRV(entities)) == SVt_PVHV) {
+ entities_hv = (HV*)SvRV(entities);
+ }
+ else {
+ croak("2nd argument must be hash reference");
+ }
+ }
+ else {
+ entities_hv = 0;
+ }
+ if (SvREADONLY(string))
+ croak("Can't inline decode readonly string");
+ decode_entities(aTHX_ string, entities_hv, expand_prefix);
+
+bool
+_probably_utf8_chunk(string)
+ SV* string
+ PREINIT:
+ STRLEN len;
+ char *s;
+ CODE:
+#ifdef UNICODE_HTML_PARSER
+ sv_utf8_downgrade(string, 0);
+ s = SvPV(string, len);
+ RETVAL = probably_utf8_chunk(aTHX_ s, len);
+#else
+ RETVAL = 0; /* avoid never initialized complains from compiler */
+ croak("_probably_utf8_chunk() only works for Unicode enabled perls");
+#endif
+ OUTPUT:
+ RETVAL
+
+int
+UNICODE_SUPPORT()
+ PROTOTYPE:
+ CODE:
+#ifdef UNICODE_HTML_PARSER
+ RETVAL = 1;
+#else
+ RETVAL = 0;
+#endif
+ OUTPUT:
+ RETVAL
+
+
+MODULE = HTML::Parser PACKAGE = HTML::Parser
--- /dev/null
+if ($Config{gccversion}) {
+ print "Turning off optimizations to avoid compiler bug\n";
+ $self->{OPTIMIZE} = " ";
+}
--- /dev/null
+/* $Id: hparser.c,v 2.134 2007/01/12 10:54:06 gisle Exp $
+ *
+ * Copyright 1999-2007, Gisle Aas
+ * Copyright 1999-2000, Michael A. Chase
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the same terms as Perl itself.
+ */
+
+#ifndef EXTERN
+#define EXTERN extern
+#endif
+
+#include "hctype.h" /* isH...() macros */
+#include "tokenpos.h" /* dTOKEN; PUSH_TOKEN() */
+
+
+static
+struct literal_tag {
+ int len;
+ char* str;
+ int is_cdata;
+}
+literal_mode_elem[] =
+{
+ {6, "script", 1},
+ {5, "style", 1},
+ {3, "xmp", 1},
+ {9, "plaintext", 1},
+ {5, "title", 0},
+ {8, "textarea", 0},
+ {0, 0, 0}
+};
+
+enum argcode {
+ ARG_SELF = 1, /* need to avoid '\0' in argspec string */
+ ARG_TOKENS,
+ ARG_TOKENPOS,
+ ARG_TOKEN0,
+ ARG_TAGNAME,
+ ARG_TAG,
+ ARG_ATTR,
+ ARG_ATTRARR,
+ ARG_ATTRSEQ,
+ ARG_TEXT,
+ ARG_DTEXT,
+ ARG_IS_CDATA,
+ ARG_SKIPPED_TEXT,
+ ARG_OFFSET,
+ ARG_OFFSET_END,
+ ARG_LENGTH,
+ ARG_LINE,
+ ARG_COLUMN,
+ ARG_EVENT,
+ ARG_UNDEF,
+ ARG_LITERAL, /* Always keep last */
+
+ /* extra flags always encoded first */
+ ARG_FLAG_FLAT_ARRAY
+};
+
+char *argname[] = {
+ /* Must be in the same order as enum argcode */
+ "self", /* ARG_SELF */
+ "tokens", /* ARG_TOKENS */
+ "tokenpos", /* ARG_TOKENPOS */
+ "token0", /* ARG_TOKEN0 */
+ "tagname", /* ARG_TAGNAME */
+ "tag", /* ARG_TAG */
+ "attr", /* ARG_ATTR */
+ "@attr", /* ARG_ATTRARR */
+ "attrseq", /* ARG_ATTRSEQ */
+ "text", /* ARG_TEXT */
+ "dtext", /* ARG_DTEXT */
+ "is_cdata", /* ARG_IS_CDATA */
+ "skipped_text", /* ARG_SKIPPED_TEXT */
+ "offset", /* ARG_OFFSET */
+ "offset_end", /* ARG_OFFSET_END */
+ "length", /* ARG_LENGTH */
+ "line", /* ARG_LINE */
+ "column", /* ARG_COLUMN */
+ "event", /* ARG_EVENT */
+ "undef", /* ARG_UNDEF */
+ /* ARG_LITERAL (not compared) */
+ /* ARG_FLAG_FLAT_ARRAY */
+};
+
+#define CASE_SENSITIVE(p_state) \
+ ((p_state)->xml_mode || (p_state)->case_sensitive)
+#define STRICT_NAMES(p_state) \
+ ((p_state)->xml_mode || (p_state)->strict_names)
+#define ALLOW_EMPTY_TAG(p_state) \
+ ((p_state)->xml_mode || (p_state)->empty_element_tags)
+
+static void flush_pending_text(PSTATE* p_state, SV* self);
+
+/*
+ * Parser functions.
+ *
+ * parse() - top level entry point.
+ * deals with text and calls one of its
+ * subordinate parse_*() routines after
+ * looking at the first char after "<"
+ * parse_decl() - deals with declarations <!...>
+ * parse_comment() - deals with <!-- ... -->
+ * parse_marked_section - deals with <![ ... [ ... ]]>
+ * parse_end() - deals with end tags </...>
+ * parse_start() - deals with start tags <A...>
+ * parse_process() - deals with process instructions <?...>
+ * parse_null() - deals with anything else <....>
+ *
+ * report_event() - called whenever any of the parse*() routines
+ * has recongnized something.
+ */
+
+static void
+report_event(PSTATE* p_state,
+ event_id_t event,
+ char *beg, char *end, U32 utf8,
+ token_pos_t *tokens, int num_tokens,
+ SV* self
+ )
+{
+ struct p_handler *h;
+ dTHX;
+ dSP;
+ AV *array;
+ STRLEN my_na;
+ char *argspec;
+ char *s;
+ STRLEN offset;
+ STRLEN line;
+ STRLEN column;
+
+#ifdef UNICODE_HTML_PARSER
+ #define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b))
+#else
+ #define CHR_DIST(a,b) ((a) - (b))
+#endif
+
+ /* some events might still fire after a handler has signaled eof
+ * so suppress them here.
+ */
+ if (p_state->eof)
+ return;
+
+ /* capture offsets */
+ offset = p_state->offset;
+ line = p_state->line;
+ column = p_state->column;
+
+#if 0
+ { /* used for debugging at some point */
+ char *s = beg;
+ int i;
+
+ /* print debug output */
+ switch(event) {
+ case E_DECLARATION: printf("DECLARATION"); break;
+ case E_COMMENT: printf("COMMENT"); break;
+ case E_START: printf("START"); break;
+ case E_END: printf("END"); break;
+ case E_TEXT: printf("TEXT"); break;
+ case E_PROCESS: printf("PROCESS"); break;
+ case E_NONE: printf("NONE"); break;
+ default: printf("EVENT #%d", event); break;
+ }
+
+ printf(" [");
+ while (s < end) {
+ if (*s == '\n') {
+ putchar('\\'); putchar('n');
+ }
+ else
+ putchar(*s);
+ s++;
+ }
+ printf("] %d\n", end - beg);
+ for (i = 0; i < num_tokens; i++) {
+ printf(" token %d: %d %d\n",
+ i,
+ tokens[i].beg - beg,
+ tokens[i].end - tokens[i].beg);
+ }
+ }
+#endif
+
+ if (p_state->pending_end_tag && event != E_TEXT && event != E_COMMENT) {
+ token_pos_t t;
+ char dummy;
+ t.beg = p_state->pending_end_tag;
+ t.end = p_state->pending_end_tag + strlen(p_state->pending_end_tag);
+ p_state->pending_end_tag = 0;
+ report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self);
+ SPAGAIN;
+ }
+
+ /* update offsets */
+ p_state->offset += CHR_DIST(end, beg);
+ if (line) {
+ char *s = beg;
+ char *nl = NULL;
+ while (s < end) {
+ if (*s == '\n') {
+ p_state->line++;
+ nl = s;
+ }
+ s++;
+ }
+ if (nl)
+ p_state->column = CHR_DIST(end, nl) - 1;
+ else
+ p_state->column += CHR_DIST(end, beg);
+ }
+
+ if (event == E_NONE)
+ goto IGNORE_EVENT;
+
+#ifdef MARKED_SECTION
+ if (p_state->ms == MS_IGNORE)
+ goto IGNORE_EVENT;
+#endif
+
+ /* tag filters */
+ if (p_state->ignore_tags || p_state->report_tags || p_state->ignore_elements) {
+
+ if (event == E_START || event == E_END) {
+ SV* tagname = p_state->tmp;
+
+ assert(num_tokens >= 1);
+ sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg);
+ if (utf8)
+ SvUTF8_on(tagname);
+ else
+ SvUTF8_off(tagname);
+ if (!CASE_SENSITIVE(p_state))
+ sv_lower(aTHX_ tagname);
+
+ if (p_state->ignoring_element) {
+ if (sv_eq(p_state->ignoring_element, tagname)) {
+ if (event == E_START)
+ p_state->ignore_depth++;
+ else if (--p_state->ignore_depth == 0) {
+ SvREFCNT_dec(p_state->ignoring_element);
+ p_state->ignoring_element = 0;
+ }
+ }
+ goto IGNORE_EVENT;
+ }
+
+ if (p_state->ignore_elements &&
+ hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0))
+ {
+ if (event == E_START) {
+ p_state->ignoring_element = newSVsv(tagname);
+ p_state->ignore_depth = 1;
+ }
+ goto IGNORE_EVENT;
+ }
+
+ if (p_state->ignore_tags &&
+ hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0))
+ {
+ goto IGNORE_EVENT;
+ }
+ if (p_state->report_tags &&
+ !hv_fetch_ent(p_state->report_tags, tagname, 0, 0))
+ {
+ goto IGNORE_EVENT;
+ }
+ }
+ else if (p_state->ignoring_element) {
+ goto IGNORE_EVENT;
+ }
+ }
+
+ h = &p_state->handlers[event];
+ if (!h->cb) {
+ /* event = E_DEFAULT; */
+ h = &p_state->handlers[E_DEFAULT];
+ if (!h->cb)
+ goto IGNORE_EVENT;
+ }
+
+ if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) {
+ /* FALSE scalar ('' or 0) means IGNORE this event */
+ return;
+ }
+
+ if (p_state->unbroken_text && event == E_TEXT) {
+ /* should buffer text */
+ if (!p_state->pend_text)
+ p_state->pend_text = newSV(256);
+ if (SvOK(p_state->pend_text)) {
+ if (p_state->is_cdata != p_state->pend_text_is_cdata) {
+ flush_pending_text(p_state, self);
+ SPAGAIN;
+ goto INIT_PEND_TEXT;
+ }
+ }
+ else {
+ INIT_PEND_TEXT:
+ p_state->pend_text_offset = offset;
+ p_state->pend_text_line = line;
+ p_state->pend_text_column = column;
+ p_state->pend_text_is_cdata = p_state->is_cdata;
+ sv_setpvn(p_state->pend_text, "", 0);
+ if (!utf8)
+ SvUTF8_off(p_state->pend_text);
+ }
+#ifdef UNICODE_HTML_PARSER
+ if (utf8 && !SvUTF8(p_state->pend_text))
+ sv_utf8_upgrade(p_state->pend_text);
+ if (utf8 || !SvUTF8(p_state->pend_text)) {
+ sv_catpvn(p_state->pend_text, beg, end - beg);
+ }
+ else {
+ SV *tmp = newSVpvn(beg, end - beg);
+ sv_utf8_upgrade(tmp);
+ sv_catsv(p_state->pend_text, tmp);
+ SvREFCNT_dec(tmp);
+ }
+#else
+ sv_catpvn(p_state->pend_text, beg, end - beg);
+#endif
+ return;
+ }
+ else if (p_state->pend_text && SvOK(p_state->pend_text)) {
+ flush_pending_text(p_state, self);
+ SPAGAIN;
+ }
+
+ /* At this point we have decided to generate an event callback */
+
+ argspec = h->argspec ? SvPV(h->argspec, my_na) : "";
+
+ if (SvTYPE(h->cb) == SVt_PVAV) {
+
+ if (*argspec == ARG_FLAG_FLAT_ARRAY) {
+ argspec++;
+ array = (AV*)h->cb;
+ }
+ else {
+ /* start sub-array for accumulator array */
+ array = newAV();
+ }
+ }
+ else {
+ array = 0;
+ if (*argspec == ARG_FLAG_FLAT_ARRAY)
+ argspec++;
+
+ /* start argument stack for callback */
+ ENTER;
+ SAVETMPS;
+ PUSHMARK(SP);
+ }
+
+ for (s = argspec; *s; s++) {
+ SV* arg = 0;
+ int push_arg = 1;
+ enum argcode argcode = (enum argcode)*s;
+
+ switch( argcode ) {
+
+ case ARG_SELF:
+ arg = sv_mortalcopy(self);
+ break;
+
+ case ARG_TOKENS:
+ if (num_tokens >= 1) {
+ AV* av = newAV();
+ SV* prev_token = &PL_sv_undef;
+ int i;
+ av_extend(av, num_tokens);
+ for (i = 0; i < num_tokens; i++) {
+ if (tokens[i].beg) {
+ prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg);
+ if (utf8)
+ SvUTF8_on(prev_token);
+ av_push(av, prev_token);
+ }
+ else { /* boolean */
+ av_push(av, p_state->bool_attr_val
+ ? newSVsv(p_state->bool_attr_val)
+ : newSVsv(prev_token));
+ }
+ }
+ arg = sv_2mortal(newRV_noinc((SV*)av));
+ }
+ break;
+
+ case ARG_TOKENPOS:
+ if (num_tokens >= 1 && tokens[0].beg >= beg) {
+ AV* av = newAV();
+ int i;
+ av_extend(av, num_tokens*2);
+ for (i = 0; i < num_tokens; i++) {
+ if (tokens[i].beg) {
+ av_push(av, newSViv(CHR_DIST(tokens[i].beg, beg)));
+ av_push(av, newSViv(CHR_DIST(tokens[i].end, tokens[i].beg)));
+ }
+ else { /* boolean tag value */
+ av_push(av, newSViv(0));
+ av_push(av, newSViv(0));
+ }
+ }
+ arg = sv_2mortal(newRV_noinc((SV*)av));
+ }
+ break;
+
+ case ARG_TOKEN0:
+ case ARG_TAGNAME:
+ /* fall through */
+
+ case ARG_TAG:
+ if (num_tokens >= 1) {
+ arg = sv_2mortal(newSVpvn(tokens[0].beg,
+ tokens[0].end - tokens[0].beg));
+ if (utf8)
+ SvUTF8_on(arg);
+ if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0)
+ sv_lower(aTHX_ arg);
+ if (argcode == ARG_TAG && event != E_START) {
+ char *e_type = "!##/#?#";
+ sv_insert(arg, 0, 0, &e_type[event], 1);
+ }
+ }
+ break;
+
+ case ARG_ATTR:
+ case ARG_ATTRARR:
+ if (event == E_START) {
+ HV* hv;
+ int i;
+ if (argcode == ARG_ATTR) {
+ hv = newHV();
+ arg = sv_2mortal(newRV_noinc((SV*)hv));
+ }
+ else {
+#ifdef __GNUC__
+ /* gcc -Wall reports this variable as possibly used uninitialized */
+ hv = 0;
+#endif
+ push_arg = 0; /* deal with argument pushing here */
+ }
+
+ for (i = 1; i < num_tokens; i += 2) {
+ SV* attrname = newSVpvn(tokens[i].beg,
+ tokens[i].end-tokens[i].beg);
+ SV* attrval;
+
+ if (utf8)
+ SvUTF8_on(attrname);
+ if (tokens[i+1].beg) {
+ char *beg = tokens[i+1].beg;
+ STRLEN len = tokens[i+1].end - beg;
+ if (*beg == '"' || *beg == '\'') {
+ assert(len >= 2 && *beg == beg[len-1]);
+ beg++; len -= 2;
+ }
+ attrval = newSVpvn(beg, len);
+ if (utf8)
+ SvUTF8_on(attrval);
+ if (!p_state->attr_encoded) {
+#ifdef UNICODE_HTML_PARSER
+ if (p_state->utf8_mode)
+ sv_utf8_decode(attrval);
+#endif
+ decode_entities(aTHX_ attrval, p_state->entity2char, 0);
+ if (p_state->utf8_mode)
+ SvUTF8_off(attrval);
+ }
+ }
+ else { /* boolean */
+ if (p_state->bool_attr_val)
+ attrval = newSVsv(p_state->bool_attr_val);
+ else
+ attrval = newSVsv(attrname);
+ }
+
+ if (!CASE_SENSITIVE(p_state))
+ sv_lower(aTHX_ attrname);
+
+ if (argcode == ARG_ATTR) {
+ if (hv_exists_ent(hv, attrname, 0) ||
+ !hv_store_ent(hv, attrname, attrval, 0)) {
+ SvREFCNT_dec(attrval);
+ }
+ SvREFCNT_dec(attrname);
+ }
+ else { /* ARG_ATTRARR */
+ if (array) {
+ av_push(array, attrname);
+ av_push(array, attrval);
+ }
+ else {
+ XPUSHs(sv_2mortal(attrname));
+ XPUSHs(sv_2mortal(attrval));
+ }
+ }
+ }
+ }
+ else if (argcode == ARG_ATTRARR) {
+ push_arg = 0;
+ }
+ break;
+
+ case ARG_ATTRSEQ: /* (v2 compatibility stuff) */
+ if (event == E_START) {
+ AV* av = newAV();
+ int i;
+ for (i = 1; i < num_tokens; i += 2) {
+ SV* attrname = newSVpvn(tokens[i].beg,
+ tokens[i].end-tokens[i].beg);
+ if (utf8)
+ SvUTF8_on(attrname);
+ if (!CASE_SENSITIVE(p_state))
+ sv_lower(aTHX_ attrname);
+ av_push(av, attrname);
+ }
+ arg = sv_2mortal(newRV_noinc((SV*)av));
+ }
+ break;
+
+ case ARG_TEXT:
+ arg = sv_2mortal(newSVpvn(beg, end - beg));
+ if (utf8)
+ SvUTF8_on(arg);
+ break;
+
+ case ARG_DTEXT:
+ if (event == E_TEXT) {
+ arg = sv_2mortal(newSVpvn(beg, end - beg));
+ if (utf8)
+ SvUTF8_on(arg);
+ if (!p_state->is_cdata) {
+#ifdef UNICODE_HTML_PARSER
+ if (p_state->utf8_mode)
+ sv_utf8_decode(arg);
+#endif
+ decode_entities(aTHX_ arg, p_state->entity2char, 1);
+ if (p_state->utf8_mode)
+ SvUTF8_off(arg);
+ }
+ }
+ break;
+
+ case ARG_IS_CDATA:
+ if (event == E_TEXT) {
+ arg = boolSV(p_state->is_cdata);
+ }
+ break;
+
+ case ARG_SKIPPED_TEXT:
+ arg = sv_2mortal(p_state->skipped_text);
+ p_state->skipped_text = newSVpvn("", 0);
+ break;
+
+ case ARG_OFFSET:
+ arg = sv_2mortal(newSViv(offset));
+ break;
+
+ case ARG_OFFSET_END:
+ arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg)));
+ break;
+
+ case ARG_LENGTH:
+ arg = sv_2mortal(newSViv(CHR_DIST(end, beg)));
+ break;
+
+ case ARG_LINE:
+ arg = sv_2mortal(newSViv(line));
+ break;
+
+ case ARG_COLUMN:
+ arg = sv_2mortal(newSViv(column));
+ break;
+
+ case ARG_EVENT:
+ assert(event >= 0 && event < EVENT_COUNT);
+ arg = sv_2mortal(newSVpv(event_id_str[event], 0));
+ break;
+
+ case ARG_LITERAL:
+ {
+ int len = (unsigned char)s[1];
+ arg = sv_2mortal(newSVpvn(s+2, len));
+ if (SvUTF8(h->argspec))
+ SvUTF8_on(arg);
+ s += len + 1;
+ }
+ break;
+
+ case ARG_UNDEF:
+ arg = sv_mortalcopy(&PL_sv_undef);
+ break;
+
+ default:
+ arg = sv_2mortal(newSVpvf("Bad argspec %d", *s));
+ break;
+ }
+
+ if (push_arg) {
+ if (!arg)
+ arg = sv_mortalcopy(&PL_sv_undef);
+
+ if (array) {
+ /* have to fix mortality here or add mortality to
+ * XPUSHs after removing it from the switch cases.
+ */
+ av_push(array, SvREFCNT_inc(arg));
+ }
+ else {
+ XPUSHs(arg);
+ }
+ }
+ }
+
+ if (array) {
+ if (array != (AV*)h->cb)
+ av_push((AV*)h->cb, newRV_noinc((SV*)array));
+ }
+ else {
+ PUTBACK;
+
+ if ((enum argcode)*argspec == ARG_SELF && !SvROK(h->cb)) {
+ char *method = SvPV(h->cb, my_na);
+ perl_call_method(method, G_DISCARD | G_EVAL | G_VOID);
+ }
+ else {
+ perl_call_sv(h->cb, G_DISCARD | G_EVAL | G_VOID);
+ }
+
+ if (SvTRUE(ERRSV)) {
+ RETHROW;
+ }
+
+ FREETMPS;
+ LEAVE;
+ }
+ if (p_state->skipped_text)
+ SvCUR_set(p_state->skipped_text, 0);
+ return;
+
+IGNORE_EVENT:
+ if (p_state->skipped_text) {
+ if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text))
+ flush_pending_text(p_state, self);
+#ifdef UNICODE_HTML_PARSER
+ if (utf8 && !SvUTF8(p_state->skipped_text))
+ sv_utf8_upgrade(p_state->skipped_text);
+ if (utf8 || !SvUTF8(p_state->skipped_text)) {
+#endif
+ sv_catpvn(p_state->skipped_text, beg, end - beg);
+#ifdef UNICODE_HTML_PARSER
+ }
+ else {
+ SV *tmp = newSVpvn(beg, end - beg);
+ sv_utf8_upgrade(tmp);
+ sv_catsv(p_state->pend_text, tmp);
+ SvREFCNT_dec(tmp);
+ }
+#endif
+ }
+#undef CHR_DIST
+ return;
+}
+
+
+EXTERN SV*
+argspec_compile(SV* src, PSTATE* p_state)
+{
+ dTHX;
+ SV* argspec = newSVpvn("", 0);
+ STRLEN len;
+ char *s = SvPV(src, len);
+ char *end = s + len;
+
+ if (SvUTF8(src))
+ SvUTF8_on(argspec);
+
+ while (isHSPACE(*s))
+ s++;
+
+ if (*s == '@') {
+ /* try to deal with '@{ ... }' wrapping */
+ char *tmp = s + 1;
+ while (isHSPACE(*tmp))
+ tmp++;
+ if (*tmp == '{') {
+ char c = ARG_FLAG_FLAT_ARRAY;
+ sv_catpvn(argspec, &c, 1);
+ tmp++;
+ while (isHSPACE(*tmp))
+ tmp++;
+ s = tmp;
+ }
+ }
+ while (s < end) {
+ if (isHNAME_FIRST(*s) || *s == '@') {
+ char *name = s;
+ int a = ARG_SELF;
+ char **arg_name;
+
+ s++;
+ while (isHNAME_CHAR(*s))
+ s++;
+
+ /* check identifier */
+ for ( arg_name = argname; a < ARG_LITERAL ; ++a, ++arg_name ) {
+ if (strnEQ(*arg_name, name, s - name) &&
+ (*arg_name)[s - name] == '\0')
+ break;
+ }
+ if (a < ARG_LITERAL) {
+ char c = (unsigned char) a;
+ sv_catpvn(argspec, &c, 1);
+
+ if (a == ARG_LINE || a == ARG_COLUMN) {
+ if (!p_state->line)
+ p_state->line = 1; /* enable tracing of line/column */
+ }
+ if (a == ARG_SKIPPED_TEXT) {
+ if (!p_state->skipped_text) {
+ p_state->skipped_text = newSVpvn("", 0);
+ }
+ }
+ if (a == ARG_ATTR || a == ARG_ATTRARR || a == ARG_DTEXT) {
+ p_state->argspec_entity_decode++;
+ }
+ }
+ else {
+ croak("Unrecognized identifier %.*s in argspec", s - name, name);
+ }
+ }
+ else if (*s == '"' || *s == '\'') {
+ char *string_beg = s;
+ s++;
+ while (s < end && *s != *string_beg && *s != '\\')
+ s++;
+ if (*s == *string_beg) {
+ /* literal */
+ int len = s - string_beg - 1;
+ unsigned char buf[2];
+ if (len > 255)
+ croak("Literal string is longer than 255 chars in argspec");
+ buf[0] = ARG_LITERAL;
+ buf[1] = len;
+ sv_catpvn(argspec, (char*)buf, 2);
+ sv_catpvn(argspec, string_beg+1, len);
+ s++;
+ }
+ else if (*s == '\\') {
+ croak("Backslash reserved for literal string in argspec");
+ }
+ else {
+ croak("Unterminated literal string in argspec");
+ }
+ }
+ else {
+ croak("Bad argspec (%s)", s);
+ }
+
+ while (isHSPACE(*s))
+ s++;
+
+ if (*s == '}' && SvPVX(argspec)[0] == ARG_FLAG_FLAT_ARRAY) {
+ /* end of '@{ ... }' */
+ s++;
+ while (isHSPACE(*s))
+ s++;
+ if (s < end)
+ croak("Bad argspec: stuff after @{...} (%s)", s);
+ }
+
+ if (s == end)
+ break;
+ if (*s != ',') {
+ croak("Missing comma separator in argspec");
+ }
+ s++;
+ while (isHSPACE(*s))
+ s++;
+ }
+ return argspec;
+}
+
+
+static void
+flush_pending_text(PSTATE* p_state, SV* self)
+{
+ dTHX;
+ bool old_unbroken_text = p_state->unbroken_text;
+ SV* old_pend_text = p_state->pend_text;
+ bool old_is_cdata = p_state->is_cdata;
+ STRLEN old_offset = p_state->offset;
+ STRLEN old_line = p_state->line;
+ STRLEN old_column = p_state->column;
+
+ assert(p_state->pend_text && SvOK(p_state->pend_text));
+
+ p_state->unbroken_text = 0;
+ p_state->pend_text = 0;
+ p_state->is_cdata = p_state->pend_text_is_cdata;
+ p_state->offset = p_state->pend_text_offset;
+ p_state->line = p_state->pend_text_line;
+ p_state->column = p_state->pend_text_column;
+
+ report_event(p_state, E_TEXT,
+ SvPVX(old_pend_text), SvEND(old_pend_text),
+ SvUTF8(old_pend_text), 0, 0, self);
+ SvOK_off(old_pend_text);
+
+ p_state->unbroken_text = old_unbroken_text;
+ p_state->pend_text = old_pend_text;
+ p_state->is_cdata = old_is_cdata;
+ p_state->offset = old_offset;
+ p_state->line = old_line;
+ p_state->column = old_column;
+}
+
+static char*
+skip_until_gt(char *beg, char *end)
+{
+ /* tries to emulate quote skipping behaviour observed in MSIE */
+ char *s = beg;
+ char quote = '\0';
+ char prev = ' ';
+ while (s < end) {
+ if (!quote && *s == '>')
+ return s;
+ if (*s == '"' || *s == '\'') {
+ if (*s == quote) {
+ quote = '\0'; /* end of quoted string */
+ }
+ else if (!quote && (prev == ' ' || prev == '=')) {
+ quote = *s;
+ }
+ }
+ prev = *s++;
+ }
+ return end;
+}
+
+static char*
+parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
+{
+ char *s = beg;
+
+ if (p_state->strict_comment) {
+ dTOKENS(4);
+ char *start_com = s; /* also used to signal inside/outside */
+
+ while (1) {
+ /* try to locate "--" */
+ FIND_DASH_DASH:
+ /* printf("find_dash_dash: [%s]\n", s); */
+ while (s < end && *s != '-' && *s != '>')
+ s++;
+
+ if (s == end) {
+ FREE_TOKENS;
+ return beg;
+ }
+
+ if (*s == '>') {
+ s++;
+ if (start_com)
+ goto FIND_DASH_DASH;
+
+ /* we are done recognizing all comments, make callbacks */
+ report_event(p_state, E_COMMENT,
+ beg - 4, s, utf8,
+ tokens, num_tokens,
+ self);
+ FREE_TOKENS;
+
+ return s;
+ }
+
+ s++;
+ if (s == end) {
+ FREE_TOKENS;
+ return beg;
+ }
+
+ if (*s == '-') {
+ /* two dashes in a row seen */
+ s++;
+ /* do something */
+ if (start_com) {
+ PUSH_TOKEN(start_com, s-2);
+ start_com = 0;
+ }
+ else {
+ start_com = s;
+ }
+ }
+ }
+ }
+ else if (p_state->no_dash_dash_comment_end) {
+ token_pos_t token;
+ token.beg = beg;
+ /* a lone '>' signals end-of-comment */
+ while (s < end && *s != '>')
+ s++;
+ token.end = s;
+ if (s < end) {
+ s++;
+ report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self);
+ return s;
+ }
+ else {
+ return beg;
+ }
+ }
+ else { /* non-strict comment */
+ token_pos_t token;
+ token.beg = beg;
+ /* try to locate /--\s*>/ which signals end-of-comment */
+ LOCATE_END:
+ while (s < end && *s != '-')
+ s++;
+ token.end = s;
+ if (s < end) {
+ s++;
+ if (*s == '-') {
+ s++;
+ while (isHSPACE(*s))
+ s++;
+ if (*s == '>') {
+ s++;
+ /* yup */
+ report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self);
+ return s;
+ }
+ }
+ if (s < end) {
+ s = token.end + 1;
+ goto LOCATE_END;
+ }
+ }
+
+ if (s == end)
+ return beg;
+ }
+
+ return 0;
+}
+
+
+#ifdef MARKED_SECTION
+
+static void
+marked_section_update(PSTATE* p_state)
+{
+ dTHX;
+ /* we look at p_state->ms_stack to determine p_state->ms */
+ AV* ms_stack = p_state->ms_stack;
+ p_state->ms = MS_NONE;
+
+ if (ms_stack) {
+ int stack_len = av_len(ms_stack);
+ int stack_idx;
+ for (stack_idx = 0; stack_idx <= stack_len; stack_idx++) {
+ SV** svp = av_fetch(ms_stack, stack_idx, 0);
+ if (svp) {
+ AV* tokens = (AV*)SvRV(*svp);
+ int tokens_len = av_len(tokens);
+ int i;
+ assert(SvTYPE(tokens) == SVt_PVAV);
+ for (i = 0; i <= tokens_len; i++) {
+ SV** svp = av_fetch(tokens, i, 0);
+ if (svp) {
+ STRLEN len;
+ char *token_str = SvPV(*svp, len);
+ enum marked_section_t token;
+ if (strEQ(token_str, "include"))
+ token = MS_INCLUDE;
+ else if (strEQ(token_str, "rcdata"))
+ token = MS_RCDATA;
+ else if (strEQ(token_str, "cdata"))
+ token = MS_CDATA;
+ else if (strEQ(token_str, "ignore"))
+ token = MS_IGNORE;
+ else
+ token = MS_NONE;
+ if (p_state->ms < token)
+ p_state->ms = token;
+ }
+ }
+ }
+ }
+ }
+ /* printf("MS %d\n", p_state->ms); */
+ p_state->is_cdata = (p_state->ms == MS_CDATA);
+ return;
+}
+
+
+static char*
+parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
+{
+ dTHX;
+ char *s;
+ AV* tokens = 0;
+
+ if (!p_state->marked_sections)
+ return 0;
+
+ assert(beg[0] == '<');
+ assert(beg[1] == '!');
+ assert(beg[2] == '[');
+ s = beg + 3;
+
+FIND_NAMES:
+ while (isHSPACE(*s))
+ s++;
+ while (isHNAME_FIRST(*s)) {
+ char *name_start = s;
+ char *name_end;
+ SV *name;
+ s++;
+ while (isHNAME_CHAR(*s))
+ s++;
+ name_end = s;
+ while (isHSPACE(*s))
+ s++;
+ if (s == end)
+ goto PREMATURE;
+
+ if (!tokens)
+ tokens = newAV();
+ name = newSVpvn(name_start, name_end - name_start);
+ if (utf8)
+ SvUTF8_on(name);
+ av_push(tokens, sv_lower(aTHX_ name));
+ }
+ if (*s == '-') {
+ s++;
+ if (*s == '-') {
+ /* comment */
+ s++;
+ while (1) {
+ while (s < end && *s != '-')
+ s++;
+ if (s == end)
+ goto PREMATURE;
+
+ s++; /* skip first '-' */
+ if (*s == '-') {
+ s++;
+ /* comment finished */
+ goto FIND_NAMES;
+ }
+ }
+ }
+ else
+ goto FAIL;
+
+ }
+ if (*s == '[') {
+ s++;
+ /* yup */
+
+ if (!tokens) {
+ tokens = newAV();
+ av_push(tokens, newSVpvn("include", 7));
+ }
+
+ if (!p_state->ms_stack)
+ p_state->ms_stack = newAV();
+ av_push(p_state->ms_stack, newRV_noinc((SV*)tokens));
+ marked_section_update(p_state);
+ report_event(p_state, E_NONE, beg, s, utf8, 0, 0, self);
+ return s;
+ }
+
+FAIL:
+ SvREFCNT_dec(tokens);
+ return 0; /* not yet implemented */
+
+PREMATURE:
+ SvREFCNT_dec(tokens);
+ return beg;
+}
+#endif
+
+
+static char*
+parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
+{
+ char *s = beg + 2;
+
+ if (*s == '-') {
+ /* comment? */
+
+ char *tmp;
+ s++;
+ if (s == end)
+ return beg;
+
+ if (*s != '-')
+ goto DECL_FAIL; /* nope, illegal */
+
+ /* yes, two dashes seen */
+ s++;
+
+ tmp = parse_comment(p_state, s, end, utf8, self);
+ return (tmp == s) ? beg : tmp;
+ }
+
+#ifdef MARKED_SECTION
+ if (*s == '[') {
+ /* marked section */
+ char *tmp;
+ tmp = parse_marked_section(p_state, beg, end, utf8, self);
+ if (!tmp)
+ goto DECL_FAIL;
+ return tmp;
+ }
+#endif
+
+ if (*s == '>') {
+ /* make <!> into empty comment <SGML Handbook 36:32> */
+ token_pos_t token;
+ token.beg = s;
+ token.end = s;
+ s++;
+ report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
+ return s;
+ }
+
+ if (isALPHA(*s)) {
+ dTOKENS(8);
+ char *decl_id = s;
+ STRLEN decl_id_len;
+
+ s++;
+ /* declaration */
+ while (s < end && isHNAME_CHAR(*s))
+ s++;
+ decl_id_len = s - decl_id;
+ if (s == end)
+ goto PREMATURE;
+
+ /* just hardcode a few names as the recognized declarations */
+ if (!((decl_id_len == 7 &&
+ strnEQx(decl_id, "DOCTYPE", 7, !CASE_SENSITIVE(p_state))) ||
+ (decl_id_len == 6 &&
+ strnEQx(decl_id, "ENTITY", 6, !CASE_SENSITIVE(p_state)))
+ )
+ )
+ {
+ goto FAIL;
+ }
+
+ /* first word available */
+ PUSH_TOKEN(decl_id, s);
+
+ while (1) {
+ while (s < end && isHSPACE(*s))
+ s++;
+
+ if (s == end)
+ goto PREMATURE;
+
+ if (*s == '"' || *s == '\'') {
+ char *str_beg = s;
+ s++;
+ while (s < end && *s != *str_beg)
+ s++;
+ if (s == end)
+ goto PREMATURE;
+ s++;
+ PUSH_TOKEN(str_beg, s);
+ }
+ else if (*s == '-') {
+ /* comment */
+ char *com_beg = s;
+ s++;
+ if (s == end)
+ goto PREMATURE;
+ if (*s != '-')
+ goto FAIL;
+ s++;
+
+ while (1) {
+ while (s < end && *s != '-')
+ s++;
+ if (s == end)
+ goto PREMATURE;
+ s++;
+ if (s == end)
+ goto PREMATURE;
+ if (*s == '-') {
+ s++;
+ PUSH_TOKEN(com_beg, s);
+ break;
+ }
+ }
+ }
+ else if (*s != '>') {
+ /* plain word */
+ char *word_beg = s;
+ s++;
+ while (s < end && isHNOT_SPACE_GT(*s))
+ s++;
+ if (s == end)
+ goto PREMATURE;
+ PUSH_TOKEN(word_beg, s);
+ }
+ else {
+ break;
+ }
+ }
+
+ if (s == end)
+ goto PREMATURE;
+ if (*s == '>') {
+ s++;
+ report_event(p_state, E_DECLARATION, beg, s, utf8, tokens, num_tokens, self);
+ FREE_TOKENS;
+ return s;
+ }
+
+ FAIL:
+ FREE_TOKENS;
+ goto DECL_FAIL;
+
+ PREMATURE:
+ FREE_TOKENS;
+ return beg;
+
+ }
+
+DECL_FAIL:
+ if (p_state->strict_comment)
+ return 0;
+
+ /* consider everything up to the first '>' a comment */
+ while (s < end && *s != '>')
+ s++;
+ if (s < end) {
+ token_pos_t token;
+ token.beg = beg + 2;
+ token.end = s;
+ s++;
+ report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
+ return s;
+ }
+ else {
+ return beg;
+ }
+}
+
+
+static char*
+parse_start(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
+{
+ char *s = beg;
+ int empty_tag = 0;
+ dTOKENS(16);
+
+ hctype_t tag_name_first, tag_name_char;
+ hctype_t attr_name_first, attr_name_char;
+
+ if (STRICT_NAMES(p_state)) {
+ tag_name_first = attr_name_first = HCTYPE_NAME_FIRST;
+ tag_name_char = attr_name_char = HCTYPE_NAME_CHAR;
+ }
+ else {
+ tag_name_first = tag_name_char = HCTYPE_NOT_SPACE_GT;
+ attr_name_first = HCTYPE_NOT_SPACE_GT;
+ attr_name_char = HCTYPE_NOT_SPACE_EQ_GT;
+ }
+
+ s += 2;
+
+ while (s < end && isHCTYPE(*s, tag_name_char)) {
+ if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
+ if ((s + 1) == end)
+ goto PREMATURE;
+ if (*(s + 1) == '>')
+ break;
+ }
+ s++;
+ }
+ PUSH_TOKEN(beg+1, s); /* tagname */
+
+ while (isHSPACE(*s))
+ s++;
+ if (s == end)
+ goto PREMATURE;
+
+ while (isHCTYPE(*s, attr_name_first)) {
+ /* attribute */
+ char *attr_name_beg = s;
+ char *attr_name_end;
+ if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
+ if ((s + 1) == end)
+ goto PREMATURE;
+ if (*(s + 1) == '>')
+ break;
+ }
+ s++;
+ while (s < end && isHCTYPE(*s, attr_name_char)) {
+ if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
+ if ((s + 1) == end)
+ goto PREMATURE;
+ if (*(s + 1) == '>')
+ break;
+ }
+ s++;
+ }
+ if (s == end)
+ goto PREMATURE;
+
+ attr_name_end = s;
+ PUSH_TOKEN(attr_name_beg, attr_name_end); /* attr name */
+
+ while (isHSPACE(*s))
+ s++;
+ if (s == end)
+ goto PREMATURE;
+
+ if (*s == '=') {
+ /* with a value */
+ s++;
+ while (isHSPACE(*s))
+ s++;
+ if (s == end)
+ goto PREMATURE;
+ if (*s == '>') {
+ /* parse it similar to ="" */
+ PUSH_TOKEN(s, s);
+ break;
+ }
+ if (*s == '"' || *s == '\'') {
+ char *str_beg = s;
+ s++;
+ while (s < end && *s != *str_beg)
+ s++;
+ if (s == end)
+ goto PREMATURE;
+ s++;
+ PUSH_TOKEN(str_beg, s);
+ }
+ else {
+ char *word_start = s;
+ while (s < end && isHNOT_SPACE_GT(*s)) {
+ if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
+ if ((s + 1) == end)
+ goto PREMATURE;
+ if (*(s + 1) == '>')
+ break;
+ }
+ s++;
+ }
+ if (s == end)
+ goto PREMATURE;
+ PUSH_TOKEN(word_start, s);
+ }
+ while (isHSPACE(*s))
+ s++;
+ if (s == end)
+ goto PREMATURE;
+ }
+ else {
+ PUSH_TOKEN(0, 0); /* boolean attr value */
+ }
+ }
+
+ if (ALLOW_EMPTY_TAG(p_state) && *s == '/') {
+ s++;
+ if (s == end)
+ goto PREMATURE;
+ empty_tag = 1;
+ }
+
+ if (*s == '>') {
+ s++;
+ /* done */
+ report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self);
+ if (empty_tag) {
+ report_event(p_state, E_END, s, s, utf8, tokens, 1, self);
+ }
+ else if (!p_state->xml_mode) {
+ /* find out if this start tag should put us into literal_mode
+ */
+ int i;
+ int tag_len = tokens[0].end - tokens[0].beg;
+
+ for (i = 0; literal_mode_elem[i].len; i++) {
+ if (tag_len == literal_mode_elem[i].len) {
+ /* try to match it */
+ char *s = beg + 1;
+ char *t = literal_mode_elem[i].str;
+ int len = tag_len;
+ while (len) {
+ if (toLOWER(*s) != *t)
+ break;
+ s++;
+ t++;
+ if (!--len) {
+ /* found it */
+ p_state->literal_mode = literal_mode_elem[i].str;
+ p_state->is_cdata = literal_mode_elem[i].is_cdata;
+ /* printf("Found %s\n", p_state->literal_mode); */
+ goto END_OF_LITERAL_SEARCH;
+ }
+ }
+ }
+ }
+ END_OF_LITERAL_SEARCH:
+ ;
+ }
+
+ FREE_TOKENS;
+ return s;
+ }
+
+ FREE_TOKENS;
+ return 0;
+
+PREMATURE:
+ FREE_TOKENS;
+ return beg;
+}
+
+
+static char*
+parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
+{
+ char *s = beg+2;
+ hctype_t name_first, name_char;
+
+ if (STRICT_NAMES(p_state)) {
+ name_first = HCTYPE_NAME_FIRST;
+ name_char = HCTYPE_NAME_CHAR;
+ }
+ else {
+ name_first = name_char = HCTYPE_NOT_SPACE_GT;
+ }
+
+ if (isHCTYPE(*s, name_first)) {
+ token_pos_t tagname;
+ tagname.beg = s;
+ s++;
+ while (s < end && isHCTYPE(*s, name_char))
+ s++;
+ tagname.end = s;
+
+ if (p_state->strict_end) {
+ while (isHSPACE(*s))
+ s++;
+ }
+ else {
+ s = skip_until_gt(s, end);
+ }
+ if (s < end) {
+ if (*s == '>') {
+ s++;
+ /* a complete end tag has been recognized */
+ report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self);
+ return s;
+ }
+ }
+ else {
+ return beg;
+ }
+ }
+ else if (!p_state->strict_comment) {
+ s = skip_until_gt(s, end);
+ if (s < end) {
+ token_pos_t token;
+ token.beg = beg + 2;
+ token.end = s;
+ s++;
+ report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
+ return s;
+ }
+ else {
+ return beg;
+ }
+ }
+ return 0;
+}
+
+
+static char*
+parse_process(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
+{
+ char *s = beg + 2; /* skip '<?' */
+ /* processing instruction */
+ token_pos_t token_pos;
+ token_pos.beg = s;
+
+ while (s < end) {
+ if (*s == '>') {
+ token_pos.end = s;
+ s++;
+
+ if (p_state->xml_mode || p_state->xml_pic) {
+ /* XML processing instructions are ended by "?>" */
+ if (s - beg < 4 || s[-2] != '?')
+ continue;
+ token_pos.end = s - 2;
+ }
+
+ /* a complete processing instruction seen */
+ report_event(p_state, E_PROCESS, beg, s, utf8,
+ &token_pos, 1, self);
+ return s;
+ }
+ s++;
+ }
+ return beg; /* could not fix end */
+}
+
+
+#ifdef USE_PFUNC
+static char*
+parse_null(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
+{
+ return 0;
+}
+
+
+
+#include "pfunc.h" /* declares the parsefunc[] */
+#endif /* USE_PFUNC */
+
+static char*
+parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
+{
+ char *s = beg;
+ char *t = beg;
+ char *new_pos;
+
+ while (!p_state->eof) {
+ /*
+ * At the start of this loop we will always be ready for eating text
+ * or a new tag. We will never be inside some tag. The 't' points
+ * to where we started and the 's' is advanced as we go.
+ */
+
+ while (p_state->literal_mode) {
+ char *l = p_state->literal_mode;
+ bool skip_quoted_end = (strEQ(l, "script") || strEQ(l, "style"));
+ char inside_quote = 0;
+ bool escape_next = 0;
+ char *end_text;
+
+ while (s < end) {
+ if (*s == '<' && !inside_quote)
+ break;
+ if (skip_quoted_end) {
+ if (escape_next) {
+ escape_next = 0;
+ }
+ else {
+ if (*s == '\\')
+ escape_next = 1;
+ else if (inside_quote && *s == inside_quote)
+ inside_quote = 0;
+ else if (*s == '\r' || *s == '\n')
+ inside_quote = 0;
+ else if (!inside_quote && (*s == '"' || *s == '\''))
+ inside_quote = *s;
+ }
+ }
+ s++;
+ }
+
+ if (s == end) {
+ s = t;
+ goto DONE;
+ }
+
+ end_text = s;
+ s++;
+
+ /* here we rely on '\0' termination of perl svpv buffers */
+ if (*s == '/') {
+ s++;
+ while (*l && toLOWER(*s) == *l) {
+ s++;
+ l++;
+ }
+
+ if (!*l && (strNE(p_state->literal_mode, "plaintext") || p_state->closing_plaintext)) {
+ /* matched it all */
+ token_pos_t end_token;
+ end_token.beg = end_text + 2;
+ end_token.end = s;
+
+ while (isHSPACE(*s))
+ s++;
+ if (*s == '>') {
+ s++;
+ if (t != end_text)
+ report_event(p_state, E_TEXT, t, end_text, utf8,
+ 0, 0, self);
+ report_event(p_state, E_END, end_text, s, utf8,
+ &end_token, 1, self);
+ p_state->literal_mode = 0;
+ p_state->is_cdata = 0;
+ t = s;
+ }
+ }
+ }
+ }
+
+#ifdef MARKED_SECTION
+ while (p_state->ms == MS_CDATA || p_state->ms == MS_RCDATA) {
+ while (s < end && *s != ']')
+ s++;
+ if (*s == ']') {
+ char *end_text = s;
+ s++;
+ if (*s == ']' && *(s + 1) == '>') {
+ s += 2;
+ /* marked section end */
+ if (t != end_text)
+ report_event(p_state, E_TEXT, t, end_text, utf8,
+ 0, 0, self);
+ report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self);
+ t = s;
+ SvREFCNT_dec(av_pop(p_state->ms_stack));
+ marked_section_update(p_state);
+ continue;
+ }
+ }
+ if (s == end) {
+ s = t;
+ goto DONE;
+ }
+ }
+#endif
+
+ /* first we try to match as much text as possible */
+ while (s < end && *s != '<') {
+#ifdef MARKED_SECTION
+ if (p_state->ms && *s == ']') {
+ char *end_text = s;
+ s++;
+ if (*s == ']') {
+ s++;
+ if (*s == '>') {
+ s++;
+ report_event(p_state, E_TEXT, t, end_text, utf8,
+ 0, 0, self);
+ report_event(p_state, E_NONE, end_text, s, utf8,
+ 0, 0, self);
+ t = s;
+ SvREFCNT_dec(av_pop(p_state->ms_stack));
+ marked_section_update(p_state);
+ continue;
+ }
+ }
+ }
+#endif
+ s++;
+ }
+ if (s != t) {
+ if (*s == '<') {
+ report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self);
+ t = s;
+ }
+ else {
+ s--;
+ if (isHSPACE(*s)) {
+ /* wait with white space at end */
+ while (s >= t && isHSPACE(*s))
+ s--;
+ }
+ else {
+ /* might be a chopped up entities/words */
+ while (s >= t && !isHSPACE(*s))
+ s--;
+ while (s >= t && isHSPACE(*s))
+ s--;
+ }
+ s++;
+ if (s != t)
+ report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self);
+ break;
+ }
+ }
+
+ if (end - s < 3)
+ break;
+
+ /* next char is known to be '<' and pointed to by 't' as well as 's' */
+ s++;
+
+#ifdef USE_PFUNC
+ new_pos = parsefunc[(unsigned char)*s](p_state, t, end, utf8, self);
+#else
+ if (isHNAME_FIRST(*s))
+ new_pos = parse_start(p_state, t, end, utf8, self);
+ else if (*s == '/')
+ new_pos = parse_end(p_state, t, end, utf8, self);
+ else if (*s == '!')
+ new_pos = parse_decl(p_state, t, end, utf8, self);
+ else if (*s == '?')
+ new_pos = parse_process(p_state, t, end, utf8, self);
+ else
+ new_pos = 0;
+#endif /* USE_PFUNC */
+
+ if (new_pos) {
+ if (new_pos == t) {
+ /* no progress, need more data to know what it is */
+ s = t;
+ break;
+ }
+ t = s = new_pos;
+ }
+
+ /* if we get out here then this was not a conforming tag, so
+ * treat it is plain text at the top of the loop again (we
+ * have already skipped past the "<").
+ */
+ }
+
+DONE:
+ return s;
+
+}
+
+EXTERN void
+parse(pTHX_
+ PSTATE* p_state,
+ SV* chunk,
+ SV* self)
+{
+ char *s, *beg, *end;
+ U32 utf8 = 0;
+ STRLEN len;
+
+ if (!p_state->start_document) {
+ char dummy[1];
+ report_event(p_state, E_START_DOCUMENT, dummy, dummy, 0, 0, 0, self);
+ p_state->start_document = 1;
+ }
+
+ if (!chunk) {
+ /* eof */
+ char empty[1];
+ if (p_state->buf && SvOK(p_state->buf)) {
+ /* flush it */
+ s = SvPV(p_state->buf, len);
+ end = s + len;
+ utf8 = SvUTF8(p_state->buf);
+ assert(len);
+
+ while (s < end) {
+ if (p_state->literal_mode) {
+ if (strEQ(p_state->literal_mode, "plaintext") ||
+ strEQ(p_state->literal_mode, "xmp") ||
+ strEQ(p_state->literal_mode, "textarea"))
+ {
+ /* rest is considered text */
+ break;
+ }
+ if (strEQ(p_state->literal_mode, "script") ||
+ strEQ(p_state->literal_mode, "style"))
+ {
+ /* effectively make it an empty element */
+ token_pos_t t;
+ char dummy;
+ t.beg = p_state->literal_mode;
+ t.end = p_state->literal_mode + strlen(p_state->literal_mode);
+ report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self);
+ }
+ else {
+ p_state->pending_end_tag = p_state->literal_mode;
+ }
+ p_state->literal_mode = 0;
+ s = parse_buf(aTHX_ p_state, s, end, utf8, self);
+ continue;
+ }
+
+ if (!p_state->strict_comment && !p_state->no_dash_dash_comment_end && *s == '<') {
+ p_state->no_dash_dash_comment_end = 1;
+ s = parse_buf(aTHX_ p_state, s, end, utf8, self);
+ continue;
+ }
+
+ if (!p_state->strict_comment && *s == '<') {
+ char *s1 = s + 1;
+ if (s1 == end || isHNAME_FIRST(*s1) || *s1 == '/' || *s1 == '!' || *s1 == '?') {
+ /* some kind of unterminated markup. Report rest as as comment */
+ token_pos_t token;
+ token.beg = s + 1;
+ token.end = end;
+ report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self);
+ s = end;
+ }
+ }
+
+ break;
+ }
+
+ if (s < end) {
+ /* report rest as text */
+ report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self);
+ }
+
+ SvREFCNT_dec(p_state->buf);
+ p_state->buf = 0;
+ }
+ if (p_state->pend_text && SvOK(p_state->pend_text))
+ flush_pending_text(p_state, self);
+
+ if (p_state->ignoring_element) {
+ /* document not balanced */
+ SvREFCNT_dec(p_state->ignoring_element);
+ p_state->ignoring_element = 0;
+ }
+ report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self);
+
+ /* reset state */
+ p_state->offset = 0;
+ if (p_state->line)
+ p_state->line = 1;
+ p_state->column = 0;
+ p_state->start_document = 0;
+ p_state->literal_mode = 0;
+ p_state->is_cdata = 0;
+ return;
+ }
+
+#ifdef UNICODE_HTML_PARSER
+ if (p_state->utf8_mode)
+ sv_utf8_downgrade(chunk, 0);
+#endif
+
+ if (p_state->buf && SvOK(p_state->buf)) {
+ sv_catsv(p_state->buf, chunk);
+ beg = SvPV(p_state->buf, len);
+ utf8 = SvUTF8(p_state->buf);
+ }
+ else {
+ beg = SvPV(chunk, len);
+ utf8 = SvUTF8(chunk);
+ if (p_state->offset == 0 && DOWARN) {
+ /* Print warnings if we find unexpected Unicode BOM forms */
+#ifdef UNICODE_HTML_PARSER
+ if (p_state->argspec_entity_decode &&
+ !p_state->utf8_mode && (
+ (!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) ||
+ (utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) ||
+ (!utf8 && probably_utf8_chunk(aTHX_ beg, len))
+ )
+ )
+ {
+ warn("Parsing of undecoded UTF-8 will give garbage when decoding entities");
+ }
+ if (utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) {
+ warn("Parsing string decoded with wrong endianess");
+ }
+#endif
+ if (!utf8 && len >= 4 &&
+ (strnEQ(beg, "\x00\x00\xFE\xFF", 4) ||
+ strnEQ(beg, "\xFE\xFF\x00\x00", 4))
+ )
+ {
+ warn("Parsing of undecoded UTF-32");
+ }
+ else if (!utf8 && len >= 2 &&
+ (strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2))
+ )
+ {
+ warn("Parsing of undecoded UTF-16");
+ }
+ }
+ }
+
+ if (!len)
+ return; /* nothing to do */
+
+ end = beg + len;
+ s = parse_buf(aTHX_ p_state, beg, end, utf8, self);
+
+ if (s == end || p_state->eof) {
+ if (p_state->buf) {
+ SvOK_off(p_state->buf);
+ }
+ }
+ else {
+ /* need to keep rest in buffer */
+ if (p_state->buf) {
+ /* chop off some chars at the beginning */
+ if (SvOK(p_state->buf)) {
+ sv_chop(p_state->buf, s);
+ }
+ else {
+ sv_setpvn(p_state->buf, s, end - s);
+ if (utf8)
+ SvUTF8_on(p_state->buf);
+ else
+ SvUTF8_off(p_state->buf);
+ }
+ }
+ else {
+ p_state->buf = newSVpv(s, end - s);
+ if (utf8)
+ SvUTF8_on(p_state->buf);
+ }
+ }
+ return;
+}
--- /dev/null
+/* $Id: hparser.h,v 2.34 2006/04/26 07:01:10 gisle Exp $
+ *
+ * Copyright 1999-2005, Gisle Aas
+ * Copyright 1999-2000, Michael A. Chase
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the same terms as Perl itself.
+ */
+
+/*
+ * Declare various structures and constants. The main thing
+ * is 'struct p_state' that contains various fields to represent
+ * the state of the parser.
+ */
+
+#ifdef MARKED_SECTION
+
+enum marked_section_t {
+ MS_NONE = 0,
+ MS_INCLUDE,
+ MS_RCDATA,
+ MS_CDATA,
+ MS_IGNORE
+};
+
+#endif /* MARKED_SECTION */
+
+
+#define P_SIGNATURE 0x16091964 /* tag struct p_state for safer cast */
+
+enum event_id {
+ E_DECLARATION = 0,
+ E_COMMENT,
+ E_START,
+ E_END,
+ E_TEXT,
+ E_PROCESS,
+ E_START_DOCUMENT,
+ E_END_DOCUMENT,
+ E_DEFAULT,
+ /**/
+ EVENT_COUNT,
+ E_NONE /* used for reporting skipped text (non-events) */
+};
+typedef enum event_id event_id_t;
+
+/* must match event_id_t above */
+static char* event_id_str[] = {
+ "declaration",
+ "comment",
+ "start",
+ "end",
+ "text",
+ "process",
+ "start_document",
+ "end_document",
+ "default",
+};
+
+struct p_handler {
+ SV* cb;
+ SV* argspec;
+};
+
+struct p_state {
+ U32 signature;
+
+ /* state */
+ SV* buf;
+ STRLEN offset;
+ STRLEN line;
+ STRLEN column;
+ bool start_document;
+ bool parsing;
+ bool eof;
+
+ /* special parsing modes */
+ char* literal_mode;
+ bool is_cdata;
+ bool no_dash_dash_comment_end;
+ char *pending_end_tag;
+
+ /* unbroken_text option needs a buffer of pending text */
+ SV* pend_text;
+ bool pend_text_is_cdata;
+ STRLEN pend_text_offset;
+ STRLEN pend_text_line;
+ STRLEN pend_text_column;
+
+ /* skipped text is accumulated here */
+ SV* skipped_text;
+
+#ifdef MARKED_SECTION
+ /* marked section support */
+ enum marked_section_t ms;
+ AV* ms_stack;
+ bool marked_sections;
+#endif
+
+ /* various boolean configuration attributes */
+ bool strict_comment;
+ bool strict_names;
+ bool strict_end;
+ bool xml_mode;
+ bool unbroken_text;
+ bool attr_encoded;
+ bool case_sensitive;
+ bool closing_plaintext;
+ bool utf8_mode;
+ bool empty_element_tags;
+ bool xml_pic;
+
+ /* other configuration stuff */
+ SV* bool_attr_val;
+ struct p_handler handlers[EVENT_COUNT];
+ bool argspec_entity_decode;
+
+ /* filters */
+ HV* report_tags;
+ HV* ignore_tags;
+ HV* ignore_elements;
+
+ /* these are set when we are currently inside an element we want to ignore */
+ SV* ignoring_element;
+ int ignore_depth;
+
+ /* cache */
+ HV* entity2char; /* %HTML::Entities::entity2char */
+ SV* tmp;
+};
+typedef struct p_state PSTATE;
+
--- /dev/null
+package HTML::Entities;
+
+# $Id: Entities.pm,v 1.35 2006/03/22 09:15:23 gisle Exp $
+
+=head1 NAME
+
+HTML::Entities - Encode or decode strings with HTML entities
+
+=head1 SYNOPSIS
+
+ use HTML::Entities;
+
+ $a = "Våre norske tegn bør æres";
+ decode_entities($a);
+ encode_entities($a, "\200-\377");
+
+For example, this:
+
+ $input = "vis-à -vis Beyoncé's naïve\npapier-mâché résumé";
+ print encode_entities($input), "\n"
+
+Prints this out:
+
+ vis-à-vis Beyoncé's naïve
+ papier-mâché résumé
+
+=head1 DESCRIPTION
+
+This module deals with encoding and decoding of strings with HTML
+character entities. The module provides the following functions:
+
+=over 4
+
+=item decode_entities( $string, ... )
+
+This routine replaces HTML entities found in the $string with the
+corresponding Unicode character. Under perl 5.6 and earlier only
+characters in the Latin-1 range are replaced. Unrecognized
+entities are left alone.
+
+If multiple strings are provided as argument they are each decoded
+separately and the same number of strings are returned.
+
+If called in void context the arguments are decoded in-place.
+
+This routine is exported by default.
+
+=item _decode_entities( $string, \%entity2char )
+
+=item _decode_entities( $string, \%entity2char, $expand_prefix )
+
+This will in-place replace HTML entities in $string. The %entity2char
+hash must be provided. Named entities not found in the %entity2char
+hash are left alone. Numeric entities are expanded unless their value
+overflow.
+
+The keys in %entity2char are the entity names to be expanded and their
+values are what they should expand into. The values do not have to be
+single character strings. If a key has ";" as suffix,
+then occurrences in $string are only expanded if properly terminated
+with ";". Entities without ";" will be expanded regardless of how
+they are terminated for compatiblity with how common browsers treat
+entities in the Latin-1 range.
+
+If $expand_prefix is TRUE then entities without trailing ";" in
+%entity2char will even be expanded as a prefix of a longer
+unrecognized name. The longest matching name in %entity2char will be
+used. This is mainly present for compatibility with an MSIE
+misfeature.
+
+ $string = "foo bar";
+ _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
+ print $string; # will print "foo bar"
+
+This routine is exported by default.
+
+=item encode_entities( $string )
+
+=item encode_entities( $string, $unsafe_chars )
+
+This routine replaces unsafe characters in $string with their entity
+representation. A second argument can be given to specify which
+characters to consider unsafe (i.e., which to escape). The default set
+of characters to encode are control chars, high-bit chars, and the
+C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >>
+characters. But this, for example, would encode I<just> the
+C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters:
+
+ $encoded = encode_entities($input, '<>&"');
+
+This routine is exported by default.
+
+=item encode_entities_numeric( $string )
+
+=item encode_entities_numeric( $string, $unsafe_chars )
+
+This routine works just like encode_entities, except that the replacement
+entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
+example, C<encode_entities("r\xF4le")> returns "rôle", but
+C<encode_entities_numeric("r\xF4le")> returns "rôle".
+
+This routine is I<not> exported by default. But you can always
+export it with C<use HTML::Entities qw(encode_entities_numeric);>
+or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
+
+=back
+
+All these routines modify the string passed as the first argument, if
+called in a void context. In scalar and array contexts, the encoded or
+decoded string is returned (without changing the input string).
+
+If you prefer not to import these routines into your namespace, you can
+call them as:
+
+ use HTML::Entities ();
+ $decoded = HTML::Entities::decode($a);
+ $encoded = HTML::Entities::encode($a);
+ $encoded = HTML::Entities::encode_numeric($a);
+
+The module can also export the %char2entity and the %entity2char
+hashes, which contain the mapping from all characters to the
+corresponding entities (and vice versa, respectively).
+
+=head1 COPYRIGHT
+
+Copyright 1995-2006 Gisle Aas. All rights reserved.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+use strict;
+use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
+use vars qw(%entity2char %char2entity);
+
+require 5.004;
+require Exporter;
+@ISA = qw(Exporter);
+
+@EXPORT = qw(encode_entities decode_entities _decode_entities);
+@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
+
+$VERSION = sprintf("%d.%02d", q$Revision: 1.35 $ =~ /(\d+)\.(\d+)/);
+sub Version { $VERSION; }
+
+require HTML::Parser; # for fast XS implemented decode_entities
+
+
+%entity2char = (
+ # Some normal chars that have special meaning in SGML context
+ amp => '&', # ampersand
+'gt' => '>', # greater than
+'lt' => '<', # less than
+ quot => '"', # double quote
+ apos => "'", # single quote
+
+ # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
+ AElig => chr(198), # capital AE diphthong (ligature)
+ Aacute => chr(193), # capital A, acute accent
+ Acirc => chr(194), # capital A, circumflex accent
+ Agrave => chr(192), # capital A, grave accent
+ Aring => chr(197), # capital A, ring
+ Atilde => chr(195), # capital A, tilde
+ Auml => chr(196), # capital A, dieresis or umlaut mark
+ Ccedil => chr(199), # capital C, cedilla
+ ETH => chr(208), # capital Eth, Icelandic
+ Eacute => chr(201), # capital E, acute accent
+ Ecirc => chr(202), # capital E, circumflex accent
+ Egrave => chr(200), # capital E, grave accent
+ Euml => chr(203), # capital E, dieresis or umlaut mark
+ Iacute => chr(205), # capital I, acute accent
+ Icirc => chr(206), # capital I, circumflex accent
+ Igrave => chr(204), # capital I, grave accent
+ Iuml => chr(207), # capital I, dieresis or umlaut mark
+ Ntilde => chr(209), # capital N, tilde
+ Oacute => chr(211), # capital O, acute accent
+ Ocirc => chr(212), # capital O, circumflex accent
+ Ograve => chr(210), # capital O, grave accent
+ Oslash => chr(216), # capital O, slash
+ Otilde => chr(213), # capital O, tilde
+ Ouml => chr(214), # capital O, dieresis or umlaut mark
+ THORN => chr(222), # capital THORN, Icelandic
+ Uacute => chr(218), # capital U, acute accent
+ Ucirc => chr(219), # capital U, circumflex accent
+ Ugrave => chr(217), # capital U, grave accent
+ Uuml => chr(220), # capital U, dieresis or umlaut mark
+ Yacute => chr(221), # capital Y, acute accent
+ aacute => chr(225), # small a, acute accent
+ acirc => chr(226), # small a, circumflex accent
+ aelig => chr(230), # small ae diphthong (ligature)
+ agrave => chr(224), # small a, grave accent
+ aring => chr(229), # small a, ring
+ atilde => chr(227), # small a, tilde
+ auml => chr(228), # small a, dieresis or umlaut mark
+ ccedil => chr(231), # small c, cedilla
+ eacute => chr(233), # small e, acute accent
+ ecirc => chr(234), # small e, circumflex accent
+ egrave => chr(232), # small e, grave accent
+ eth => chr(240), # small eth, Icelandic
+ euml => chr(235), # small e, dieresis or umlaut mark
+ iacute => chr(237), # small i, acute accent
+ icirc => chr(238), # small i, circumflex accent
+ igrave => chr(236), # small i, grave accent
+ iuml => chr(239), # small i, dieresis or umlaut mark
+ ntilde => chr(241), # small n, tilde
+ oacute => chr(243), # small o, acute accent
+ ocirc => chr(244), # small o, circumflex accent
+ ograve => chr(242), # small o, grave accent
+ oslash => chr(248), # small o, slash
+ otilde => chr(245), # small o, tilde
+ ouml => chr(246), # small o, dieresis or umlaut mark
+ szlig => chr(223), # small sharp s, German (sz ligature)
+ thorn => chr(254), # small thorn, Icelandic
+ uacute => chr(250), # small u, acute accent
+ ucirc => chr(251), # small u, circumflex accent
+ ugrave => chr(249), # small u, grave accent
+ uuml => chr(252), # small u, dieresis or umlaut mark
+ yacute => chr(253), # small y, acute accent
+ yuml => chr(255), # small y, dieresis or umlaut mark
+
+ # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
+ copy => chr(169), # copyright sign
+ reg => chr(174), # registered sign
+ nbsp => chr(160), # non breaking space
+
+ # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
+ iexcl => chr(161),
+ cent => chr(162),
+ pound => chr(163),
+ curren => chr(164),
+ yen => chr(165),
+ brvbar => chr(166),
+ sect => chr(167),
+ uml => chr(168),
+ ordf => chr(170),
+ laquo => chr(171),
+'not' => chr(172), # not is a keyword in perl
+ shy => chr(173),
+ macr => chr(175),
+ deg => chr(176),
+ plusmn => chr(177),
+ sup1 => chr(185),
+ sup2 => chr(178),
+ sup3 => chr(179),
+ acute => chr(180),
+ micro => chr(181),
+ para => chr(182),
+ middot => chr(183),
+ cedil => chr(184),
+ ordm => chr(186),
+ raquo => chr(187),
+ frac14 => chr(188),
+ frac12 => chr(189),
+ frac34 => chr(190),
+ iquest => chr(191),
+'times' => chr(215), # times is a keyword in perl
+ divide => chr(247),
+
+ ( $] > 5.007 ? (
+ 'OElig;' => chr(338),
+ 'oelig;' => chr(339),
+ 'Scaron;' => chr(352),
+ 'scaron;' => chr(353),
+ 'Yuml;' => chr(376),
+ 'fnof;' => chr(402),
+ 'circ;' => chr(710),
+ 'tilde;' => chr(732),
+ 'Alpha;' => chr(913),
+ 'Beta;' => chr(914),
+ 'Gamma;' => chr(915),
+ 'Delta;' => chr(916),
+ 'Epsilon;' => chr(917),
+ 'Zeta;' => chr(918),
+ 'Eta;' => chr(919),
+ 'Theta;' => chr(920),
+ 'Iota;' => chr(921),
+ 'Kappa;' => chr(922),
+ 'Lambda;' => chr(923),
+ 'Mu;' => chr(924),
+ 'Nu;' => chr(925),
+ 'Xi;' => chr(926),
+ 'Omicron;' => chr(927),
+ 'Pi;' => chr(928),
+ 'Rho;' => chr(929),
+ 'Sigma;' => chr(931),
+ 'Tau;' => chr(932),
+ 'Upsilon;' => chr(933),
+ 'Phi;' => chr(934),
+ 'Chi;' => chr(935),
+ 'Psi;' => chr(936),
+ 'Omega;' => chr(937),
+ 'alpha;' => chr(945),
+ 'beta;' => chr(946),
+ 'gamma;' => chr(947),
+ 'delta;' => chr(948),
+ 'epsilon;' => chr(949),
+ 'zeta;' => chr(950),
+ 'eta;' => chr(951),
+ 'theta;' => chr(952),
+ 'iota;' => chr(953),
+ 'kappa;' => chr(954),
+ 'lambda;' => chr(955),
+ 'mu;' => chr(956),
+ 'nu;' => chr(957),
+ 'xi;' => chr(958),
+ 'omicron;' => chr(959),
+ 'pi;' => chr(960),
+ 'rho;' => chr(961),
+ 'sigmaf;' => chr(962),
+ 'sigma;' => chr(963),
+ 'tau;' => chr(964),
+ 'upsilon;' => chr(965),
+ 'phi;' => chr(966),
+ 'chi;' => chr(967),
+ 'psi;' => chr(968),
+ 'omega;' => chr(969),
+ 'thetasym;' => chr(977),
+ 'upsih;' => chr(978),
+ 'piv;' => chr(982),
+ 'ensp;' => chr(8194),
+ 'emsp;' => chr(8195),
+ 'thinsp;' => chr(8201),
+ 'zwnj;' => chr(8204),
+ 'zwj;' => chr(8205),
+ 'lrm;' => chr(8206),
+ 'rlm;' => chr(8207),
+ 'ndash;' => chr(8211),
+ 'mdash;' => chr(8212),
+ 'lsquo;' => chr(8216),
+ 'rsquo;' => chr(8217),
+ 'sbquo;' => chr(8218),
+ 'ldquo;' => chr(8220),
+ 'rdquo;' => chr(8221),
+ 'bdquo;' => chr(8222),
+ 'dagger;' => chr(8224),
+ 'Dagger;' => chr(8225),
+ 'bull;' => chr(8226),
+ 'hellip;' => chr(8230),
+ 'permil;' => chr(8240),
+ 'prime;' => chr(8242),
+ 'Prime;' => chr(8243),
+ 'lsaquo;' => chr(8249),
+ 'rsaquo;' => chr(8250),
+ 'oline;' => chr(8254),
+ 'frasl;' => chr(8260),
+ 'euro;' => chr(8364),
+ 'image;' => chr(8465),
+ 'weierp;' => chr(8472),
+ 'real;' => chr(8476),
+ 'trade;' => chr(8482),
+ 'alefsym;' => chr(8501),
+ 'larr;' => chr(8592),
+ 'uarr;' => chr(8593),
+ 'rarr;' => chr(8594),
+ 'darr;' => chr(8595),
+ 'harr;' => chr(8596),
+ 'crarr;' => chr(8629),
+ 'lArr;' => chr(8656),
+ 'uArr;' => chr(8657),
+ 'rArr;' => chr(8658),
+ 'dArr;' => chr(8659),
+ 'hArr;' => chr(8660),
+ 'forall;' => chr(8704),
+ 'part;' => chr(8706),
+ 'exist;' => chr(8707),
+ 'empty;' => chr(8709),
+ 'nabla;' => chr(8711),
+ 'isin;' => chr(8712),
+ 'notin;' => chr(8713),
+ 'ni;' => chr(8715),
+ 'prod;' => chr(8719),
+ 'sum;' => chr(8721),
+ 'minus;' => chr(8722),
+ 'lowast;' => chr(8727),
+ 'radic;' => chr(8730),
+ 'prop;' => chr(8733),
+ 'infin;' => chr(8734),
+ 'ang;' => chr(8736),
+ 'and;' => chr(8743),
+ 'or;' => chr(8744),
+ 'cap;' => chr(8745),
+ 'cup;' => chr(8746),
+ 'int;' => chr(8747),
+ 'there4;' => chr(8756),
+ 'sim;' => chr(8764),
+ 'cong;' => chr(8773),
+ 'asymp;' => chr(8776),
+ 'ne;' => chr(8800),
+ 'equiv;' => chr(8801),
+ 'le;' => chr(8804),
+ 'ge;' => chr(8805),
+ 'sub;' => chr(8834),
+ 'sup;' => chr(8835),
+ 'nsub;' => chr(8836),
+ 'sube;' => chr(8838),
+ 'supe;' => chr(8839),
+ 'oplus;' => chr(8853),
+ 'otimes;' => chr(8855),
+ 'perp;' => chr(8869),
+ 'sdot;' => chr(8901),
+ 'lceil;' => chr(8968),
+ 'rceil;' => chr(8969),
+ 'lfloor;' => chr(8970),
+ 'rfloor;' => chr(8971),
+ 'lang;' => chr(9001),
+ 'rang;' => chr(9002),
+ 'loz;' => chr(9674),
+ 'spades;' => chr(9824),
+ 'clubs;' => chr(9827),
+ 'hearts;' => chr(9829),
+ 'diams;' => chr(9830),
+ ) : ())
+);
+
+
+# Make the opposite mapping
+while (my($entity, $char) = each(%entity2char)) {
+ $entity =~ s/;\z//;
+ $char2entity{$char} = "&$entity;";
+}
+delete $char2entity{"'"}; # only one-way decoding
+
+# Fill in missing entities
+for (0 .. 255) {
+ next if exists $char2entity{chr($_)};
+ $char2entity{chr($_)} = "&#$_;";
+}
+
+my %subst; # compiled encoding regexps
+
+sub decode_entities_old
+{
+ my $array;
+ if (defined wantarray) {
+ $array = [@_]; # copy
+ } else {
+ $array = \@_; # modify in-place
+ }
+ my $c;
+ for (@$array) {
+ s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
+ s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
+ s/(&(\w+);?)/$entity2char{$2} || $1/eg;
+ }
+ wantarray ? @$array : $array->[0];
+}
+
+sub encode_entities
+{
+ my $ref;
+ if (defined wantarray) {
+ my $x = $_[0];
+ $ref = \$x; # copy
+ } else {
+ $ref = \$_[0]; # modify in-place
+ }
+ if (defined $_[1] and length $_[1]) {
+ unless (exists $subst{$_[1]}) {
+ # Because we can't compile regex we fake it with a cached sub
+ my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
+ $subst{$_[1]} = eval $code;
+ die( $@ . " while trying to turn range: \"$_[1]\"\n "
+ . "into code: $code\n "
+ ) if $@;
+ }
+ &{$subst{$_[1]}}($$ref);
+ } else {
+ # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
+ $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
+ }
+ $$ref;
+}
+
+sub encode_entities_numeric {
+ local %char2entity;
+ return &encode_entities; # a goto &encode_entities wouldn't work
+}
+
+
+sub num_entity {
+ sprintf "&#x%X;", ord($_[0]);
+}
+
+# Set up aliases
+*encode = \&encode_entities;
+*encode_numeric = \&encode_entities_numeric;
+*encode_numerically = \&encode_entities_numeric;
+*decode = \&decode_entities;
+
+1;
--- /dev/null
+package HTML::Filter;
+
+use strict;
+use vars qw(@ISA $VERSION);
+
+require HTML::Parser;
+@ISA=qw(HTML::Parser);
+
+$VERSION = sprintf("%d.%02d", q$Revision: 2.11 $ =~ /(\d+)\.(\d+)/);
+
+sub declaration { $_[0]->output("<!$_[1]>") }
+sub process { $_[0]->output($_[2]) }
+sub comment { $_[0]->output("<!--$_[1]-->") }
+sub start { $_[0]->output($_[4]) }
+sub end { $_[0]->output($_[2]) }
+sub text { $_[0]->output($_[1]) }
+
+sub output { print $_[1] }
+
+1;
+
+__END__
+
+=head1 NAME
+
+HTML::Filter - Filter HTML text through the parser
+
+=head1 NOTE
+
+B<This module is deprecated.> The C<HTML::Parser> now provides the
+functionally of C<HTML::Filter> much more efficiently with the the
+C<default> handler.
+
+=head1 SYNOPSIS
+
+ require HTML::Filter;
+ $p = HTML::Filter->new->parse_file("index.html");
+
+=head1 DESCRIPTION
+
+C<HTML::Filter> is an HTML parser that by default prints the
+original text of each HTML element (a slow version of cat(1) basically).
+The callback methods may be overridden to modify the filtering for some
+HTML elements and you can override output() method which is called to
+print the HTML text.
+
+C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that
+the document should be given to the parser by calling the $p->parse()
+or $p->parse_file() methods.
+
+=head1 EXAMPLES
+
+The first example is a filter that will remove all comments from an
+HTML file. This is achieved by simply overriding the comment method
+to do nothing.
+
+ package CommentStripper;
+ require HTML::Filter;
+ @ISA=qw(HTML::Filter);
+ sub comment { } # ignore comments
+
+The second example shows a filter that will remove any E<lt>TABLE>s
+found in the HTML file. We specialize the start() and end() methods
+to count table tags and then make output not happen when inside a
+table.
+
+ package TableStripper;
+ require HTML::Filter;
+ @ISA=qw(HTML::Filter);
+ sub start
+ {
+ my $self = shift;
+ $self->{table_seen}++ if $_[0] eq "table";
+ $self->SUPER::start(@_);
+ }
+
+ sub end
+ {
+ my $self = shift;
+ $self->SUPER::end(@_);
+ $self->{table_seen}-- if $_[0] eq "table";
+ }
+
+ sub output
+ {
+ my $self = shift;
+ unless ($self->{table_seen}) {
+ $self->SUPER::output(@_);
+ }
+ }
+
+If you want to collect the parsed text internally you might want to do
+something like this:
+
+ package FilterIntoString;
+ require HTML::Filter;
+ @ISA=qw(HTML::Filter);
+ sub output { push(@{$_[0]->{fhtml}}, $_[1]) }
+ sub filtered_html { join("", @{$_[0]->{fhtml}}) }
+
+=head1 SEE ALSO
+
+L<HTML::Parser>
+
+=head1 COPYRIGHT
+
+Copyright 1997-1999 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
--- /dev/null
+package HTML::HeadParser;
+
+=head1 NAME
+
+HTML::HeadParser - Parse <HEAD> section of a HTML document
+
+=head1 SYNOPSIS
+
+ require HTML::HeadParser;
+ $p = HTML::HeadParser->new;
+ $p->parse($text) and print "not finished";
+
+ $p->header('Title') # to access <title>....</title>
+ $p->header('Content-Base') # to access <base href="http://...">
+ $p->header('Foo') # to access <meta http-equiv="Foo" content="...">
+
+=head1 DESCRIPTION
+
+The C<HTML::HeadParser> is a specialized (and lightweight)
+C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
+section of an HTML document. The parse() method
+will return a FALSE value as soon as some E<lt>BODY> element or body
+text are found, and should not be called again after this.
+
+Note that the C<HTML::HeadParser> might get confused if raw undecoded
+UTF-8 is passed to the parse() method. Make sure the strings are
+properly decoded before passing them on.
+
+The C<HTML::HeadParser> keeps a reference to a header object, and the
+parser will update this header object as the various elements of the
+E<lt>HEAD> section of the HTML document are recognized. The following
+header fields are affected:
+
+=over 4
+
+=item Content-Base:
+
+The I<Content-Base> header is initialized from the E<lt>base
+href="..."> element.
+
+=item Title:
+
+The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
+element.
+
+=item Isindex:
+
+The I<Isindex> header will be added if there is a E<lt>isindex>
+element in the E<lt>head>. The header value is initialized from the
+I<prompt> attribute if it is present. If no I<prompt> attribute is
+given it will have '?' as the value.
+
+=item X-Meta-Foo:
+
+All E<lt>meta> elements will initialize headers with the prefix
+"C<X-Meta->" on the name. If the E<lt>meta> element contains a
+C<http-equiv> attribute, then it will be honored as the header name.
+
+=back
+
+=head1 METHODS
+
+The following methods (in addition to those provided by the
+superclass) are available:
+
+=over 4
+
+=cut
+
+
+require HTML::Parser;
+@ISA = qw(HTML::Parser);
+
+use HTML::Entities ();
+
+use strict;
+use vars qw($VERSION $DEBUG);
+#$DEBUG = 1;
+$VERSION = sprintf("%d.%02d", q$Revision: 2.22 $ =~ /(\d+)\.(\d+)/);
+
+=item $hp = HTML::HeadParser->new
+
+=item $hp = HTML::HeadParser->new( $header )
+
+The object constructor. The optional $header argument should be a
+reference to an object that implement the header() and push_header()
+methods as defined by the C<HTTP::Headers> class. Normally it will be
+of some class that isa or delegates to the C<HTTP::Headers> class.
+
+If no $header is given C<HTML::HeadParser> will create an
+C<HTTP::Header> object by itself (initially empty).
+
+=cut
+
+sub new
+{
+ my($class, $header) = @_;
+ unless ($header) {
+ require HTTP::Headers;
+ $header = HTTP::Headers->new;
+ }
+
+ my $self = $class->SUPER::new(api_version => 2,
+ ignore_elements => [qw(script style)],
+ );
+ $self->{'header'} = $header;
+ $self->{'tag'} = ''; # name of active element that takes textual content
+ $self->{'text'} = ''; # the accumulated text associated with the element
+ $self;
+}
+
+=item $hp->header;
+
+Returns a reference to the header object.
+
+=item $hp->header( $key )
+
+Returns a header value. It is just a shorter way to write
+C<$hp-E<gt>header-E<gt>header($key)>.
+
+=cut
+
+sub header
+{
+ my $self = shift;
+ return $self->{'header'} unless @_;
+ $self->{'header'}->header(@_);
+}
+
+sub as_string # legacy
+{
+ my $self = shift;
+ $self->{'header'}->as_string;
+}
+
+sub flush_text # internal
+{
+ my $self = shift;
+ my $tag = $self->{'tag'};
+ my $text = $self->{'text'};
+ $text =~ s/^\s+//;
+ $text =~ s/\s+$//;
+ $text =~ s/\s+/ /g;
+ print "FLUSH $tag => '$text'\n" if $DEBUG;
+ if ($tag eq 'title') {
+ HTML::Entities::decode($text);
+ $self->{'header'}->push_header(Title => $text);
+ }
+ $self->{'tag'} = $self->{'text'} = '';
+}
+
+# This is an quote from the HTML3.2 DTD which shows which elements
+# that might be present in a <HEAD>...</HEAD>. Also note that the
+# <HEAD> tags themselves might be missing:
+#
+# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
+# SCRIPT* & META* & LINK*">
+#
+# <!ELEMENT HEAD O O (%head.content)>
+
+
+sub start
+{
+ my($self, $tag, $attr) = @_; # $attr is reference to a HASH
+ print "START[$tag]\n" if $DEBUG;
+ $self->flush_text if $self->{'tag'};
+ if ($tag eq 'meta') {
+ my $key = $attr->{'http-equiv'};
+ if (!defined($key) || !length($key)) {
+ return unless $attr->{'name'};
+ $key = "X-Meta-\u$attr->{'name'}";
+ }
+ $self->{'header'}->push_header($key => $attr->{content});
+ } elsif ($tag eq 'base') {
+ return unless exists $attr->{href};
+ $self->{'header'}->push_header('Content-Base' => $attr->{href});
+ } elsif ($tag eq 'isindex') {
+ # This is a non-standard header. Perhaps we should just ignore
+ # this element
+ $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
+ } elsif ($tag =~ /^(?:title|script|style)$/) {
+ # Just remember tag. Initialize header when we see the end tag.
+ $self->{'tag'} = $tag;
+ } elsif ($tag eq 'link') {
+ return unless exists $attr->{href};
+ # <link href="http:..." rel="xxx" rev="xxx" title="xxx">
+ my $h_val = "<" . delete($attr->{href}) . ">";
+ for (sort keys %{$attr}) {
+ $h_val .= qq(; $_="$attr->{$_}");
+ }
+ $self->{'header'}->push_header(Link => $h_val);
+ } elsif ($tag eq 'head' || $tag eq 'html') {
+ # ignore
+ } else {
+ # stop parsing
+ $self->eof;
+ }
+}
+
+sub end
+{
+ my($self, $tag) = @_;
+ print "END[$tag]\n" if $DEBUG;
+ $self->flush_text if $self->{'tag'};
+ $self->eof if $tag eq 'head';
+}
+
+sub text
+{
+ my($self, $text) = @_;
+ $text =~ s/\x{FEFF}//; # drop Unicode BOM if found
+ print "TEXT[$text]\n" if $DEBUG;
+ my $tag = $self->{tag};
+ if (!$tag && $text =~ /\S/) {
+ # Normal text means start of body
+ $self->eof;
+ return;
+ }
+ return if $tag ne 'title';
+ $self->{'text'} .= $text;
+}
+
+1;
+
+__END__
+
+=back
+
+=head1 EXAMPLE
+
+ $h = HTTP::Headers->new;
+ $p = HTML::HeadParser->new($h);
+ $p->parse(<<EOT);
+ <title>Stupid example</title>
+ <base href="http://www.linpro.no/lwp/">
+ Normal text starts here.
+ EOT
+ undef $p;
+ print $h->title; # should print "Stupid example"
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTTP::Headers>
+
+The C<HTTP::Headers> class is distributed as part of the
+I<libwww-perl> package. If you don't have that distribution installed
+you need to provide the $header argument to the C<HTML::HeadParser>
+constructor with your own object that implements the documented
+protocol.
+
+=head1 COPYRIGHT
+
+Copyright 1996-2001 Gisle Aas. All rights reserved.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
--- /dev/null
+package HTML::LinkExtor;
+
+# $Id: LinkExtor.pm,v 1.33 2003/10/10 10:20:56 gisle Exp $
+
+require HTML::Parser;
+@ISA = qw(HTML::Parser);
+$VERSION = sprintf("%d.%02d", q$Revision: 1.33 $ =~ /(\d+)\.(\d+)/);
+
+=head1 NAME
+
+HTML::LinkExtor - Extract links from an HTML document
+
+=head1 SYNOPSIS
+
+ require HTML::LinkExtor;
+ $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
+ sub cb {
+ my($tag, %links) = @_;
+ print "$tag @{[%links]}\n";
+ }
+ $p->parse_file("index.html");
+
+=head1 DESCRIPTION
+
+I<HTML::LinkExtor> is an HTML parser that extracts links from an
+HTML document. The I<HTML::LinkExtor> is a subclass of
+I<HTML::Parser>. This means that the document should be given to the
+parser by calling the $p->parse() or $p->parse_file() methods.
+
+=cut
+
+use strict;
+use HTML::Tagset ();
+
+# legacy (some applications grabs this hash directly)
+use vars qw(%LINK_ELEMENT);
+*LINK_ELEMENT = \%HTML::Tagset::linkElements;
+
+=over 4
+
+=item $p = HTML::LinkExtor->new
+
+=item $p = HTML::LinkExtor->new( $callback )
+
+=item $p = HTML::LinkExtor->new( $callback, $base )
+
+The constructor takes two optional arguments. The first is a reference
+to a callback routine. It will be called as links are found. If a
+callback is not provided, then links are just accumulated internally
+and can be retrieved by calling the $p->links() method.
+
+The $base argument is an optional base URL used to absolutize all URLs found.
+You need to have the I<URI> module installed if you provide $base.
+
+The callback is called with the lowercase tag name as first argument,
+and then all link attributes as separate key/value pairs. All
+non-link attributes are removed.
+
+=cut
+
+sub new
+{
+ my($class, $cb, $base) = @_;
+ my $self = $class->SUPER::new(
+ start_h => ["_start_tag", "self,tagname,attr"],
+ report_tags => [keys %HTML::Tagset::linkElements],
+ );
+ $self->{extractlink_cb} = $cb;
+ if ($base) {
+ require URI;
+ $self->{extractlink_base} = URI->new($base);
+ }
+ $self;
+}
+
+sub _start_tag
+{
+ my($self, $tag, $attr) = @_;
+
+ my $base = $self->{extractlink_base};
+ my $links = $HTML::Tagset::linkElements{$tag};
+ $links = [$links] unless ref $links;
+
+ my @links;
+ my $a;
+ for $a (@$links) {
+ next unless exists $attr->{$a};
+ push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base)
+ : $attr->{$a});
+ }
+ return unless @links;
+ $self->_found_link($tag, @links);
+}
+
+sub _found_link
+{
+ my $self = shift;
+ my $cb = $self->{extractlink_cb};
+ if ($cb) {
+ &$cb(@_);
+ } else {
+ push(@{$self->{'links'}}, [@_]);
+ }
+}
+
+=item $p->links
+
+Returns a list of all links found in the document. The returned
+values will be anonymous arrays with the follwing elements:
+
+ [$tag, $attr => $url1, $attr2 => $url2,...]
+
+The $p->links method will also truncate the internal link list. This
+means that if the method is called twice without any parsing
+between them the second call will return an empty list.
+
+Also note that $p->links will always be empty if a callback routine
+was provided when the I<HTML::LinkExtor> was created.
+
+=cut
+
+sub links
+{
+ my $self = shift;
+ exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
+}
+
+# We override the parse_file() method so that we can clear the links
+# before we start a new file.
+sub parse_file
+{
+ my $self = shift;
+ delete $self->{'links'};
+ $self->SUPER::parse_file(@_);
+}
+
+=back
+
+=head1 EXAMPLE
+
+This is an example showing how you can extract links from a document
+received using LWP:
+
+ use LWP::UserAgent;
+ use HTML::LinkExtor;
+ use URI::URL;
+
+ $url = "http://www.perl.org/"; # for instance
+ $ua = LWP::UserAgent->new;
+
+ # Set up a callback that collect image links
+ my @imgs = ();
+ sub callback {
+ my($tag, %attr) = @_;
+ return if $tag ne 'img'; # we only look closer at <img ...>
+ push(@imgs, values %attr);
+ }
+
+ # Make the parser. Unfortunately, we don't know the base yet
+ # (it might be diffent from $url)
+ $p = HTML::LinkExtor->new(\&callback);
+
+ # Request document and parse it as it arrives
+ $res = $ua->request(HTTP::Request->new(GET => $url),
+ sub {$p->parse($_[0])});
+
+ # Expand all image URLs to absolute ones
+ my $base = $res->base;
+ @imgs = map { $_ = url($_, $base)->abs; } @imgs;
+
+ # Print them out
+ print join("\n", @imgs), "\n";
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
+
+=head1 COPYRIGHT
+
+Copyright 1996-2001 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+1;
--- /dev/null
+package HTML::PullParser;
+
+# $Id: PullParser.pm,v 2.9 2006/04/26 08:00:28 gisle Exp $
+
+require HTML::Parser;
+@ISA=qw(HTML::Parser);
+$VERSION = sprintf("%d.%02d", q$Revision: 2.9 $ =~ /(\d+)\.(\d+)/);
+
+use strict;
+use Carp ();
+
+sub new
+{
+ my($class, %cnf) = @_;
+
+ # Construct argspecs for the various events
+ my %argspec;
+ for (qw(start end text declaration comment process default)) {
+ my $tmp = delete $cnf{$_};
+ next unless defined $tmp;
+ $argspec{$_} = $tmp;
+ }
+ Carp::croak("Info not collected for any events")
+ unless %argspec;
+
+ my $file = delete $cnf{file};
+ my $doc = delete $cnf{doc};
+ Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
+ if defined($file) && defined($doc);
+ Carp::croak("No 'doc' or 'file' given to parse from")
+ unless defined($file) || defined($doc);
+
+ # Create object
+ $cnf{api_version} = 3;
+ my $self = $class->SUPER::new(%cnf);
+
+ my $accum = $self->{pullparser_accum} = [];
+ while (my($event, $argspec) = each %argspec) {
+ $self->SUPER::handler($event => $accum, $argspec);
+ }
+
+ if (defined $doc) {
+ $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
+ $self->{pullparser_str_pos} = 0;
+ }
+ else {
+ if (!ref($file) && ref(\$file) ne "GLOB") {
+ require IO::File;
+ $file = IO::File->new($file, "r") || return;
+ }
+
+ $self->{pullparser_file} = $file;
+ }
+ $self;
+}
+
+
+sub handler
+{
+ Carp::croak("Can't set handlers for HTML::PullParser");
+}
+
+
+sub get_token
+{
+ my $self = shift;
+ while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
+ if (my $f = $self->{pullparser_file}) {
+ # must try to parse more from the file
+ my $buf;
+ if (read($f, $buf, 512)) {
+ $self->parse($buf);
+ } else {
+ $self->eof;
+ $self->{pullparser_eof}++;
+ delete $self->{pullparser_file};
+ }
+ }
+ elsif (my $sref = $self->{pullparser_str_ref}) {
+ # must try to parse more from the scalar
+ my $pos = $self->{pullparser_str_pos};
+ my $chunk = substr($$sref, $pos, 512);
+ $self->parse($chunk);
+ $pos += length($chunk);
+ if ($pos < length($$sref)) {
+ $self->{pullparser_str_pos} = $pos;
+ }
+ else {
+ $self->eof;
+ $self->{pullparser_eof}++;
+ delete $self->{pullparser_str_ref};
+ delete $self->{pullparser_str_pos};
+ }
+ }
+ else {
+ die;
+ }
+ }
+ shift @{$self->{pullparser_accum}};
+}
+
+
+sub unget_token
+{
+ my $self = shift;
+ unshift @{$self->{pullparser_accum}}, @_;
+ $self;
+}
+
+1;
+
+
+__END__
+
+=head1 NAME
+
+HTML::PullParser - Alternative HTML::Parser interface
+
+=head1 SYNOPSIS
+
+ use HTML::PullParser;
+
+ $p = HTML::PullParser->new(file => "index.html",
+ start => 'event, tagname, @attr',
+ end => 'event, tagname',
+ ignore_elements => [qw(script style)],
+ ) || die "Can't open: $!";
+ while (my $token = $p->get_token) {
+ #...do something with $token
+ }
+
+=head1 DESCRIPTION
+
+The HTML::PullParser is an alternative interface to the HTML::Parser class.
+It basically turns the HTML::Parser inside out. You associate a file
+(or any IO::Handle object or string) with the parser at construction time and
+then repeatedly call $parser->get_token to obtain the tags and text
+found in the parsed document.
+
+The following methods are provided:
+
+=over 4
+
+=item $p = HTML::PullParser->new( file => $file, %options )
+
+=item $p = HTML::PullParser->new( doc => \$doc, %options )
+
+A C<HTML::PullParser> can be made to parse from either a file or a
+literal document based on whether the C<file> or C<doc> option is
+passed to the parser's constructor.
+
+The C<file> passed in can either be a file name or a file handle
+object. If a file name is passed, and it can't be opened for reading,
+then the constructor will return an undefined value and $! will tell
+you why it failed. Otherwise the argument is taken to be some object
+that the C<HTML::PullParser> can read() from when it needs more data.
+The stream will be read() until EOF, but not closed.
+
+A C<doc> can be passed plain or as a reference
+to a scalar. If a reference is passed then the value of this scalar
+should not be changed before all tokens have been extracted.
+
+Next the information to be returned for the different token types must
+be set up. This is done by simply associating an argspec (as defined
+in L<HTML::Parser>) with the events you have an interest in. For
+instance, if you want C<start> tokens to be reported as the string
+C<'S'> followed by the tagname and the attributes you might pass an
+C<start>-option like this:
+
+ $p = HTML::PullParser->new(
+ doc => $document_to_parse,
+ start => '"S", tagname, @attr',
+ end => '"E", tagname',
+ );
+
+At last other C<HTML::Parser> options, like C<ignore_tags>, and
+C<unbroken_text>, can be passed in. Note that you should not use the
+I<event>_h options to set up parser handlers. That would confuse the
+inner logic of C<HTML::PullParser>.
+
+=item $token = $p->get_token
+
+This method will return the next I<token> found in the HTML document,
+or C<undef> at the end of the document. The token is returned as an
+array reference. The content of this array match the argspec set up
+during C<HTML::PullParser> construction.
+
+=item $p->unget_token( @tokens )
+
+If you find out you have read too many tokens you can push them back,
+so that they are returned again the next time $p->get_token is called.
+
+=back
+
+=head1 EXAMPLES
+
+The 'eg/hform' script shows how we might parse the form section of
+HTML::Documents using HTML::PullParser.
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTML::TokeParser>
+
+=head1 COPYRIGHT
+
+Copyright 1998-2001 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
--- /dev/null
+package HTML::TokeParser;
+
+# $Id: TokeParser.pm,v 2.37 2006/04/26 08:00:28 gisle Exp $
+
+require HTML::PullParser;
+@ISA=qw(HTML::PullParser);
+$VERSION = sprintf("%d.%02d", q$Revision: 2.37 $ =~ /(\d+)\.(\d+)/);
+
+use strict;
+use Carp ();
+use HTML::Entities qw(decode_entities);
+use HTML::Tagset ();
+
+my %ARGS =
+(
+ start => "'S',tagname,attr,attrseq,text",
+ end => "'E',tagname,text",
+ text => "'T',text,is_cdata",
+ process => "'PI',token0,text",
+ comment => "'C',text",
+ declaration => "'D',text",
+
+ # options that default on
+ unbroken_text => 1,
+);
+
+
+sub new
+{
+ my $class = shift;
+ my %cnf;
+ if (@_ == 1) {
+ my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
+ %cnf = ($type => $_[0]);
+ }
+ else {
+ %cnf = @_;
+ }
+
+ my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
+
+ my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
+
+ $self->{textify} = $textify;
+ $self;
+}
+
+
+sub get_tag
+{
+ my $self = shift;
+ my $token;
+ while (1) {
+ $token = $self->get_token || return undef;
+ my $type = shift @$token;
+ next unless $type eq "S" || $type eq "E";
+ substr($token->[0], 0, 0) = "/" if $type eq "E";
+ return $token unless @_;
+ for (@_) {
+ return $token if $token->[0] eq $_;
+ }
+ }
+}
+
+
+sub _textify {
+ my($self, $token) = @_;
+ my $tag = $token->[1];
+ return undef unless exists $self->{textify}{$tag};
+
+ my $alt = $self->{textify}{$tag};
+ my $text;
+ if (ref($alt)) {
+ $text = &$alt(@$token);
+ } else {
+ $text = $token->[2]{$alt || "alt"};
+ $text = "[\U$tag]" unless defined $text;
+ }
+ return $text;
+}
+
+
+sub get_text
+{
+ my $self = shift;
+ my @text;
+ while (my $token = $self->get_token) {
+ my $type = $token->[0];
+ if ($type eq "T") {
+ my $text = $token->[1];
+ decode_entities($text) unless $token->[2];
+ push(@text, $text);
+ } elsif ($type =~ /^[SE]$/) {
+ my $tag = $token->[1];
+ if ($type eq "S") {
+ if (defined(my $text = _textify($self, $token))) {
+ push(@text, $text);
+ next;
+ }
+ } else {
+ $tag = "/$tag";
+ }
+ if (!@_ || grep $_ eq $tag, @_) {
+ $self->unget_token($token);
+ last;
+ }
+ push(@text, " ")
+ if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
+ }
+ }
+ join("", @text);
+}
+
+
+sub get_trimmed_text
+{
+ my $self = shift;
+ my $text = $self->get_text(@_);
+ $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
+ $text;
+}
+
+sub get_phrase {
+ my $self = shift;
+ my @text;
+ while (my $token = $self->get_token) {
+ my $type = $token->[0];
+ if ($type eq "T") {
+ my $text = $token->[1];
+ decode_entities($text) unless $token->[2];
+ push(@text, $text);
+ } elsif ($type =~ /^[SE]$/) {
+ my $tag = $token->[1];
+ if ($type eq "S") {
+ if (defined(my $text = _textify($self, $token))) {
+ push(@text, $text);
+ next;
+ }
+ }
+ if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
+ $self->unget_token($token);
+ last;
+ }
+ push(@text, " ") if $tag eq "br";
+ }
+ }
+ my $text = join("", @text);
+ $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
+ $text;
+}
+
+1;
+
+
+__END__
+
+=head1 NAME
+
+HTML::TokeParser - Alternative HTML::Parser interface
+
+=head1 SYNOPSIS
+
+ require HTML::TokeParser;
+ $p = HTML::TokeParser->new("index.html") ||
+ die "Can't open: $!";
+ $p->empty_element_tags(1); # configure its behaviour
+
+ while (my $token = $p->get_token) {
+ #...
+ }
+
+=head1 DESCRIPTION
+
+The C<HTML::TokeParser> is an alternative interface to the
+C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a
+predeclared set of token types. If you wish the tokens to be reported
+differently you probably want to use the C<HTML::PullParser> directly.
+
+The following methods are available:
+
+=over 4
+
+=item $p = HTML::TokeParser->new( $filename, %opt );
+
+=item $p = HTML::TokeParser->new( $filehandle, %opt );
+
+=item $p = HTML::TokeParser->new( \$document, %opt );
+
+The object constructor argument is either a file name, a file handle
+object, or the complete document to be parsed. Extra options can be
+provided as key/value pairs and are processed as documented by the base
+classes.
+
+If the argument is a plain scalar, then it is taken as the name of a
+file to be opened and parsed. If the file can't be opened for
+reading, then the constructor will return C<undef> and $! will tell
+you why it failed.
+
+If the argument is a reference to a plain scalar, then this scalar is
+taken to be the literal document to parse. The value of this
+scalar should not be changed before all tokens have been extracted.
+
+Otherwise the argument is taken to be some object that the
+C<HTML::TokeParser> can read() from when it needs more data. Typically
+it will be a filehandle of some kind. The stream will be read() until
+EOF, but not closed.
+
+A newly constructed C<HTML::TokeParser> differ from its base classes
+by having the C<unbroken_text> attribute enabled by default. See
+L<HTML::Parser> for a description of this and other attributes that
+influence how the document is parsed. It is often a good idea to enable
+C<empty_element_tags> behaviour.
+
+Note that the parsing result will likely not be valid if raw undecoded
+UTF-8 is used as a source. When parsing UTF-8 encoded files turn
+on UTF-8 decoding:
+
+ open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
+ my $p = HTML::TokeParser->new( $fh );
+ # ...
+
+If a $filename is passed to the constructor the file will be opened in
+raw mode and the parsing result will only be valid if its content is
+Latin-1 or pure ASCII.
+
+If parsing from an UTF-8 encoded string buffer decode it first:
+
+ utf8::decode($document);
+ my $p = HTML::TokeParser->new( \$document );
+ # ...
+
+=item $p->get_token
+
+This method will return the next I<token> found in the HTML document,
+or C<undef> at the end of the document. The token is returned as an
+array reference. The first element of the array will be a string
+denoting the type of this token: "S" for start tag, "E" for end tag,
+"T" for text, "C" for comment, "D" for declaration, and "PI" for
+process instructions. The rest of the token array depend on the type
+like this:
+
+ ["S", $tag, $attr, $attrseq, $text]
+ ["E", $tag, $text]
+ ["T", $text, $is_data]
+ ["C", $text]
+ ["D", $text]
+ ["PI", $token0, $text]
+
+where $attr is a hash reference, $attrseq is an array reference and
+the rest are plain scalars. The L<HTML::Parser/Argspec> explains the
+details.
+
+=item $p->unget_token( @tokens )
+
+If you find you have read too many tokens you can push them back,
+so that they are returned the next time $p->get_token is called.
+
+=item $p->get_tag
+
+=item $p->get_tag( @tags )
+
+This method returns the next start or end tag (skipping any other
+tokens), or C<undef> if there are no more tags in the document. If
+one or more arguments are given, then we skip tokens until one of the
+specified tag types is found. For example:
+
+ $p->get_tag("font", "/font");
+
+will find the next start or end tag for a font-element.
+
+The tag information is returned as an array reference in the same form
+as for $p->get_token above, but the type code (first element) is
+missing. A start tag will be returned like this:
+
+ [$tag, $attr, $attrseq, $text]
+
+The tagname of end tags are prefixed with "/", i.e. end tag is
+returned like this:
+
+ ["/$tag", $text]
+
+=item $p->get_text
+
+=item $p->get_text( @endtags )
+
+This method returns all text found at the current position. It will
+return a zero length string if the next token is not text. Any
+entities will be converted to their corresponding character.
+
+If one or more arguments are given, then we return all text occurring
+before the first of the specified tags found. For example:
+
+ $p->get_text("p", "br");
+
+will return the text up to either a paragraph of linebreak element.
+
+The text might span tags that should be I<textified>. This is
+controlled by the $p->{textify} attribute, which is a hash that
+defines how certain tags can be treated as text. If the name of a
+start tag matches a key in this hash then this tag is converted to
+text. The hash value is used to specify which tag attribute to obtain
+the text from. If this tag attribute is missing, then the upper case
+name of the tag enclosed in brackets is returned, e.g. "[IMG]". The
+hash value can also be a subroutine reference. In this case the
+routine is called with the start tag token content as its argument and
+the return value is treated as the text.
+
+The default $p->{textify} value is:
+
+ {img => "alt", applet => "alt"}
+
+This means that <IMG> and <APPLET> tags are treated as text, and that
+the text to substitute can be found in the ALT attribute.
+
+=item $p->get_trimmed_text
+
+=item $p->get_trimmed_text( @endtags )
+
+Same as $p->get_text above, but will collapse any sequences of white
+space to a single space character. Leading and trailing white space is
+removed.
+
+=item $p->get_phrase
+
+This will return all text found at the current position ignoring any
+phrasal-level tags. Text is extracted until the first non
+phrasal-level tag. Textification of tags is the same as for
+get_text(). This method will collapse white space in the same way as
+get_trimmed_text() does.
+
+The definition of <i>phrasal-level tags</i> is obtained from the
+HTML::Tagset module.
+
+=back
+
+=head1 EXAMPLES
+
+This example extracts all links from a document. It will print one
+line for each link, containing the URL and the textual description
+between the <A>...</A> tags:
+
+ use HTML::TokeParser;
+ $p = HTML::TokeParser->new(shift||"index.html");
+
+ while (my $token = $p->get_tag("a")) {
+ my $url = $token->[1]{href} || "-";
+ my $text = $p->get_trimmed_text("/a");
+ print "$url\t$text\n";
+ }
+
+This example extract the <TITLE> from the document:
+
+ use HTML::TokeParser;
+ $p = HTML::TokeParser->new(shift||"index.html");
+ if ($p->get_tag("title")) {
+ my $title = $p->get_trimmed_text;
+ print "Title: $title\n";
+ }
+
+=head1 SEE ALSO
+
+L<HTML::PullParser>, L<HTML::Parser>
+
+=head1 COPYRIGHT
+
+Copyright 1998-2005 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
--- /dev/null
+#!/usr/bin/perl
+
+($progname = $0) =~ s,.*/,,;
+
+print "/* This file is autogenerated by $progname */\n";
+
+print <<'EOT';
+
+#define HCTYPE_SPACE 0x01
+#define HCTYPE_NAME_FIRST 0x02
+#define HCTYPE_NAME_CHAR 0x04
+#define HCTYPE_NOT_SPACE_GT 0x08
+#define HCTYPE_NOT_SPACE_EQ_GT 0x10
+#define HCTYPE_NOT_SPACE_SLASH_GT 0x20
+#define HCTYPE_NOT_SPACE_EQ_SLASH_GT 0x40
+
+#define HCTYPE(c) hctype[(unsigned char)(c)]
+#define isHCTYPE(c, w) (HCTYPE(c) & (w))
+
+#define isHSPACE(c) isHCTYPE(c, HCTYPE_SPACE)
+#define isHNAME_FIRST(c) isHCTYPE(c, HCTYPE_NAME_FIRST)
+#define isHNAME_CHAR(c) isHCTYPE(c, HCTYPE_NAME_CHAR)
+#define isHNOT_SPACE_GT(c) isHCTYPE(c, HCTYPE_NOT_SPACE_GT)
+
+typedef unsigned char hctype_t;
+
+EOT
+
+print "static hctype_t hctype[] = {\n";
+
+for my $c (0 .. 255) {
+ print " " unless $c % 8;
+
+ local $_ = chr($c);
+ my $v = 0;
+ if (/^\s$/) { # isSPACE
+ $v |= 0x1
+ }
+ elsif ($_ ne ">") {
+ $v |= 0x08;
+ $v |= 0x10 if $_ ne "=";
+ $v |= 0x20 if $_ ne "/";
+ $v |= 0x40 if $_ ne "=";
+ }
+
+ if (/^[\w.\-:]$/) {
+ $v |= 0x4;
+ $v |= 0x2 unless /^[\d.-]$/; # XML allow /[:_]/ as first char
+ }
+
+ printf "0x%02x, ", $v;
+ unless (($c+1) % 8) {
+ printf " /* %3d - %3d */\n", $c - 7, $c;
+ }
+}
+print "};\n";
+
--- /dev/null
+#!/usr/bin/perl
+
+($progname = $0) =~ s,.*/,,;
+
+print "/* This file is autogenerated by $progname */\n";
+
+print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self);\n";
+print "static PFUNC parsefunc[] = {\n";
+
+for my $c (0..255) {
+ local $_ = chr($c);
+ my $func = "null";
+ if (/^[A-Za-z]$/) {
+ $func = "start";
+ }
+ elsif ($_ eq "/") {
+ $func = "end";
+ }
+ elsif ($_ eq "!") {
+ $func = "decl";
+ }
+ elsif ($_ eq "?") {
+ $func = "process";
+ }
+ printf " %-15s /* %3d */\n", "parse_$func,", $c;
+}
+
+print "};\n";
--- /dev/null
+use Test::More tests => 4;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+ok(!$p->handler("start"), "API version 3");
+
+my $failed;
+eval {
+ my $p = HTML::Parser->new(api_version => 4);
+ $failed++;
+};
+like($@, qr/^API version 4 not supported/);
+ok(!$failed, "API version 4");
+
+$p = HTML::Parser->new(api_version => 2);
+
+is($p->handler("start"), "start", "API version 2");
+
+
--- /dev/null
+use Test::More tests => 6;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+eval {
+ $p->handler(end => "end", q(xyzzy));
+};
+like($@, qr/^Unrecognized identifier xyzzy in argspec/);
+
+
+eval {
+ $p->handler(end => "end", q(tagname text));
+};
+like($@, qr/^Missing comma separator in argspec/);
+
+
+eval {
+ $p->handler(end => "end", q(tagname, "text));
+};
+like($@, qr/^Unterminated literal string in argspec/);
+
+
+eval {
+ $p->handler(end => "end", q(tagname, "t\\t"));
+};
+like($@, qr/^Backslash reserved for literal string in argspec/);
+
+eval {
+ $p->handler(end => "end", '"' . ("x" x 256) . '"');
+};
+like($@, qr/^Literal string is longer than 255 chars in argspec/);
+
+$p->handler(end => sub { is(length(shift), 255) },
+ '"' . ("x" x 255) . '"');
+$p->parse("</x>");
+
+
--- /dev/null
+
+use strict;
+require HTML::Parser;
+
+my $decl = '<!ENTITY nbsp CDATA " " -- no-break space -->';
+my $com1 = '<!-- Comment -->';
+my $com2 = '<!-- Comment -- -- Comment -->';
+my $start = '<a href="foo">';
+my $end = '</a>';
+my $empty = "<IMG SRC='foo'/>";
+my $proc = '<? something completely different ?>';
+
+my @argspec = qw( self offset length
+ event tagname tag token0
+ text
+ is_cdata dtext
+ tokens
+ tokenpos
+ attr
+ attrseq );
+
+my @result = ();
+my $p = HTML::Parser -> new(default_h => [\@result, join(',', @argspec)],
+ strict_comment => 1, xml_mode => 1);
+
+my @tests =
+ ( # string, expected results
+ $decl => [[$p, 0, 52, 'declaration', 'ENTITY', '!ENTITY', 'ENTITY',
+ '<!ENTITY nbsp CDATA " " -- no-break space -->',
+ undef, undef,
+ ['ENTITY', 'nbsp', 'CDATA', '" "', '-- no-break space --'],
+ [2, 6, 9, 4, 16, 5, 22, 8, 31, 20],
+ undef, undef ]],
+ $com1 => [[$p, 0, 16, 'comment', ' Comment ', '# Comment ', ' Comment ',
+ '<!-- Comment -->',
+ undef, undef,
+ [' Comment '],
+ [4, 9],
+ undef, undef ]],
+ $com2 => [[$p, 0, 30, 'comment', ' Comment ', '# Comment ', ' Comment ',
+ '<!-- Comment -- -- Comment -->',
+ undef, undef,
+ [' Comment ', ' Comment '],
+ [4, 9, 18, 9],
+ undef, undef ]],
+ $start => [[$p, 0, 14, 'start', 'a', 'a', 'a',
+ '<a href="foo">',
+ undef, undef,
+ ['a', 'href', '"foo"'],
+ [1, 1, 3, 4, 8, 5],
+ {'href', 'foo'}, ['href'] ]],
+ $end => [[$p, 0, 4, 'end', 'a', '/a', 'a',
+ '</a>',
+ undef, undef,
+ ['a'],
+ [2, 1],
+ undef, undef ]],
+ $empty => [[$p, 0, 16, 'start', 'IMG', 'IMG', 'IMG',
+ "<IMG SRC='foo'/>",
+ undef, undef,
+ ['IMG', 'SRC', "'foo'"],
+ [1, 3, 5, 3, 9, 5],
+ {'SRC', 'foo'}, ['SRC'] ],
+ [$p, 16, 0, 'end', 'IMG', '/IMG', 'IMG',
+ '',
+ undef, undef,
+ ['IMG'],
+ undef,
+ undef, undef ],
+ ],
+ $proc => [[$p, 0, 36, 'process', ' something completely different ',
+ '? something completely different ',
+ ' something completely different ',
+ '<? something completely different ?>',
+ undef, undef,
+ [' something completely different '],
+ [2, 32],
+ undef, undef ]],
+ "$end\n$end" => [[$p, 0, 4, 'end', 'a', '/a', 'a',
+ '</a>',
+ undef, undef,
+ ['a'],
+ [2, 1],
+ undef, undef],
+ [$p, 4, 1, 'text', undef, undef, undef,
+ "\n",
+ '', "\n",
+ undef,
+ undef,
+ undef, undef],
+ [$p, 5, 4, 'end', 'a', '/a', 'a',
+ '</a>',
+ undef, undef,
+ ['a'],
+ [2, 1],
+ undef, undef ]],
+ );
+
+use Test::More;
+plan tests => @tests / 2;
+
+sub string_tag {
+ my (@pieces) = @_;
+ my $part;
+ foreach $part ( @pieces ) {
+ if (!defined $part) {
+ $part = 'undef';
+ }
+ elsif (!ref $part) {
+ $part = "'$part'" if $part !~ /^\d+$/;
+ }
+ elsif ('ARRAY' eq ref $part ) {
+ $part = '[' . join(', ', string_tag(@$part)) . ']';
+ }
+ elsif ('HASH' eq ref $part ) {
+ $part = '{' . join(',', string_tag(%$part)) . '}';
+ }
+ else {
+ $part = '<' . ref($part) . '>';
+ }
+ }
+ return join(", ", @pieces );
+}
+
+my $i = 0;
+TEST:
+while (@tests) {
+ my($html, $expected) = splice @tests, 0, 2;
+ ++$i;
+
+ @result = ();
+ $p->parse($html)->eof;
+
+ shift(@result) if $result[0][3] eq "start_document";
+ pop(@result) if $result[-1][3] eq "end_document";
+
+ # Compare results for each element expected
+ foreach (@$expected) {
+ my $want = string_tag($_);
+ my $got = string_tag(shift @result);
+ if ($want ne $got) {
+ is($want, $got);
+ next TEST;
+ }
+ }
+
+ pass;
+}
--- /dev/null
+use Test::More tests => 2;
+
+use strict;
+use HTML::Parser;
+
+my @start;
+my @text;
+
+my $p = HTML::Parser->new(api_version => 3);
+$p->handler(start => \@start, '@{tagname, @attr}');
+$p->handler(text => \@text, '@{dtext}');
+$p->parse(<<EOT)->eof;
+Hi
+<a href="abc">Foo</a><b>:-)</b>
+EOT
+
+is("@start", "a href abc b");
+
+is(join("", @text), "Hi\nFoo:-)\n");
+
+
--- /dev/null
+use strict;
+use Test::More tests => 2;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new();
+$p->attr_encoded(1);
+
+my $text = "";
+$p->handler(start =>
+ sub {
+ my($tag, $attr) = @_;
+ $text .= "S[$tag";
+ for my $k (sort keys %$attr) {
+ my $v = $attr->{$k};
+ $text .= " $k=$v";
+ }
+ $text .= "]";
+ }, "tagname,attr");
+
+my $html = <<'EOT';
+<tag arg="&<>">
+EOT
+
+$p->parse($html)->eof;
+
+is($text, 'S[tag arg=&<>]');
+
+$text = "";
+$p->attr_encoded(0);
+$p->parse($html)->eof;
+
+is($text, 'S[tag arg=&<>]');
--- /dev/null
+use Test::More tests => 47;
+
+use strict;
+use HTML::Parser;
+
+my @expected;
+my $p = HTML::Parser->new(api_version => 3,
+ unbroken_text => 1,
+ default_h => [\@expected, '@{event, text}'],
+ );
+
+my $doc = <<'EOT';
+<title>Hi</title>
+<h1>Ho ho</h1>
+<--comment->
+EOT
+
+$p->parse($doc)->eof;
+#use Data::Dump; Data::Dump::dump(@expected);
+
+for my $i (1..length($doc)) {
+ my @t;
+ $p->handler(default => \@t);
+ $p->parse(chunk($doc, $i));
+
+ # check that we got the same stuff
+ #diag "X:", join(":", @t);
+ #diag "Y:", join(":", @expected);
+ is(join(":", @t), join(":", @expected));
+}
+
+sub chunk {
+ my $str = shift;
+ my $size = shift || 1;
+ sub {
+ my $res = substr($str, 0, $size);
+ #diag "...$res";
+ substr($str, 0, $size) = "";
+ $res;
+ }
+}
+
+# Test croking behaviour
+$p->handler(default => []);
+
+eval {
+ $p->parse(sub { die "Hi" });
+};
+like($@, qr/^Hi/);
--- /dev/null
+use strict;
+use Test::More tests => 8;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new();
+$p->case_sensitive(1);
+
+my $text = "";
+$p->handler(start =>
+ sub {
+ my($tag, $attr, $attrseq) = @_;
+ $text .= "S[$tag";
+ for my $k (sort keys %$attr) {
+ my $v = $attr->{$k};
+ $text .= " $k=$v";
+ }
+ if (@$attrseq) { $text.=" Order:" ; }
+ for my $k (@$attrseq) {
+ $text .= " $k";
+ }
+ $text .= "]";
+ }, "tagname,attr,attrseq");
+$p->handler(end =>
+ sub {
+ my ($tag) = @_;
+ $text .= "E[$tag]";
+ }, "tagname");
+
+my $html = <<'EOT';
+<tAg aRg="Value" arg="other value"></tAg>
+EOT
+my $cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]';
+my $ci = 'S[tag arg=Value Order: arg arg]E[tag]';
+
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(0);
+$p->parse($html)->eof;
+is($text, $ci);
+
+$text = "";
+$p->case_sensitive(1);
+$p->xml_mode(1);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(0);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$html = <<'EOT';
+<tAg aRg="Value" arg="other value"></tAg>
+<iGnOrE></ignore>
+EOT
+$p->ignore_tags('ignore');
+$cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]S[iGnOrE]';
+$ci = 'S[tag arg=Value Order: arg arg]E[tag]';
+
+$text = "";
+$p->case_sensitive(0);
+$p->xml_mode(0);
+$p->parse($html)->eof;
+is($text, $ci);
+
+$text = "";
+$p->case_sensitive(1);
+$p->xml_mode(0);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(0);
+$p->xml_mode(1);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(1);
+$p->xml_mode(1);
+$p->parse($html)->eof;
+is($text, $cs);
+
--- /dev/null
+use Test::More;
+
+require HTML::Parser;
+
+package P; @ISA = qw(HTML::Parser);
+
+my @result;
+sub start
+{
+ my($self, $tag, $attr) = @_;
+ push @result, "START[$tag]";
+ for (sort keys %$attr) {
+ push @result, "\t$_: " . $attr->{$_};
+ }
+ $start++;
+}
+
+sub end
+{
+ my($self, $tag) = @_;
+ push @result, "END[$tag]";
+ $end++;
+}
+
+sub text
+{
+ my $self = shift;
+ push @result, "TEXT[$_[0]]";
+ $text++;
+}
+
+sub comment
+{
+ my $self = shift;
+ push @result, "COMMENT[$_[0]]";
+ $comment++;
+}
+
+sub declaration
+{
+ my $self = shift;
+ push @result, "DECLARATION[$_[0]]";
+ $declaration++;
+}
+
+package main;
+
+
+@tests =
+ (
+ '<a ">' => ['START[a]', "\t\": \""],
+ '<a/>' => ['START[a/]',],
+ '<a />' => ['START[a]', "\t/: /"],
+ '<a a/>' => ['START[a]', "\ta/: a/"],
+ '<a a/=/>' => ['START[a]', "\ta/: /"],
+ '<a x="foo bar">' => ['START[a]', "\tx: foo\xA0bar"],
+ '<a x="foo bar">' => ['START[a]', "\tx: foo bar"],
+ '<Ã¥ >' => ['TEXT[<Ã¥]', 'TEXT[ >]'],
+ '2 < 5' => ['TEXT[2 ]', 'TEXT[<]', 'TEXT[ 5]'],
+ '2 <5> 2' => ['TEXT[2 ]', 'TEXT[<5>]', 'TEXT[ 2]'],
+ '2 <a' => ['TEXT[2 ]', 'TEXT[<a]'],
+ '2 <a> 2' => ['TEXT[2 ]', 'START[a]', 'TEXT[ 2]'],
+ '2 <a href=foo' => ['TEXT[2 ]', 'TEXT[<a href=foo]'],
+ "2 <a href='foo bar'> 2" =>
+ ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'],
+ '2 <a href=foo bar> 2' =>
+ ['TEXT[2 ]', 'START[a]', "\tbar: bar", "\thref: foo", 'TEXT[ 2]'],
+ '2 <a href="foo bar"> 2' =>
+ ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'],
+ '2 <a href="foo\'bar"> 2' =>
+ ['TEXT[2 ]', 'START[a]', "\thref: foo'bar", 'TEXT[ 2]'],
+ "2 <a href='foo\"bar'> 2" =>
+ ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'],
+ "2 <a href='foo"bar'> 2" =>
+ ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'],
+ '2 <a.b> 2' => ['TEXT[2 ]', 'START[a.b]', 'TEXT[ 2]'],
+ '2 <a.b-12 a.b = 2 a> 2' =>
+ ['TEXT[2 ]', 'START[a.b-12]', "\ta: a", "\ta.b: 2", 'TEXT[ 2]'],
+ '2 <a_b> 2' => ['TEXT[2 ]', 'START[a_b]', 'TEXT[ 2]'],
+ '<!ENTITY nbsp CDATA " " -- no-break space -->' =>
+ ['DECLARATION[ENTITY nbsp CDATA " " -- no-break space --]'],
+ '<!-- comment -->' => ['COMMENT[ comment ]'],
+ '<!-- comment -- --- comment -->' =>
+ ['COMMENT[ comment ]', 'COMMENT[- comment ]'],
+ '<!-- comment <!-- not comment --> comment -->' =>
+ ['COMMENT[ comment <!]', 'COMMENT[> comment ]'],
+ '<!-- <a href="foo"> -->' => ['COMMENT[ <a href="foo"> ]'],
+ );
+
+plan tests => @tests / 2;
+
+my $i = 0;
+TEST:
+while (@tests) {
+ ++$i;
+ my ($html, $expected) = splice @tests, 0, 2;
+ @result = ();
+
+ $p = new P;
+ $p->strict_comment(1);
+ $p->parse($html)->eof;
+
+ ok(eq_array($expected, \@result)) or diag("Expected: @$expected\n",
+ "Got: @result\n");
+}
--- /dev/null
+use Test::More tests => 1;
+
+use strict;
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3);
+my @com;
+$p->handler(comment => sub { push(@com, shift) }, "token0");
+$p->handler(default => sub { push(@com, shift() . "[" . shift() . "]") }, "event, text");
+
+$p->parse("<foo><><!><!-><!--><!---><!----><!-----><!------>");
+$p->parse("<!--+--");
+$p->parse("\n\n");
+$p->parse(">");
+$p->parse("<!a'b>");
+$p->parse("<!--foo--->");
+$p->parse("<!--foo---->");
+$p->parse("<!--foo----->-->");
+$p->parse("<foo>");
+$p->parse("<!3453><!-3456><!FOO><>");
+$p->eof;
+
+my $com = join(":", @com);
+is($com, "start_document[]:start[<foo>]:text[<>]::-:><!-::-:--:+:a'b:foo-:foo--:foo---:text[-->]:start[<foo>]:3453:-3456:FOO:text[<>]:end_document[]");
--- /dev/null
+#!/usr/bin/perl
+
+# This test will simply run the parser on random junk.
+
+my $no_tests = shift || 3;
+use Test::More;
+plan tests => $no_tests;
+
+use HTML::Parser ();
+
+my $file = "junk$$.html";
+die if -e $file;
+
+for (1..$no_tests) {
+
+ open(JUNK, ">$file") || die;
+ for (1 .. rand(5000)) {
+ for (1 .. rand(200)) {
+ print JUNK pack("N", rand(2**32));
+ }
+ print JUNK ("<", "&", ">")[rand(3)]; # make these a bit more likely
+ }
+ close(JUNK);
+
+ #diag "Parse @{[-s $file]} bytes of junk";
+
+ HTML::Parser->new->parse_file($file);
+ pass();
+
+ #print_mem();
+}
+
+unlink($file);
+
+
+sub print_mem
+{
+ # this probably only works on Linux
+ open(STAT, "/proc/self/status") || return;
+ while (<STAT>) {
+ diag $_ if /^VmSize/;
+ }
+}
--- /dev/null
+use Test::More tests => 2;
+
+use HTML::Parser;
+my $res = "";
+
+sub decl
+{
+ my $t = shift;
+ $res .= "[" . join("\n", map "<$_>", @$t) . "]";
+}
+
+sub text
+{
+ $res .= shift;
+}
+
+my $p = HTML::Parser->new(declaration_h => [\&decl, "tokens"],
+ default_h => [\&text, "text"],
+ );
+
+$p->parse(<<EOT)->eof;
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" --<comment>--
+ "http://www.w3.org/TR/html40/strict.dtd">
+
+<!ENTITY foo "<!-- foo -->">
+<!Entity foo "<!-- foo -->">
+
+<!row --> foo
+EOT
+
+is($res, <<EOT);
+[<DOCTYPE>
+<HTML>
+<PUBLIC>
+<"-//W3C//DTD HTML 4.01//EN">
+<--<comment>-->
+<"http://www.w3.org/TR/html40/strict.dtd">]
+
+[<ENTITY>
+<foo>
+<"<!-- foo -->">]
+[<Entity>
+<foo>
+<"<!-- foo -->">]
+
+<!row --> foo
+EOT
+
+$res = "";
+$p->parse(<<EOT)->eof;
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[]>
+EOT
+is($res, <<EOT);
+[<DOCTYPE>
+<html>
+<PUBLIC>
+<"-//W3C//DTD XHTML 1.0 Strict//EN">
+<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<[]>]
+EOT
+
--- /dev/null
+use strict;
+use Test::More tests => 3;
+
+my $text = "";
+use HTML::Parser ();
+my $p = HTML::Parser->new(default_h => [sub { $text .= shift }, "text"],
+ );
+
+my $html = <<'EOT';
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html40/strict.dtd">
+
+<title>foo</title>
+<!-- comment <a> -->
+<?process instruction>
+
+EOT
+
+$p->parse($html)->eof;
+
+is($text, $html);
+
+$text = "";
+$p->handler(start => sub { }, "");
+$p->handler(declaration => sub { }, "");
+$p->parse($html)->eof;
+
+my $html2;
+$html2 = $html;
+$html2 =~ s/<title>//;
+$html2 =~ s/<!DOCTYPE[^>]*>//;
+
+is($text, $html2);
+
+$text = "";
+$p->handler(start => undef);
+$p->parse($html)->eof;
+
+$html2 = $html;
+$html2 =~ s/<!DOCTYPE[^>]*>//;
+
+is($text, $html2);
--- /dev/null
+#!perl -w
+
+use Test;
+plan tests => 6;
+
+
+use HTML::Parser;
+use File::Spec;
+
+my $events;
+my $p = HTML::Parser->new(default_h => [sub { $events .= "$_[0]\n";}, "event"]);
+
+$events = "";
+$p->eof;
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse_file(File::Spec->devnull);
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse("");
+$p->eof;
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse("");
+$p->parse("");
+$p->eof;
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse("");
+$p->parse("<a>");
+$p->eof;
+ok($events, "start_document\nstart\nend_document\n");
+
+$events = "";
+$p->parse("<a> ");
+$p->eof;
+ok($events, "start_document\nstart\ntext\nend_document\n");
--- /dev/null
+#!perl -w
+
+use strict;
+use Test::More tests => 2;
+
+use HTML::Parser ();
+
+my $dtext = "";
+my $text = "";
+
+sub append
+{
+ $dtext .= shift;
+ $text .= shift;
+}
+
+my $p = HTML::Parser->new(text_h => [\&append, "dtext, text"],
+ default_h => [\&append, "text, text" ],
+ );
+
+my $doc = <<'EOT';
+<title>å</title>
+<a href="fooå">ååAA<A>AA</a>
+<?å>
+foo bar
+foo bar
+&xyzzy
+&xyzzy;
+<!-- � -->
+
+ÿ
+ÿ
+ÿG
+<!-- Ā -->
+�
+�
+&
+&#
+&#x
+<xmp>å</xmp>
+<script>å</script>
+<ScRIPT>å</scRIPT>
+<skript>å</script>
+EOT
+
+$p->parse($doc)->eof;
+
+is($text, $doc);
+is($dtext, <<"EOT");
+<title>Ã¥</title>
+<a href="fooå">ååAA<A>AA</a>
+<?å>
+foo\240bar
+foo\240bar
+&xyzzy
+&xyzzy;
+<!-- � -->
+\1
+\377
+\377
+\377G
+<!-- Ā -->
+�
+�
+&
+&#
+&#x
+<xmp>å</xmp>
+<script>å</script>
+<ScRIPT>å</scRIPT>
+<skript>Ã¥</script>
+EOT
--- /dev/null
+use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);
+
+use Test::More tests => 12;
+
+$a = "Våre norske tegn bør æres";
+
+decode_entities($a);
+
+is($a, "Våre norske tegn bør æres");
+
+encode_entities($a);
+
+is($a, "Våre norske tegn bør æres");
+
+decode_entities($a);
+encode_entities_numeric($a);
+
+is($a, "Våre norske tegn bør æres");
+
+$a = "<&>\"'";
+is(encode_entities($a), "<&>"'");
+is(encode_entities_numeric($a), "<&>"'");
+
+$a = "abcdef";
+is(encode_entities($a, 'a-c'), "abcdef");
+
+
+# See how well it does against rfc1866...
+$ent = $plain = "";
+while (<DATA>) {
+ next unless /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/;
+ $ent .= "&$1;";
+ $plain .= chr($2);
+}
+
+$a = $ent;
+decode_entities($a);
+is($a, $plain);
+
+# Try decoding when the ";" are left out
+$a = $ent,
+$a =~ s/;//g;
+decode_entities($a);
+is($a, $plain);
+
+
+$a = $plain;
+encode_entities($a);
+is($a, $ent);
+
+
+# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au>
+# Subject: HTML entities problem with 5.11
+# To: libwww-perl@ics.uci.edu
+# Date: Fri, 05 Sep 1997 16:56:55 +1000
+# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU>
+#
+# Hi. I've got a problem that has surfaced with the changes to
+# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening
+# in the process of encoding then decoding special entities. Eg, what goes
+# in as "abc&def&ghi" comes out as "abc&def;&ghi;".
+
+is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;");
+
+# Decoding of '
+is(decode_entities("'"), "'");
+is(encode_entities("'", "'"), "'");
+
+
+__END__
+# Quoted from rfc1866.txt
+
+14. Proposed Entities
+
+ The HTML DTD references the "Added Latin 1" entity set, which only
+ supplies named entities for a subset of the non-ASCII characters in
+ [ISO-8859-1], namely the accented characters. The following entities
+ should be supported so that all ISO 8859-1 characters may only be
+ referenced symbolically. The names for these entities are taken from
+ the appendixes of [SGML].
+
+ <!ENTITY nbsp CDATA " " -- no-break space -->
+ <!ENTITY iexcl CDATA "¡" -- inverted exclamation mark -->
+ <!ENTITY cent CDATA "¢" -- cent sign -->
+ <!ENTITY pound CDATA "£" -- pound sterling sign -->
+ <!ENTITY curren CDATA "¤" -- general currency sign -->
+ <!ENTITY yen CDATA "¥" -- yen sign -->
+ <!ENTITY brvbar CDATA "¦" -- broken (vertical) bar -->
+ <!ENTITY sect CDATA "§" -- section sign -->
+ <!ENTITY uml CDATA "¨" -- umlaut (dieresis) -->
+ <!ENTITY copy CDATA "©" -- copyright sign -->
+ <!ENTITY ordf CDATA "ª" -- ordinal indicator, feminine -->
+ <!ENTITY laquo CDATA "«" -- angle quotation mark, left -->
+ <!ENTITY not CDATA "¬" -- not sign -->
+ <!ENTITY shy CDATA "­" -- soft hyphen -->
+ <!ENTITY reg CDATA "®" -- registered sign -->
+ <!ENTITY macr CDATA "¯" -- macron -->
+ <!ENTITY deg CDATA "°" -- degree sign -->
+ <!ENTITY plusmn CDATA "±" -- plus-or-minus sign -->
+ <!ENTITY sup2 CDATA "²" -- superscript two -->
+ <!ENTITY sup3 CDATA "³" -- superscript three -->
+ <!ENTITY acute CDATA "´" -- acute accent -->
+ <!ENTITY micro CDATA "µ" -- micro sign -->
+ <!ENTITY para CDATA "¶" -- pilcrow (paragraph sign) -->
+ <!ENTITY middot CDATA "·" -- middle dot -->
+ <!ENTITY cedil CDATA "¸" -- cedilla -->
+ <!ENTITY sup1 CDATA "¹" -- superscript one -->
+ <!ENTITY ordm CDATA "º" -- ordinal indicator, masculine -->
+ <!ENTITY raquo CDATA "»" -- angle quotation mark, right -->
+ <!ENTITY frac14 CDATA "¼" -- fraction one-quarter -->
+ <!ENTITY frac12 CDATA "½" -- fraction one-half -->
+ <!ENTITY frac34 CDATA "¾" -- fraction three-quarters -->
+ <!ENTITY iquest CDATA "¿" -- inverted question mark -->
+ <!ENTITY Agrave CDATA "À" -- capital A, grave accent -->
+ <!ENTITY Aacute CDATA "Á" -- capital A, acute accent -->
+ <!ENTITY Acirc CDATA "Â" -- capital A, circumflex accent -->
+
+
+
+Berners-Lee & Connolly Standards Track [Page 75]
+\f
+RFC 1866 Hypertext Markup Language - 2.0 November 1995
+
+
+ <!ENTITY Atilde CDATA "Ã" -- capital A, tilde -->
+ <!ENTITY Auml CDATA "Ä" -- capital A, dieresis or umlaut mark -->
+ <!ENTITY Aring CDATA "Å" -- capital A, ring -->
+ <!ENTITY AElig CDATA "Æ" -- capital AE diphthong (ligature) -->
+ <!ENTITY Ccedil CDATA "Ç" -- capital C, cedilla -->
+ <!ENTITY Egrave CDATA "È" -- capital E, grave accent -->
+ <!ENTITY Eacute CDATA "É" -- capital E, acute accent -->
+ <!ENTITY Ecirc CDATA "Ê" -- capital E, circumflex accent -->
+ <!ENTITY Euml CDATA "Ë" -- capital E, dieresis or umlaut mark -->
+ <!ENTITY Igrave CDATA "Ì" -- capital I, grave accent -->
+ <!ENTITY Iacute CDATA "Í" -- capital I, acute accent -->
+ <!ENTITY Icirc CDATA "Î" -- capital I, circumflex accent -->
+ <!ENTITY Iuml CDATA "Ï" -- capital I, dieresis or umlaut mark -->
+ <!ENTITY ETH CDATA "Ð" -- capital Eth, Icelandic -->
+ <!ENTITY Ntilde CDATA "Ñ" -- capital N, tilde -->
+ <!ENTITY Ograve CDATA "Ò" -- capital O, grave accent -->
+ <!ENTITY Oacute CDATA "Ó" -- capital O, acute accent -->
+ <!ENTITY Ocirc CDATA "Ô" -- capital O, circumflex accent -->
+ <!ENTITY Otilde CDATA "Õ" -- capital O, tilde -->
+ <!ENTITY Ouml CDATA "Ö" -- capital O, dieresis or umlaut mark -->
+ <!ENTITY times CDATA "×" -- multiply sign -->
+ <!ENTITY Oslash CDATA "Ø" -- capital O, slash -->
+ <!ENTITY Ugrave CDATA "Ù" -- capital U, grave accent -->
+ <!ENTITY Uacute CDATA "Ú" -- capital U, acute accent -->
+ <!ENTITY Ucirc CDATA "Û" -- capital U, circumflex accent -->
+ <!ENTITY Uuml CDATA "Ü" -- capital U, dieresis or umlaut mark -->
+ <!ENTITY Yacute CDATA "Ý" -- capital Y, acute accent -->
+ <!ENTITY THORN CDATA "Þ" -- capital THORN, Icelandic -->
+ <!ENTITY szlig CDATA "ß" -- small sharp s, German (sz ligature) -->
+ <!ENTITY agrave CDATA "à" -- small a, grave accent -->
+ <!ENTITY aacute CDATA "á" -- small a, acute accent -->
+ <!ENTITY acirc CDATA "â" -- small a, circumflex accent -->
+ <!ENTITY atilde CDATA "ã" -- small a, tilde -->
+ <!ENTITY auml CDATA "ä" -- small a, dieresis or umlaut mark -->
+ <!ENTITY aring CDATA "å" -- small a, ring -->
+ <!ENTITY aelig CDATA "æ" -- small ae diphthong (ligature) -->
+ <!ENTITY ccedil CDATA "ç" -- small c, cedilla -->
+ <!ENTITY egrave CDATA "è" -- small e, grave accent -->
+ <!ENTITY eacute CDATA "é" -- small e, acute accent -->
+ <!ENTITY ecirc CDATA "ê" -- small e, circumflex accent -->
+ <!ENTITY euml CDATA "ë" -- small e, dieresis or umlaut mark -->
+ <!ENTITY igrave CDATA "ì" -- small i, grave accent -->
+ <!ENTITY iacute CDATA "í" -- small i, acute accent -->
+ <!ENTITY icirc CDATA "î" -- small i, circumflex accent -->
+ <!ENTITY iuml CDATA "ï" -- small i, dieresis or umlaut mark -->
+ <!ENTITY eth CDATA "ð" -- small eth, Icelandic -->
+ <!ENTITY ntilde CDATA "ñ" -- small n, tilde -->
+ <!ENTITY ograve CDATA "ò" -- small o, grave accent -->
+
+
+
+Berners-Lee & Connolly Standards Track [Page 76]
+\f
+RFC 1866 Hypertext Markup Language - 2.0 November 1995
+
+
+ <!ENTITY oacute CDATA "ó" -- small o, acute accent -->
+ <!ENTITY ocirc CDATA "ô" -- small o, circumflex accent -->
+ <!ENTITY otilde CDATA "õ" -- small o, tilde -->
+ <!ENTITY ouml CDATA "ö" -- small o, dieresis or umlaut mark -->
+ <!ENTITY divide CDATA "÷" -- divide sign -->
+ <!ENTITY oslash CDATA "ø" -- small o, slash -->
+ <!ENTITY ugrave CDATA "ù" -- small u, grave accent -->
+ <!ENTITY uacute CDATA "ú" -- small u, acute accent -->
+ <!ENTITY ucirc CDATA "û" -- small u, circumflex accent -->
+ <!ENTITY uuml CDATA "ü" -- small u, dieresis or umlaut mark -->
+ <!ENTITY yacute CDATA "ý" -- small y, acute accent -->
+ <!ENTITY thorn CDATA "þ" -- small thorn, Icelandic -->
+ <!ENTITY yuml CDATA "ÿ" -- small y, dieresis or umlaut mark -->
--- /dev/null
+#!perl -w
+
+use strict;
+use Test::More tests => 9;
+
+use HTML::Entities qw(_decode_entities);
+
+eval {
+ _decode_entities("<", undef);
+};
+like($@, qr/^Can't inline decode readonly string/);
+
+eval {
+ my $a = "";
+ _decode_entities($a, $a);
+};
+like($@, qr/^2nd argument must be hash reference/);
+
+eval {
+ my $a = "";
+ _decode_entities($a, []);
+};
+like($@, qr/^2nd argument must be hash reference/);
+
+$a = "<";
+_decode_entities($a, undef);
+is($a, "<");
+
+_decode_entities($a, { "lt" => "<" });
+is($a, "<");
+
+my $x = "x" x 20;
+
+my $err;
+for (":", ":a", "a:", "a:a", "a:a:a", "a:::a") {
+ my $a = $_;
+ $a =~ s/:/&a;/g;
+ my $b = $_;
+ $b =~ s/:/$x/g;
+ _decode_entities($a, { "a" => $x });
+ if ($a ne $b) {
+ diag "Something went wrong with '$_'";
+ $err++;
+ }
+}
+ok(!$err);
+
+$a = "foo bar";
+_decode_entities($a, \%HTML::Entities::entity2char);
+is($a, "foo\xA0bar");
+
+$a = "foo bar";
+_decode_entities($a, \%HTML::Entities::entity2char);
+is($a, "foo bar");
+
+_decode_entities($a, \%HTML::Entities::entity2char, 1);
+is($a, "foo\xA0bar");
--- /dev/null
+#!/usr/bin/perl -w
+
+use Test::More tests => 12;
+use strict;
+
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)]);
+$p->ignore_elements("script");
+$p->unbroken_text(1);
+
+$p->handler(default => [], "event, text");
+$p->parse(<<"EOT")->eof;
+<html><head><title>foo</title><Script language="Perl">
+ while (<B>) {
+ # ...
+ }
+</Script><body>
+This is an <i>italic</i> and <b>bold</b> text.
+</body>
+</html>
+EOT
+
+my $t = join("||", map join("|", @$_), @{$p->handler("default")});
+#diag $t;
+
+is($t, "start_document|||start|<html>||start|<head>||start|<title>||text|foo||end|</title>||start|<body>||text|
+This is an italic and bold text.
+||end|</body>||text|
+||end|</html>||text|
+||end_document|", 'ignore_elements');
+
+
+#------------------------------------------------------
+
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags("a");
+$p->handler(start => sub {
+ my($tagname, %attr) = @_;
+ ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start');
+ }, 'tagname, @attr');
+$p->handler(end => sub {
+ my $tagname = shift;
+ is($tagname, "a", 'report_tags end');
+ }, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+
+This is <a href="#a">very nice</a> example.
+
+EOT
+
+
+#------------------------------------------------------
+
+my @tags;
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(a em));
+$p->ignore_tags(qw(em));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+
+This is <em>yet another</em> <a href="#a">very nice</a> example.
+
+EOT
+is(join('|', @tags), 'a', 'report_tags followed by ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1));
+$p->report_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1|h2', 'reset report_tags filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->ignore_tags(qw(h2));
+$p->report_tags(qw(h1 h2));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->ignore_tags(qw(h2));
+$p->report_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->report_tags(qw(h3));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+<h3>Next example</h3>
+
+EOT
+is(join('|', @tags), 'h3', 'report_tags replaces filter');
+
+
+#------------------------------------------------------
+
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h1 h2));
+$p->ignore_tags(qw(h3));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+<h3>Next example</h3>
+
+EOT
+is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h2));
+$p->ignore_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h2));
+$p->report_tags(qw(h1 h2));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1', 'ignore_tags before report_tags');
+#------------------------------------------------------
+
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_elements("script");
+my $res="";
+$p->handler(default=> sub {$res.=$_[0];}, 'text');
+$p->parse(<<'EOT')->eof;
+A <script> B </script> C </script> D <script> E </script> F
+EOT
+is($res,"A C D F\n","ignore </script> without <script> correctly");
--- /dev/null
+use Test::More tests => 3;
+
+my $HTML = <<EOT;
+
+<!DOCTYPE HTML>
+<!-- comment
+<h1>Foo</h1>
+-->
+
+<H1
+>Bar</H1
+>
+
+<Table><tr><td>1<td>2<td>3
+<tr>
+</table>
+
+<?process>
+
+EOT
+
+use HTML::Filter;
+use SelectSaver;
+
+my $tmpfile = "test-$$.htm";
+die "$tmpfile already exists" if -e $tmpfile;
+
+open(HTML, ">$tmpfile") or die "$!";
+
+{
+ my $save = new SelectSaver(HTML);
+ HTML::Filter->new->parse($HTML)->eof;
+}
+close(HTML);
+
+open(HTML, $tmpfile) or die "$!";
+local($/) = undef;
+my $FILTERED = <HTML>;
+close(HTML);
+
+#print $FILTERED;
+is($FILTERED, $HTML);
+
+{
+ package MyFilter;
+ @ISA=qw(HTML::Filter);
+ sub comment {}
+ sub output { push(@{$_[0]->{fhtml}}, $_[1]) }
+ sub filtered_html { join("", @{$_[0]->{fhtml}}) }
+}
+
+my $f2 = MyFilter->new->parse_file($tmpfile)->filtered_html;
+unlink($tmpfile) or warn "Can't unlink $tmpfile: $!";
+
+#diag $f2;
+
+unlike($f2, qr/Foo/);
+like($f2, qr/Bar/);
+
+
--- /dev/null
+use Test::More tests => 6;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->handler(start => sub { my $attr = shift; is($attr->{testno}, 1) },
+ "attr");
+$p->handler(end => sub { shift->eof }, "self");
+my $text;
+$p->handler(text => sub { $text = shift }, "text");
+
+is($p->parse("<foo testno=1>"), $p);
+
+$text = '';
+ok(!$p->parse("</foo><foo testno=999>"));
+ok(!$text);
+
+$p->handler(end => sub { $p->parse("foo"); }, "");
+eval {
+ $p->parse("</foo>");
+};
+like($@, qr/Parse loop not allowed/);
+
+# We used to get into an infinite loop if the eof triggered
+# handler called ->eof
+
+use HTML::Parser;
+$p = HTML::Parser->new(api_version => 3);
+
+my $i;
+$p->handler("default" =>
+ sub {
+ my $p=shift;
+ #++$i; diag "$i @_";
+ $p->eof;
+ }, "self, event");
+$p->parse("Foo");
+$p->eof;
+
+# We used to sometimes trigger events after a handler signaled eof
+my $title='';
+$p = HTML::Parser->new(api_version => 3,);
+$p->handler(start=> \&title_handler, 'tagname, self');
+$p->parse("<head><title>foo</title>\n</head>");
+is($title, "foo");
+
+sub title_handler {
+ return if shift ne 'title';
+ my $self = shift;
+ $self->handler(text => sub { $title .= shift}, 'dtext');
+ $self->handler(end => sub { shift->eof if shift eq 'title' }, 'tagname, self');
+}
--- /dev/null
+# Test handler method
+
+use Test::More tests => 11;
+
+my $testno;
+
+use HTML::Parser;
+{
+ package MyParser;
+ use vars qw(@ISA);
+ @ISA=(HTML::Parser);
+
+ sub foo
+ {
+ Test::More::is($_[1]{testno}, Test::More->builder->current_test + 1);
+ }
+
+ sub bar
+ {
+ Test::More::is($_[1], Test::More->builder->current_test + 1);
+ }
+}
+
+$p = MyParser->new(api_version => 3);
+
+eval {
+ $p->handler(foo => "foo", "foo");
+};
+
+like($@, qr/^No handler for foo events/);
+
+eval {
+ $p->handler(start => "foo", "foo");
+};
+like($@, qr/^Unrecognized identifier foo in argspec/);
+
+my $h = $p->handler(start => "foo", "self,tagname");
+ok(!defined($h));
+
+$x = \substr("xfoo", 1);
+$p->handler(start => $$x, "self,attr");
+$p->parse("<a testno=4>");
+
+$p->handler(start => \&MyParser::foo, "self,attr");
+$p->parse("<a testno=5>");
+
+$p->handler(start => "foo");
+$p->parse("<a testno=6>");
+
+$p->handler(start => "bar", "self,'7'");
+$p->parse("<a>");
+
+eval {
+ $p->handler(start => {}, "self");
+};
+like($@, qr/^Only code or array references allowed as handler/);
+
+$a = [];
+$p->handler(start => $a);
+$h = $p->handler("start");
+is($p->handler("start", "foo"), $a);
+
+is($p->handler("start", \&MyParser::foo, ""), "foo");
+
+is($p->handler("start"), \&MyParser::foo);
+
+
--- /dev/null
+use Test::More tests => 1;
+
+eval {
+ require HTML::HeadParser;
+ $p = HTML::HeadParser->new;
+};
+
+SKIP: {
+skip $@, 1 if $@ =~ /^Can't locate HTTP/;
+
+$p = HTML::HeadParser->new($h);
+$p->parse(<<EOT);
+<title>Stupid example</title>
+<base href="http://www.sn.no/libwww-perl/">
+Normal text starts here.
+EOT
+$h = $p->header;
+undef $p;
+is($h->title, "Stupid example");
+}
--- /dev/null
+#!perl -w
+
+use strict;
+use Test::More tests => 11;
+
+{ package H;
+ sub new { bless {}, shift; }
+
+ sub header {
+ my $self = shift;
+ my $key = uc(shift);
+ my $old = $self->{$key};
+ if (@_) { $self->{$key} = shift; }
+ $old;
+ }
+
+ sub push_header {
+ my($self, $k, $v) = @_;
+ $k = uc($k);
+ if (exists $self->{$k}) {
+ $self->{$k} = [ $self->{$k} ] unless ref $self->{$k};
+ push(@{$self->{$k}}, $v);
+ } else {
+ $self->{$k} = $v;
+ }
+ }
+
+ sub as_string {
+ my $self = shift;
+ my $str = "";
+ for (sort keys %$self) {
+ if (ref($self->{$_})) {
+ my $v;
+ for $v (@{$self->{$_}}) {
+ $str .= "$_: $v\n";
+ }
+ } else {
+ $str .= "$_: $self->{$_}\n";
+ }
+ }
+ $str;
+ }
+}
+
+
+my $HTML = <<'EOT';
+
+<title>Å være eller å ikke være</title>
+<meta http-equiv="Expires" content="Soon">
+<meta http-equiv="Foo" content="Bar">
+<link href="mailto:gisle@aas.no" rev=made title="Gisle Aas">
+
+<script>
+
+ "</script>"
+ ignore this
+
+</script>
+
+<base href="http://www.sn.no">
+<meta name="Keywords" content="test, test, test,...">
+<meta name="Keywords" content="more">
+
+Dette er vanlig tekst. Denne teksten definerer også slutten på
+<head> delen av dokumentet.
+
+<style>
+
+ "</style>"
+ ignore this too
+
+</style>
+
+<isindex>
+
+Dette er også vanlig tekst som ikke skal blir parset i det hele tatt.
+
+EOT
+
+$| = 1;
+
+#$HTML::HeadParser::DEBUG = 1;
+require HTML::HeadParser;
+my $p = HTML::HeadParser->new( H->new );
+
+if ($p->parse($HTML)) {
+ fail("Need more data which should not happen");
+} else {
+ #diag $p->as_string;
+ pass();
+}
+
+like($p->header('Title'), qr/Å være eller å ikke være/);
+is($p->header('Expires'), 'Soon');
+is($p->header('Content-Base'), 'http://www.sn.no');
+like($p->header('Link'), qr/<mailto:gisle\@aas.no>/);
+
+# This header should not be present because the head ended
+ok(!$p->header('Isindex'));
+
+
+# Try feeding one char at a time
+my $expected = $p->as_string;
+my $nl = 1;
+$p = HTML::HeadParser->new(H->new);
+while ($HTML =~ /(.)/sg) {
+ #print STDERR '#' if $nl;
+ #print STDERR $1;
+ $nl = $1 eq "\n";
+ $p->parse($1) or last;
+}
+is($p->as_string, $expected);
+
+
+# Try reading it from a file
+my $file = "hptest$$.html";
+die "$file already exists" if -e $file;
+
+open(FILE, ">$file") or die "Can't create $file: $!";
+binmode(FILE);
+print FILE $HTML;
+print FILE "<p>This is more content...</p>\n" x 2000;
+print FILE "<title>Buuuh!</title>\n" x 200;
+close FILE or die "Can't close $file: $!";
+
+$p = HTML::HeadParser->new(H->new);
+$p->parse_file($file);
+unlink($file) or warn "Can't unlink $file: $!";
+
+is($p->header("Title"), "Å være eller å ikke være");
+
+
+# We got into an infinite loop on data without tags and no EOL.
+# This was actually a HTML::Parser bug.
+open(FILE, ">$file") or die "Can't create $file: $!";
+print FILE "Foo";
+close(FILE);
+
+$p = HTML::HeadParser->new(H->new);
+$p->parse_file($file);
+unlink($file) or warn "Can't unlink $file: $!";
+
+ok(!$p->as_string);
+
+SKIP: {
+ skip "Need Unicode support", 2 if $] < 5.008;
+
+ # Test that the Unicode BOM does not confuse us?
+ $p = HTML::HeadParser->new(H->new);
+ ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>"));
+ $p->eof;
+
+ is($p->header("title"), "Hi <foo>");
+}
--- /dev/null
+
+use Test::More tests => 4;
+
+use strict;
+use HTML::Parser ();
+
+my $html = '<A href="foo">text</A>';
+
+my $text = '';
+my $p = HTML::Parser->new(default_h => [sub {$text .= shift;}, 'text']);
+$p->parse($html)->eof;
+is($text, $html);
+
+$text = '';
+$p->handler(start => "");
+$p->parse($html)->eof;
+is($text, 'text</A>');
+
+$text = '';
+$p->handler(end => 0);
+$p->parse($html)->eof;
+is($text, 'text');
+
+$text = '';
+$p->handler(start => undef);
+$p->parse($html)->eof;
+is($text, '<A href="foo">text');
--- /dev/null
+# Exercise the tokenpos buffer allocation routines by feeding it
+# very large tags.
+
+use Test::More tests => 2;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->handler("start" =>
+ sub {
+ my $tp = shift;
+ #diag int(@$tp), " - ", join(", ", @$tp);
+ is(@$tp, 2 + 26 * 6 * 4);
+ }, "tokenpos");
+
+$p->handler("declaration" =>
+ sub {
+ my $t = shift;
+ #diag int(@$t), " - @$t";
+ is(@$t, 26 * 6 * 2 + 1);
+ }, "tokens");
+
+$p->parse("<a ");
+for ("aa" .. "fz") {
+ $p->parse("$_=1 ");
+}
+$p->parse(">");
+
+$p->parse("<!DOCTYPE ");
+for ("aa" .. "fz") {
+ $p->parse("$_ -- $_ -- ");
+}
+$p->parse(">");
+$p->eof;
+exit;
+
--- /dev/null
+# This test that HTML::LinkExtor really absolutize links correctly
+# when a base URL is given to the constructor.
+
+use Test::More tests => 5;
+require HTML::LinkExtor;
+
+SKIP: {
+eval {
+ require URI;
+};
+skip $@, 5 if $@;
+
+# Try with base URL and the $p->links interface.
+$p = HTML::LinkExtor->new(undef, "http://www.sn.no/foo/foo.html");
+$p->parse(<<HTML)->eof;
+<head>
+<base href="http://www.sn.no/">
+</head>
+<body background="http://www.sn.no/sn.gif">
+
+This is <A HREF="link.html">link</a> and an <img SRC="img.jpg"
+lowsrc="img.gif" alt="Image">.
+HTML
+
+@p = $p->links;
+
+# There should be 4 links in the document
+is(@p, 4);
+
+for (@p) {
+ ($t, %attr) = @$_ if $_->[0] eq 'img';
+}
+
+is($t, 'img');
+
+is(delete $attr{src}, "http://www.sn.no/foo/img.jpg");
+
+is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif");
+
+ok(!scalar(keys %attr)); # there should be no more attributes
+}
--- /dev/null
+use Test::More tests => 4;
+
+require HTML::LinkExtor;
+
+$HTML = <<HTML;
+<head>
+<base href="http://www.sn.no/">
+</head>
+<body background="http://www.sn.no/sn.gif">
+
+This is <A HREF="link.html">link</a> and an <img SRC="img.jpg"
+lowsrc="img.gif" alt="Image">.
+HTML
+
+
+# Try the callback interface
+$links = "";
+$p = HTML::LinkExtor->new(
+ sub {
+ my($tag, %links) = @_;
+ #diag "$tag @{[%links]}";
+ $links .= "$tag @{[%links]}\n";
+ });
+
+$p->parse($HTML); $p->eof;
+
+ok($links =~ m|^base href http://www\.sn\.no/$|m);
+ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m);
+ok($links =~ m|^a href link\.html$|m);
+
+# Used to be problems when using the links method on a document with
+# no links it it. This is a test to prove that it works.
+$p = new HTML::LinkExtor;
+$p->parse("this is a document with no links"); $p->eof;
+@a = $p->links;
+is(@a, 0);
--- /dev/null
+# Check that the magic signature at the top of struct p_state works and that we
+# catch modifications to _hparser_xs_state gracefully
+
+use Test::More tests => 5;
+
+use HTML::Parser;
+
+$p = HTML::Parser->new(api_version => 3);
+
+$p->xml_mode(1);
+
+# We should not be able to simply modify this stuff
+eval {
+ ${$p->{_hparser_xs_state}} += 4;
+};
+like($@, qr/^Modification of a read-only value attempted/);
+
+
+my $x = delete $p->{_hparser_xs_state};
+
+eval {
+ $p->xml_mode(1);
+};
+like($@, qr/^Can't find '_hparser_xs_state'/);
+
+$p->{_hparser_xs_state} = \($$x + 16);
+
+eval {
+ $p->xml_mode(1);
+};
+like($@, $] >= 5.008 ? qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/);
+
+$p->{_hparser_xs_state} = 33;
+eval {
+ $p->xml_mode(1);
+};
+like($@, qr/^_hparser_xs_state element is not a reference/);
+
+$p->{_hparser_xs_state} = $x;
+
+ok($p->xml_mode(0));
--- /dev/null
+#!/usr/bin/perl -w
+
+use strict;
+my $tag;
+my $text;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(start_h => [sub { $tag = shift }, "tagname"],
+ text_h => [sub { $text .= shift }, "dtext"],
+ );
+
+
+use Test::More tests => 14;
+
+SKIP: {
+eval {
+ $p->marked_sections(1);
+};
+skip $@, 14 if $@;
+
+$p->parse("<![[foo]]>");
+is($text, "foo");
+
+$p->parse("<![TEMP INCLUDE[bar]]>");
+is($text, "foobar");
+
+$p->parse("<![ INCLUDE -- IGNORE -- [foo<![IGNORE[bar]]>]]>\n<br>");
+is($text, "foobarfoo\n");
+
+$text = "";
+$p->parse("<![ CDATA [<foo");
+$p->parse("<![IGNORE[bar]]>,bar>]]><br>");
+is($text, "<foo<![IGNORE[bar,bar>]]>");
+
+$text = "";
+$p->parse("<![ RCDATA [å<a>]]><![CDATA[å<a>]]>å<a><br>");
+is($text, "Ã¥<a>å<a>Ã¥");
+is($tag, "br");
+
+$text = "";
+$p->parse("<![INCLUDE RCDATA CDATA IGNORE [fooå<a>]]><br>");
+is($text, "");
+
+$text = "";
+$p->parse("<![INCLUDE RCDATA CDATA [fooå<a>]]><br>");
+is($text, "fooå<a>");
+
+$text = "";
+$p->parse("<![INCLUDE RCDATA [fooå<a>]]><br>");
+is($text, "fooå<a>");
+
+$text = "";
+$p->parse("<![INCLUDE [fooå<a>]]><br>");
+is($text, "fooå");
+
+$text = "";
+$p->parse("<![[fooå<a>]]><br>");
+is($text, "fooå");
+
+# offsets/line/column numbers
+$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"],
+ marked_sections => 1,
+ );
+$p->parse(<<'EOT')->eof;
+<title>Test</title>
+<![CDATA
+ [fooå<a>
+]]>
+<![[
+INCLUDE
+STUFF
+]]>
+ <h1>Test</h1>
+EOT
+
+my @x;
+sub x {
+ my($line, $col, $offset, $event, $text) = @_;
+ $text =~ s/\n/\\n/g;
+ $text =~ s/ /./g;
+ push(@x, "$line.$col:$offset $event \"$text\"\n");
+}
+
+#diag @x;
+is(join("", @x), <<'EOT');
+1.0:0 start_document ""
+1.0:0 start "<title>"
+1.7:7 text "Test"
+1.11:11 end "</title>"
+1.19:19 text "\n"
+3.3:32 text "fooå<a>\n"
+4.3:49 text "\n"
+5.4:54 text "\nINCLUDE\nSTUFF\n"
+8.3:72 text "\n.."
+9.2:75 start "<h1>"
+9.6:79 text "Test"
+9.10:83 end "</h1>"
+9.15:88 text "\n"
+10.0:89 end_document ""
+EOT
+
+my $doc = "<Tag><![CDATA[This is cdata]]></Tag>";
+my $result = "";
+$p = HTML::Parser->new(
+ marked_sections => 1,
+ handlers => {
+ default => [ sub { $result .= join("",@_); }, "skipped_text,text" ]
+ }
+)->parse($doc)->eof;
+is($doc, $result);
+
+$text = "";
+$p = HTML::Parser->new(
+ text_h => [sub { $text .= shift }, "dtext"],
+ marked_sections => 1,
+);
+
+$p->parse("<![CDATA[foo [1]]]>");
+is($text, "foo [1]", "CDATA text ending in square bracket");
+
+} # SKIP
--- /dev/null
+#!perl -w
+
+use strict;
+use HTML::Parser;
+
+use Test::More tests => 2;
+
+my $TEXT = "";
+sub h
+{
+ my($event, $tagname, $text) = @_;
+ for ($event, $tagname, $text) {
+ if (defined) {
+ s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge;
+ }
+ else {
+ $_ = "<undef>";
+ }
+ }
+
+ $TEXT .= "[$event,$tagname,$text]\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"]);
+$p->parse("<a>");
+$p->parse("</a f>");
+$p->parse("</a 'foo<>' 'bar>' x>");
+$p->parse("</a \"foo<>\"");
+$p->parse(" \"bar>\" x>");
+$p->parse("</ foo bar>");
+$p->parse("</ \"<>\" >");
+$p->parse("<!--comment>text<!--comment><p");
+$p->eof;
+
+is($TEXT, <<'EOT');
+[start_document,<undef>,]
+[start,a,<a>]
+[end,a,</a f>]
+[end,a,</a 'foo<>' 'bar>' x>]
+[end,a,</a "foo<>" "bar>" x>]
+[comment, foo bar,</ foo bar>]
+[comment, "<>" ,</ "<>" >]
+[comment,comment,<!--comment>]
+[text,<undef>,text]
+[comment,comment,<!--comment>]
+[comment,p,<p]
+[end_document,<undef>,]
+EOT
+
+$TEXT = "";
+$p->parse("<!comment>");
+$p->eof;
+
+is($TEXT, <<'EOT');
+[start_document,<undef>,]
+[comment,comment,<!comment>]
+[end_document,<undef>,]
+EOT
--- /dev/null
+use strict;
+use HTML::Parser ();
+use Test::More tests => 1;
+
+my $HTML = <<'EOT';
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html40/strict.dtd">
+
+<foo bar baz=3>heisan
+</foo> <?process>
+<!-- comment -->
+<xmp>xmp</xmp>
+
+EOT
+
+my $p = HTML::Parser->new(api_version => 3);
+
+my $sum_len = 0;
+my $count = 0;
+my $err;
+
+$p->handler(default =>
+ sub {
+ my($offset, $length, $offset_end, $line, $col, $text) = @_;
+ my $copy = $text;
+ $copy =~ s/\n/\\n/g;
+ substr($copy, 30) = "..." if length($copy) > 32;
+ #diag sprintf ">>> %d.%d %s", $line, $col, $copy;
+ if ($offset != $sum_len) {
+ diag "offset mismatch $offset vs $sum_len";
+ $err++;
+ }
+ if ($offset_end != $offset + $length) {
+ diag "offset_end $offset_end wrong";
+ $err++;
+ }
+ if ($length != length($text)) {
+ diag "length mismatch";
+ $err++;
+ }
+ if (substr($HTML, $offset, $length) ne $text) {
+ diag "content mismatch";
+ $err++;
+ }
+ $sum_len += $length;
+ $count++;
+ },
+ 'offset,length,offset_end,line,column,text');
+
+for (split(//, $HTML)) {
+ $p->parse($_);
+}
+$p->eof;
+
+ok($count > 5 && !$err);
+
+
--- /dev/null
+# Test option setting methods
+
+use Test::More tests => 10;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3,
+ xml_mode => 1);
+my $old;
+
+$old = $p->boolean_attribute_value("foo");
+ok(!defined $old);
+
+$old = $p->boolean_attribute_value();
+is($old, "foo");
+
+$old = $p->boolean_attribute_value(undef);
+is($old, "foo");
+ok(!defined($p->boolean_attribute_value));
+
+ok($p->xml_mode(0));
+ok(!$p->xml_mode);
+
+my $seen_buggy_comment_warning;
+$SIG{__WARN__} =
+ sub {
+ local $_ = shift;
+ $seen_buggy_comment_warning++
+ if /^netscape_buggy_comment\(\) is deprecated/;
+ };
+
+ok(!$p->strict_comment(1));
+ok($p->strict_comment);
+ok(!$p->netscape_buggy_comment);
+ok($seen_buggy_comment_warning);
--- /dev/null
+use Test::More tests => 6;
+
+my $filename = "file$$.htm";
+die "$filename is already there" if -e $filename;
+open(FILE, ">$filename") || die "Can't create $filename: $!";
+print FILE <<'EOT'; close(FILE);
+<title>Heisan</title>
+EOT
+
+{
+ package MyParser;
+ require HTML::Parser;
+ @ISA=qw(HTML::Parser);
+
+ sub start
+ {
+ my($self, $tag, $attr) = @_;
+ Test::More::is($tag, "title");
+ }
+}
+
+MyParser->new->parse_file($filename);
+open(FILE, $filename) || die;
+MyParser->new->parse_file(*FILE);
+seek(FILE, 0, 0) || die;
+MyParser->new->parse_file(\*FILE);
+close(FILE);
+
+require IO::File;
+my $io = IO::File->new($filename) || die;
+MyParser->new->parse_file($io);
+$io->seek(0, 0) || die;
+MyParser->new->parse_file(*$io);
+
+my $text = '';
+$io->seek(0, 0) || die;
+MyParser->new(
+ start_h => [ sub{ shift->eof; }, "self" ],
+ text_h => [ sub{ $text = shift; }, "text" ])->parse_file(*$io);
+ok(!$text);
+
+close($io); # needed because of bug in perl
+undef($io);
+
+unlink($filename) or warn "Can't unlink $filename: $!";
--- /dev/null
+use Test::More tests => 7;
+
+$HTML = <<'HTML';
+
+<!DOCTYPE HTML>
+
+<body>
+
+Various entities. The parser must never break them in the middle:
+
+/
+/
+È
+௖
+
+å-Å
+
+<ul>
+<li><a href="foo 'bar' baz>" id=33>This is a link</a>
+<li><a href='foo "bar" baz> å' id=34>This is another one</a>
+</ul>
+
+<p><div align="center"><img src="http://www.perl.com/perl.gif"
+alt="camel"></div>
+
+<!-- this is
+a comment --> and this is not.
+
+<!-- this is the kind of >comment< -- --> that Netscape hates -->
+
+< this > was not a tag. <this is/not either>
+
+</body>
+
+HTML
+
+#-------------------------------------------------------------------
+
+{
+ package P;
+ require HTML::Parser;
+ @ISA=qw(HTML::Parser);
+ $OUT='';
+ $COUNT=0;
+
+ sub new
+ {
+ my $class = shift;
+ my $self = $class->SUPER::new;
+ $OUT = '';
+ die "Can only have one" if $COUNT++;
+ $self;
+ }
+
+ sub DESTROY
+ {
+ my $self = shift;
+ eval { $self->SUPER::DESTROY; };
+ $COUNT--;
+ }
+
+ sub declaration
+ {
+ my($self, $decl) = @_;
+ $OUT .= "[[$decl]]|";
+ }
+
+ sub start
+ {
+ my($self, $tag, $attr) = @_;
+ $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr);
+ $attr = "/$attr" if length $attr;
+ $OUT .= "<<$tag$attr>>|";
+ }
+
+ sub end
+ {
+ my($self, $tag) = @_;
+ $OUT .= ">>$tag<<|";
+ }
+
+ sub comment
+ {
+ my($self, $comment) = @_;
+ $OUT .= "##$comment##|";
+ }
+
+ sub text
+ {
+ my($self, $text) = @_;
+ #$text =~ s/\n/\\n/g;
+ #$text =~ s/\t/\\t/g;
+ #$text =~ s/ /·/g;
+ $OUT .= "$text|";
+ }
+
+ sub result
+ {
+ $OUT;
+ }
+}
+
+for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") {
+#for $chunksize (1) {
+ if ($chunksize =~ /^file/) {
+ #print "Parsing from $chunksize";
+ } else {
+ #print "Parsing using $chunksize byte chunks";
+ }
+ my $p = P->new;
+
+ if ($chunksize =~ /^file/) {
+ # First we must create the file
+ my $tmpfile = "tmp-$$.html";
+ my $file = $tmpfile;
+ die "$file already exists" if -e $file;
+ open(FILE, ">$file") or die "Can't create $file: $!";
+ binmode FILE;
+ print FILE $HTML;
+ close(FILE);
+
+ if ($chunksize eq "filehandle") {
+ require FileHandle;
+ my $fh = FileHandle->new($file) || die "Can't open $file: $!";
+ $file = $fh;
+ }
+
+ # then we can parse it.
+ $p->parse_file($file);
+ close $file if $chunksize eq "filehandle";
+ unlink($tmpfile) || warn "Can't unlink $tmpfile: $!";
+ } else {
+ my $copy = $HTML;
+ while (length $copy) {
+ my $chunk = substr($copy, 0, $chunksize);
+ substr($copy, 0, $chunksize) = '';
+ $p->parse($chunk);
+ }
+ $p->eof;
+ }
+
+ my $res = $p->result;
+ my $bad;
+
+ # Then we start looking for things that should not happen
+ if ($res =~ /\s\|\s/) {
+ diag "broken space";
+ $bad++;
+ }
+ for (
+ # Make sure entities are not broken
+ '/', '/', 'È', '௖', '', 'å', 'Å',
+
+ # Some elements that should be produced
+ "|[[DOCTYPE HTML]]|",
+ "|## this is\na comment ##|",
+ "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|",
+ '|<<li>>|<<a/href=foo "bar" baz> å/id=34>>',
+ "|>>ul<<|", "|>>body<<|\n\n|",
+ )
+ {
+ if (index($res, $_) < 0) {
+ diag "Can't find '$_' in parsed document";
+ $bad++;
+ }
+ }
+
+ diag $res if $bad || $ENV{PRINT_RESULTS};
+
+ # And we check that we get the same result all the time
+ $res =~ s/\|//g; # remove all break marks
+ if ($last_res && $res ne $last_res) {
+ diag "The result is not the same as last time";
+ $bad++;
+ }
+ $last_res = $res;
+
+ unless ($res =~ /Various entities/) {
+ diag "Some text must be missing";
+ $bad++;
+ }
+
+ ok(!$bad);
+}
--- /dev/null
+use Test::More tests => 2;
+
+use strict;
+use HTML::Parser;
+
+my @a;
+my $p = HTML::Parser->new(api_version => 3);
+$p->handler(default => \@a, '@{event, text, is_cdata}');
+$p->parse(<<EOT)->eof;
+<xmp><foo></xmp>x<plaintext><foo>
+</plaintext>
+foo
+EOT
+
+for (@a) {
+ $_ = "" unless defined;
+}
+
+my $doc = join(":", @a);
+
+#diag $doc;
+
+is($doc, "start_document:::start:<xmp>::text:<foo>:1:end:</xmp>::text:x::start:<plaintext>::text:<foo>
+</plaintext>
+foo
+:1:end_document::");
+
+@a = ();
+$p->closing_plaintext('yep, emulate gecko');
+$p->parse(<<EOT)->eof;
+<plaintext><foo>
+</plaintext>foo<b></b>
+EOT
+
+for (@a) {
+ $_ = "" unless defined;
+}
+
+$doc = join(":", @a);
+
+#diag $doc;
+
+is($doc, "start_document:::start:<plaintext>::text:<foo>
+:1:end:</plaintext>::text:foo::start:<b>::end:</b>::text:
+::end_document::");
--- /dev/null
+use Test::More;
+eval "use Test::Pod 1.00";
+plan skip_all => "Test::Pod 1.00 required for testing POD" if $@;
+all_pod_files_ok();
--- /dev/null
+use strict;
+
+use Test::More tests => 12;
+
+my $pi;
+my $orig;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(process_h => [sub { $pi = shift; $orig = shift; },
+ "token0,text"]
+ );
+
+$p->parse("<a><?foo><a>");
+
+is($pi, "foo");
+is($orig, "<?foo>");
+
+$p->parse("<a><?><a>");
+is($pi, "");
+is($orig, "<?>");
+
+$p->parse("<a><?
+foo
+><a>");
+is($pi, "\nfoo\n");
+is($orig, "<?\nfoo\n>");
+
+for (qw(< a > < ? b a r > < a >)) {
+ $p->parse($_);
+}
+
+is($pi, "bar");
+is($orig, "<?bar>");
+
+$p->xml_mode(1);
+
+$p->parse("<a><?foo>bar??><a>");
+is($pi, "foo>bar?");
+is($orig, "<?foo>bar??>");
+
+$p->parse("<a><??></a>");
+is($pi, "");
+is($orig, "<??>");
--- /dev/null
+use Test::More tests => 3;
+
+use HTML::PullParser;
+
+my $doc = <<'EOT';
+<title>Title</title>
+<style> h1 { background: white }
+<foo>
+</style>
+<H1 ID="3">Heading</H1>
+<!-- ignore this -->
+
+This is a text with a <A HREF="http://www.sol.no" name="l1">link</a>.
+EOT
+
+my $p = HTML::PullParser->new(doc => $doc,
+ start => 'event,tagname,@attr',
+ end => 'event,tagname',
+ text => 'event,dtext',
+
+ ignore_elements => [qw(script style)],
+ unbroken_text => 1,
+ boolean_attribute_value => 1,
+ );
+
+my $t = $p->get_token;
+is($t->[0], "start");
+is($t->[1], "title");
+$p->unget_token($t);
+
+my @a;
+while (my $t = $p->get_token) {
+ for (@$t) {
+ s/\s/./g;
+ }
+ push(@a, join("|", @$t));
+}
+
+my $res = join("\n", @a, "");
+#diag $res;
+is($res, <<'EOT');
+start|title
+text|Title
+end|title
+text|..
+start|h1|id|3
+text|Heading
+end|h1
+text|...This.is.a.text.with.a.
+start|a|href|http://www.sol.no|name|l1
+text|link
+end|a
+text|..
+EOT
+
--- /dev/null
+#!perl -w
+
+use strict;
+use Test;
+plan tests => 1;
+
+use HTML::Parser;
+
+my $TEXT = "";
+sub h
+{
+ my($event, $tagname, $text) = @_;
+ for ($event, $tagname, $text) {
+ if (defined) {
+ s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge;
+ }
+ else {
+ $_ = "<undef>";
+ }
+ }
+
+ $TEXT .= "[$event,$tagname,$text]\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"], empty_element_tags => 1);
+$p->parse(q(<tr><td align="center" height="100"><script src="whatever"/><SCRIPT language="JavaScript1.1">bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');</SCRIPT></td></tr>));
+$p->eof;
+
+ok($TEXT, <<'EOT');
+[start_document,<undef>,]
+[start,tr,<tr>]
+[start,td,<td align="center" height="100">]
+[start,script,<script src="whatever"/>]
+[end,script,]
+[start,script,<SCRIPT language="JavaScript1.1">]
+[text,<undef>,bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');]
+[end,script,</SCRIPT>]
+[end,td,</td>]
+[end,tr,</tr>]
+[end_document,<undef>,]
+EOT
--- /dev/null
+use Test::More tests => 3;
+
+use strict;
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->report_tags("a");
+
+my @doc;
+
+$p->handler(start => \&a_handler, "skipped_text, text");
+$p->handler(end_document => \@doc, '@{skipped_text}');
+
+$p->parse(<<EOT)->eof;
+<title>hi</title>
+<h1><a href="foo">link</a></h1>
+and <a foo="">some</a> text.
+EOT
+
+sub a_handler {
+ push(@doc, shift);
+ my $text = shift;
+ push(@doc, uc($text));
+}
+
+
+is(join("", @doc), <<'EOT');
+<title>hi</title>
+<h1><A HREF="FOO">link</a></h1>
+and <A FOO="">some</a> text.
+EOT
+
+#
+# Comment stripper. Interaction with "" handlers.
+#
+my $doc = <<EOT;
+<html>text</html>
+<!-- comment -->
+and some more <b>text</b>.
+EOT
+(my $expected = $doc) =~ s/<!--.*?-->//;
+
+$p = HTML::Parser->new(api_version => 3);
+$p->handler(comment => "");
+$p->handler(end_document => sub {
+ my $stripped = shift;
+ #diag $stripped;
+ is($stripped, $expected);
+ }, "skipped_text");
+for (split(//, $doc)) {
+ $p->parse($_);
+}
+$p->eof;
+
+#
+# Interaction with unbroken text
+#
+my @x;
+$p = HTML::Parser->new(api_version => 3, unbroken_text => 1);
+$p->handler(text => \@x, '@{"X", skipped_text, text}');
+$p->handler(end => "");
+$p->handler(end_document => \@x, '@{"Y", skipped_text}');
+
+$doc = "a a<a>b b</a>c c<x>d d</x>e";
+
+for (split(//, $doc)) {
+ $p->parse($_);
+}
+$p->eof;
+
+#diag join(":", @x);
+is(join(":", @x), "X::a a:X:<a>:b bc c:X:<x>:d de:Y:");
+
--- /dev/null
+#!perl -w
+
+# HTML-Parser 3.33 and older used to core dump on this program because
+# of missing SPAGAIN calls in parse() XS code. It was not prepared for
+# the stack to get realloced.
+
+$| = 1;
+
+use Test::More tests => 1;
+
+use HTML::Parser;
+my $x = HTML::Parser->new(api_version => 3);
+my @row;
+$x->handler(end => sub { push(@row, (1) x 505); 1 }, "tagname");
+$x->parse("</TD>");
+
+pass;
--- /dev/null
+use Test::More tests => 1;
+
+use strict;
+use HTML::Parser;
+
+my $html = <<'EOT';
+<html>
+<title>This is a <nice> title</title>
+<!--comment-->
+<script language="perl">while (<DATA>) { & }</script>
+
+<FORM>
+
+<textarea name="foo" cols=50 rows=10>
+
+foo
+<foo>
+<!--comment-->
+&
+foo
+</FORM>
+
+</textarea>
+
+</FORM>
+
+</html>
+EOT
+
+my $dump = "";
+sub tdump {
+ my @a = @_;
+ for (@a) {
+ $_ = "<undef>" unless defined;
+ s/\n/\\n/g;
+ }
+ $dump .= join("|", @a) . "\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&tdump, "event,text,dtext,is_cdata"]);
+$p->parse($html)->eof;
+
+#diag $dump;
+
+is($dump, <<'EOT');
+start_document||<undef>|<undef>
+start|<html>|<undef>|<undef>
+text|\n|\n|
+start|<title>|<undef>|<undef>
+text|This is a <nice> title|This is a <nice> title|
+end|</title>|<undef>|<undef>
+text|\n|\n|
+comment|<!--comment-->|<undef>|<undef>
+text|\n|\n|
+start|<script language="perl">|<undef>|<undef>
+text|while (<DATA>) { & }|while (<DATA>) { & }|1
+end|</script>|<undef>|<undef>
+text|\n\n|\n\n|
+start|<FORM>|<undef>|<undef>
+text|\n\n|\n\n|
+start|<textarea name="foo" cols=50 rows=10>|<undef>|<undef>
+text|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n|
+end|</textarea>|<undef>|<undef>
+text|\n\n|\n\n|
+end|</FORM>|<undef>|<undef>
+text|\n\n|\n\n|
+end|</html>|<undef>|<undef>
+text|\n|\n|
+end_document||<undef>|<undef>
+EOT
--- /dev/null
+# Verify thread safety.
+
+use Config;
+use Test::More;
+
+BEGIN {
+ plan(skip_all => "Not configured for threads")
+ unless $Config{useithreads} && $] >= 5.008;
+ plan(tests => 1);
+}
+
+use threads;
+use HTML::Parser;
+
+my $ok=0;
+
+sub start
+{
+ my($tag,$attr)=@_;
+
+ $ok += ($tag eq "foo");
+ $ok += (defined($attr->{param}) && $attr->{param} eq "bar");
+}
+
+my $p = HTML::Parser->new
+ (api_version => 3,
+ handlers => {
+ start => [\&start, "tagname,attr"],
+ });
+
+$p->parse("<foo pa");
+
+$ok=async {
+ $p->parse("ram=bar>");
+ $ok;
+}->join();
+
+is($ok,2);
+
--- /dev/null
+use Test::More tests => 17;
+
+use strict;
+use HTML::TokeParser;
+
+# First we create an HTML document to test
+
+my $file = "ttest$$.htm";
+die "$file already exists" if -e $file;
+
+open(F, ">$file") or die "Can't create $file: $!";
+print F <<'EOT'; close(F);
+
+<!--This is a test-->
+<html><head><title>
+ This is the <title>
+</title>
+
+ <base href="http://www.perl.com">
+</head>
+
+<body background="bg.gif">
+
+ <h1>This is the <b>title</b> again
+ </h1>
+
+ And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl"> <!--nice isn't it-->Institute</a>
+
+ <br/><? process instruction >
+
+</body>
+</html>
+
+EOT
+
+END { unlink($file) || warn "Can't unlink $file: $!"; }
+
+
+my $p;
+
+
+$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
+ok($p->unbroken_text);
+if ($p->get_tag("foo", "title")) {
+ my $title = $p->get_trimmed_text;
+ #diag "Title: $title";
+ is($title, "This is the <title>");
+}
+undef($p);
+
+# Test with reference to glob
+open(F, $file) || die "Can't open $file: $!";
+$p = HTML::TokeParser->new(\*F);
+my $scount = 0;
+my $ecount = 0;
+my $tcount = 0;
+my $pcount = 0;
+while (my $token = $p->get_token) {
+ $scount++ if $token->[0] eq "S";
+ $ecount++ if $token->[0] eq "E";
+ $pcount++ if $token->[0] eq "PI";
+}
+undef($p);
+close F;
+
+# Test with glob
+open(F, $file) || die "Can't open $file: $!";
+$p = HTML::TokeParser->new(*F);
+$tcount++ while $p->get_tag;
+undef($p);
+close F;
+
+# Test with plain file name
+$p = HTML::TokeParser->new($file) || die;
+$tcount++ while $p->get_tag;
+undef($p);
+
+#diag "Number of tokens found: $tcount/2 = $scount + $ecount";
+is($tcount, 34);
+is($scount, 10);
+is($ecount, 7);
+is($pcount, 1);
+is($tcount/2, $scount + $ecount);
+
+ok(!HTML::TokeParser->new("/noT/thEre/$$"));
+
+
+$p = HTML::TokeParser->new($file) || die;
+$p->get_tag("a");
+my $atext = $p->get_text;
+undef($p);
+
+is($atext, "Perl\240Institute");
+
+# test parsing of embeded document
+$p = HTML::TokeParser->new(\<<HTML);
+<title>Title</title>
+<H1>
+Heading
+</h1>
+HTML
+
+ok($p->get_tag("h1"));
+is($p->get_trimmed_text, "Heading");
+undef($p);
+
+# test parsing of large embedded documents
+my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022;
+
+#use Time::HiRes qw(time);
+my $start = time;
+$p = HTML::TokeParser->new(\$doc);
+#diag "Construction time: ", time - $start;
+
+my $count;
+while (my $t = $p->get_token) {
+ $count++ if $t->[0] eq "S";
+}
+#diag "Parse time: ", time - $start;
+
+is($count, 2022);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+<H1>This is a heading</H1>
+This is s<b>o</b>me<hr>text.
+<br />
+This is some more text.
+<p>
+This is even some more.
+EOT
+
+$p->get_tag("/h1");
+
+my $t = $p->get_trimmed_text("br", "p");
+is($t, "This is some text.");
+
+$p->get_tag;
+
+$t = $p->get_trimmed_text("br", "p");
+is($t,"This is some more text.");
+
+undef($p);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+<H1>This is a <b>bold</b> heading</H1>
+This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>.
+<p>
+This is even some more.
+EOT
+
+$p->get_tag("h1");
+
+$t = $p->get_phrase;
+is($t, "This is a bold heading");
+
+$t = $p->get_phrase;
+is($t, "");
+
+$p->get_tag;
+
+$t = $p->get_phrase;
+is($t, "This is some italic text. This is some more text.");
+
+undef($p);
--- /dev/null
+# Test Unicode entities
+
+use HTML::Entities;
+
+use Test::More tests => 27;
+
+SKIP: {
+skip "This perl does not support Unicode or Unicode entities not selected",
+ 27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT;
+
+is(decode_entities("&euro"), "&euro");
+is(decode_entities("€"), "\x{20AC}");
+
+is(decode_entities("å"), "Ã¥");
+is(decode_entities("å"), "Ã¥");
+
+is(decode_entities("񺄠"), chr(500000));
+
+is(decode_entities("􏿽"), "\x{10FFFD}");
+
+is(decode_entities(""), "\x{FFFC}");
+
+
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities(""), "\x{FFFD}");
+is(decode_entities("�"), chr(0xFFFD));
+is(decode_entities("�"), chr(0xFFFD));
+
+is(decode_entities("�"), "\0");
+is(decode_entities("�"), "\0");
+is(decode_entities("�"), "\0");
+is(decode_entities("�"), "\0");
+
+is(decode_entities("&#ååå࿿"), "&#ååå\x{FFF}");
+
+# This might fail when we get more than 64 bit UVs
+is(decode_entities("�"), "�");
+is(decode_entities("�"), "�");
+
+my $err;
+for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) {
+ my $a = join("", map chr, $_->[0] .. $_->[1]);
+
+ my $e = encode_entities($a);
+ my $d = decode_entities($e);
+
+ unless ($d eq $a) {
+ diag "Wrong decoding in range $_->[0] .. $_->[1]";
+ # use Devel::Peek; Dump($a); Dump($d);
+ $err++;
+ }
+}
+ok(!$err);
+
+
+is(decode_entities("��"), chr(0x100085));
+
+is(decode_entities("��"), chr(0x100085));
+
+is(decode_entities("�"), chr(0xFFFD));
+
+is(decode_entities("\260’\260"), "\x{b0}\x{2019}\x{b0}");
+}
--- /dev/null
+use strict;
+use HTML::Parser;
+
+use Test::More tests => 3;
+
+my $text = "";
+sub text
+{
+ my $cdata = shift() ? "CDATA" : "TEXT";
+ my($offset, $line, $col, $t) = @_;
+ $text .= "[$cdata:$offset:$line.$col:$t]";
+}
+
+sub tag
+{
+ $text .= shift;
+}
+
+my $p = HTML::Parser->new(unbroken_text => 1,
+ text_h => [\&text, "is_cdata,offset,line,column,text"],
+ start_h => [\&tag, "text"],
+ end_h => [\&tag, "text"],
+ );
+
+$p->parse("foo ");
+$p->parse("bar ");
+$p->parse("<foo>");
+$p->parse("bar\n");
+$p->parse("</foo>");
+$p->parse("<xmp>xmp</xmp>");
+$p->parse("atend");
+
+#diag $text;
+is($text, "[TEXT:0:1.0:foo bar ]<foo>[TEXT:13:1.13:bar\n]</foo><xmp>[CDATA:28:2.11:xmp]</xmp>");
+
+$text = "";
+$p->eof;
+
+#diag $text;
+is($text, "[TEXT:37:2.20:atend]");
+
+
+$p = HTML::Parser->new(unbroken_text => 1,
+ text_h => [\&text, "is_cdata,offset,line,column,text"],
+ );
+
+$text = "";
+$p->parse("foo");
+$p->parse("<foo");
+$p->parse(">bar\n");
+$p->parse("foo<xm");
+$p->parse("p>xmp");
+$p->parse("</xmp");
+$p->parse(">bar");
+$p->eof;
+
+#diag $text;
+is($text, "[TEXT:0:1.0:foobar\nfoo][CDATA:20:2.8:xmp][TEXT:29:2.17:bar]");
+
+
--- /dev/null
+#!perl -w
+
+use strict;
+use Test::More tests => 2;
+use HTML::Parser;
+
+SKIP: {
+skip "This perl does not support Unicode", 2 if $] < 5.008;
+
+my @parsed;
+my $p = HTML::Parser->new(
+ api_version => 3,
+ start_h => [\@parsed, 'tag, attr'],
+);
+
+my @warn;
+$SIG{__WARN__} = sub {
+ push(@warn, $_[0]);
+};
+
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>");
+$p->eof;
+
+#use Encode;
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>" . chr(0x263A));
+$p->eof;
+
+$p->parse("\xFF\xFE<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\xFE\xFF<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\0\0\xFF\xFE<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\xFE\xFF\0\0<head>Hi there</head>");
+$p->eof;
+
+is(join("", @warn), <<EOT);
+Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line 21.
+Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line 25.
+Parsing of undecoded UTF-16 at $0 line 28.
+Parsing of undecoded UTF-16 at $0 line 31.
+Parsing of undecoded UTF-32 at $0 line 34.
+Parsing of undecoded UTF-32 at $0 line 37.
+EOT
+
+@warn = ();
+
+$p = HTML::Parser->new(
+ api_version => 3,
+ start_h => [\@parsed, 'tag'],
+);
+
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>");
+$p->eof;
+ok(!@warn);
+}
--- /dev/null
+#!perl -w
+
+use strict;
+use HTML::Parser;
+use Test::More tests => 103;
+
+SKIP: {
+skip "This perl does not support Unicode", 103 if $] < 5.008;
+
+my @warn;
+$SIG{__WARN__} = sub {
+ push(@warn, $_[0]);
+};
+
+my @parsed;
+my $p = HTML::Parser->new(
+ api_version => 3,
+ default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'],
+);
+
+my $doc = "<title>\x{263A}</title><h1 id=\x{2600} f>Smile ☺</h1>\x{0420}";
+is(length($doc), 46);
+
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "<title>");
+SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") };
+is($parsed[1][3], 0);
+is($parsed[1][4], 7);
+
+is($parsed[2][0], "text");
+is(ord($parsed[2][1]), 0x263A);
+is($parsed[2][2], chr(0x263A));
+is($parsed[2][3], 7);
+is($parsed[2][4], 1);
+is($parsed[2][5], 8);
+is($parsed[2][6], 7);
+
+is($parsed[3][0], "end");
+is($parsed[3][1], "</title>");
+is($parsed[3][3], 8);
+is($parsed[3][6], 8);
+
+is($parsed[4][0], "start");
+is($parsed[4][1], "<h1 id=\x{2600} f>");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0");
+is($parsed[4][8]{id}, "\x{2600}");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile ☺");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[7][0], "text");
+is($parsed[7][1], "\x{0420}");
+is($parsed[7][2], "\x{0420}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+is(@warn, 0);
+
+# Try to parse it as an UTF8 encoded string
+utf8::encode($doc);
+is(length($doc), 51);
+
+@parsed = ();
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "<title>");
+SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") };
+is($parsed[1][3], 0);
+is($parsed[1][4], 7);
+
+is($parsed[2][0], "text");
+is(ord($parsed[2][1]), 226);
+is($parsed[2][1], "\xE2\x98\xBA");
+is($parsed[2][2], "\xE2\x98\xBA");
+is($parsed[2][3], 7);
+is($parsed[2][4], 3);
+is($parsed[2][5], 10);
+is($parsed[2][6], 7);
+
+is($parsed[3][0], "end");
+is($parsed[3][1], "</title>");
+is($parsed[3][3], 10);
+is($parsed[3][6], 10);
+
+is($parsed[4][0], "start");
+is($parsed[4][1], "<h1 id=\xE2\x98\x80 f>");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0");
+is($parsed[4][8]{id}, "\xE2\x98\x80");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile ☺");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+my $file = "test-$$.html";
+open(my $fh, ">:utf8", $file) || die;
+print $fh <<EOT;
+\x{FEFF}
+<title>\x{263A} Love! </title>
+<h1 id=♥\x{2665}>♥ Love \x{2665}<h1>
+EOT
+close($fh) || die;
+
+@warn = ();
+@parsed = ();
+$p->parse_file($file);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "♥ Love \xE2\x99\xA5");
+is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage
+is($parsed[10][3], -s $file);
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw:utf8", $file) || die;
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\x{2665}");
+is($parsed[7][0], "text");
+is($parsed[7][1], "♥ Love \x{2665}");
+is($parsed[7][2], "\x{2665} Love \x{2665}");
+is($parsed[10][3], (-s $file) - 2 * 4);
+is(@warn, 0);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw", $file) || die;
+$p->utf8_mode(1);
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "♥ Love \xE2\x99\xA5");
+is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5");
+is($parsed[10][3], -s $file);
+is(@warn, 0);
+
+unlink($file);
+
+@parsed = ();
+$p->parse(q(<a href="a=1&lang=2×=3">foo</a>))->eof;
+is(@parsed, "5");
+is($parsed[1][0], "start");
+is($parsed[1][8]{href}, "a=1&lang=2\xd7=3");
+
+ok(!HTML::Entities::_probably_utf8_chunk(""));
+ok(!HTML::Entities::_probably_utf8_chunk("f"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99"));
+}
--- /dev/null
+use strict;
+use Test::More tests => 8;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(xml_mode => 1,
+ );
+
+my $text = "";
+$p->handler(start =>
+ sub {
+ my($tag, $attr) = @_;
+ $text .= "S[$tag";
+ for my $k (sort keys %$attr) {
+ my $v = $attr->{$k};
+ $text .= " $k=$v";
+ }
+ $text .= "]";
+ }, "tagname,attr");
+$p->handler(end =>
+ sub {
+ $text .= "E[" . shift() . "]";
+ }, "tagname");
+$p->handler(process =>
+ sub {
+ $text .= "PI[" . shift() . "]";
+ }, "token0");
+$p->handler(text =>
+ sub {
+ $text .= shift;
+ }, "text");
+
+my $xml = <<'EOT';
+<?xml version="1.0"?>
+<?IS10744:arch name="html"?><!-- comment -->
+<DOC>
+<title html="h1">My first architectual document</title>
+<author html="address">Geir Ove Gronmo, grove@infotek.no</author>
+<para>This is the first paragraph in this document</para>
+<para html="p">This is the second paragraph</para>
+<para/>
+<xmp><foo></foo></xmp>
+</DOC>
+EOT
+
+$p->parse($xml)->eof;
+
+is($text, <<'EOT');
+PI[xml version="1.0"]
+PI[IS10744:arch name="html"]
+S[DOC]
+S[title html=h1]My first architectual documentE[title]
+S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
+S[para]This is the first paragraph in this documentE[para]
+S[para html=p]This is the second paragraphE[para]
+S[para]E[para]
+S[xmp]S[foo]E[foo]E[xmp]
+E[DOC]
+EOT
+
+$text = "";
+$p->xml_mode(0);
+$p->parse($xml)->eof;
+
+is($text, <<'EOT');
+PI[xml version="1.0"?]
+PI[IS10744:arch name="html"?]
+S[doc]
+S[title html=h1]My first architectual documentE[title]
+S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
+S[para]This is the first paragraph in this documentE[para]
+S[para html=p]This is the second paragraphE[para]
+S[para/]
+S[xmp]<foo></foo>E[xmp]
+E[doc]
+EOT
+
+# Test that we get an empty tag back
+$p = HTML::Parser->new(api_version => 3,
+ xml_mode => 1);
+
+$p->handler("end" =>
+ sub {
+ my($tagname, $text) = @_;
+ is($tagname, "Xyzzy");
+ ok(!length($text));
+ }, "tagname,text");
+$p->parse("<Xyzzy foo=bar/>and some more")->eof;
+
+# Test that we get an empty tag back
+$p = HTML::Parser->new(api_version => 3,
+ empty_element_tags => 1);
+
+$p->handler("end" =>
+ sub {
+ my($tagname, $text) = @_;
+ is($tagname, "xyzzy");
+ ok(!length($text));
+ }, "tagname,text");
+$p->parse("<Xyzzy foo=bar/>and some more")->eof;
+
+$p = HTML::Parser->new(
+ api_version => 3,
+ xml_pic => 1,
+);
+
+$p->handler(
+ "process" => sub {
+ my($text, $t0) = @_;
+ is($text, "<?foo > bar?>");
+ is($t0, "foo > bar");
+ }, "text, token0");
+$p->parse("<?foo > bar?> and then")->eof;
--- /dev/null
+struct token_pos
+{
+ char *beg;
+ char *end;
+};
+typedef struct token_pos token_pos_t;
+
+#define dTOKENS(init_lim) \
+ token_pos_t token_buf[init_lim]; \
+ int token_lim = init_lim; \
+ token_pos_t *tokens = token_buf; \
+ int num_tokens = 0
+
+#define PUSH_TOKEN(p_beg, p_end) \
+ STMT_START { \
+ ++num_tokens; \
+ if (num_tokens == token_lim) \
+ tokens_grow(&tokens, &token_lim, (bool)(tokens != token_buf)); \
+ tokens[num_tokens-1].beg = p_beg; \
+ tokens[num_tokens-1].end = p_end; \
+ } STMT_END
+
+#define FREE_TOKENS \
+ STMT_START { \
+ if (tokens != token_buf) \
+ Safefree(tokens); \
+ } STMT_END
+
+static void
+tokens_grow(token_pos_t **token_ptr, int *token_lim_ptr, bool tokens_on_heap)
+{
+ int new_lim = *token_lim_ptr;
+ if (new_lim < 4)
+ new_lim = 4;
+ new_lim *= 2;
+
+ if (tokens_on_heap) {
+ Renew(*token_ptr, new_lim, token_pos_t);
+ }
+ else {
+ token_pos_t *new_tokens;
+ int i;
+ New(57, new_tokens, new_lim, token_pos_t);
+ for (i = 0; i < *token_lim_ptr; i++)
+ new_tokens[i] = (*token_ptr)[i];
+ *token_ptr = new_tokens;
+ }
+ *token_lim_ptr = new_lim;
+}
--- /dev/null
+PSTATE* T_PSTATE
+
+INPUT
+T_PSTATE
+ $var = get_pstate_hv(aTHX_ $arg)
--- /dev/null
+/* $Id: util.c,v 2.30 2006/03/22 09:15:17 gisle Exp $
+ *
+ * Copyright 1999-2006, Gisle Aas.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the same terms as Perl itself.
+ */
+
+#ifndef EXTERN
+#define EXTERN extern
+#endif
+
+
+EXTERN SV*
+sv_lower(pTHX_ SV* sv)
+{
+ STRLEN len;
+ char *s = SvPV_force(sv, len);
+ for (; len--; s++)
+ *s = toLOWER(*s);
+ return sv;
+}
+
+EXTERN int
+strnEQx(const char* s1, const char* s2, STRLEN n, int ignore_case)
+{
+ while (n--) {
+ if (ignore_case) {
+ if (toLOWER(*s1) != toLOWER(*s2))
+ return 0;
+ }
+ else {
+ if (*s1 != *s2)
+ return 0;
+ }
+ s1++;
+ s2++;
+ }
+ return 1;
+}
+
+static void
+grow_gap(pTHX_ SV* sv, STRLEN grow, char** t, char** s, char** e)
+{
+ /*
+ SvPVX ---> AAAAAA...BBBBBB
+ ^ ^ ^
+ t s e
+ */
+ STRLEN t_offset = *t - SvPVX(sv);
+ STRLEN s_offset = *s - SvPVX(sv);
+ STRLEN e_offset = *e - SvPVX(sv);
+
+ SvGROW(sv, e_offset + grow + 1);
+
+ *t = SvPVX(sv) + t_offset;
+ *s = SvPVX(sv) + s_offset;
+ *e = SvPVX(sv) + e_offset;
+
+ Move(*s, *s+grow, *e - *s, char);
+ *s += grow;
+ *e += grow;
+}
+
+EXTERN SV*
+decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix)
+{
+ STRLEN len;
+ char *s = SvPV_force(sv, len);
+ char *t = s;
+ char *end = s + len;
+ char *ent_start;
+
+ char *repl;
+ STRLEN repl_len;
+#ifdef UNICODE_HTML_PARSER
+ char buf[UTF8_MAXLEN];
+ int repl_utf8;
+ int high_surrogate = 0;
+#else
+ char buf[1];
+#endif
+
+#if defined(__GNUC__) && defined(UNICODE_HTML_PARSER)
+ /* gcc -Wall reports this variable as possibly used uninitialized */
+ repl_utf8 = 0;
+#endif
+
+ while (s < end) {
+ assert(t <= s);
+
+ if ((*t++ = *s++) != '&')
+ continue;
+
+ ent_start = s;
+ repl = 0;
+
+ if (*s == '#') {
+ UV num = 0;
+ UV prev = 0;
+ int ok = 0;
+ s++;
+ if (*s == 'x' || *s == 'X') {
+ s++;
+ while (*s) {
+ char *tmp = strchr(PL_hexdigit, *s);
+ if (!tmp)
+ break;
+ num = num << 4 | ((tmp - PL_hexdigit) & 15);
+ if (prev && num <= prev) {
+ /* overflow */
+ ok = 0;
+ break;
+ }
+ prev = num;
+ s++;
+ ok = 1;
+ }
+ }
+ else {
+ while (isDIGIT(*s)) {
+ num = num * 10 + (*s - '0');
+ if (prev && num < prev) {
+ /* overflow */
+ ok = 0;
+ break;
+ }
+ prev = num;
+ s++;
+ ok = 1;
+ }
+ }
+ if (ok) {
+#ifdef UNICODE_HTML_PARSER
+ if (!SvUTF8(sv) && num <= 255) {
+ buf[0] = (char) num;
+ repl = buf;
+ repl_len = 1;
+ repl_utf8 = 0;
+ }
+ else {
+ char *tmp;
+ if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */
+ if (high_surrogate != 0) {
+ t -= 3; /* Back up past 0xFFFD */
+ num = ((high_surrogate - 0xD800) << 10) +
+ (num - 0xDC00) + 0x10000;
+ high_surrogate = 0;
+ } else {
+ num = 0xFFFD;
+ }
+ }
+ else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */
+ high_surrogate = num;
+ num = 0xFFFD;
+ }
+ else {
+ high_surrogate = 0;
+ /* otherwise invalid? */
+ if ((num >= 0xFDD0 && num <= 0xFDEF) ||
+ ((num & 0xFFFE) == 0xFFFE) ||
+ num > 0x10FFFF)
+ {
+ num = 0xFFFD;
+ }
+ }
+
+ tmp = (char*)uvuni_to_utf8((U8*)buf, num);
+ repl = buf;
+ repl_len = tmp - buf;
+ repl_utf8 = 1;
+ }
+#else
+ if (num <= 255) {
+ buf[0] = (char) num & 0xFF;
+ repl = buf;
+ repl_len = 1;
+ }
+#endif
+ }
+ }
+ else {
+ char *ent_name = s;
+ while (isALNUM(*s))
+ s++;
+ if (ent_name != s && entity2char) {
+ SV** svp;
+ if ( (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) ||
+ (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0)))
+ )
+ {
+ repl = SvPV(*svp, repl_len);
+#ifdef UNICODE_HTML_PARSER
+ repl_utf8 = SvUTF8(*svp);
+#endif
+ }
+ else if (expand_prefix) {
+ char *ss = s - 1;
+ while (ss > ent_name) {
+ svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0);
+ if (svp) {
+ repl = SvPV(*svp, repl_len);
+#ifdef UNICODE_HTML_PARSER
+ repl_utf8 = SvUTF8(*svp);
+#endif
+ s = ss;
+ break;
+ }
+ ss--;
+ }
+ }
+ }
+#ifdef UNICODE_HTML_PARSER
+ high_surrogate = 0;
+#endif
+ }
+
+ if (repl) {
+ char *repl_allocated = 0;
+ if (*s == ';')
+ s++;
+ t--; /* '&' already copied, undo it */
+
+#ifdef UNICODE_HTML_PARSER
+ if (*s != '&') {
+ high_surrogate = 0;
+ }
+
+ if (!SvUTF8(sv) && repl_utf8) {
+ /* need to upgrade sv before we continue */
+ STRLEN before_gap_len = t - SvPVX(sv);
+ char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len);
+ STRLEN after_gap_len = end - s;
+ char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len);
+
+ sv_setpvn(sv, before_gap, before_gap_len);
+ sv_catpvn(sv, after_gap, after_gap_len);
+ SvUTF8_on(sv);
+
+ Safefree(before_gap);
+ Safefree(after_gap);
+
+ s = t = SvPVX(sv) + before_gap_len;
+ end = SvPVX(sv) + before_gap_len + after_gap_len;
+ }
+ else if (SvUTF8(sv) && !repl_utf8) {
+ repl = (char*)bytes_to_utf8((U8*)repl, &repl_len);
+ repl_allocated = repl;
+ }
+#endif
+
+ if (t + repl_len > s) {
+ /* need to grow the string */
+ grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end);
+ }
+
+ /* copy replacement string into string */
+ while (repl_len--)
+ *t++ = *repl++;
+
+ if (repl_allocated)
+ Safefree(repl_allocated);
+ }
+ else {
+ while (ent_start < s)
+ *t++ = *ent_start++;
+ }
+ }
+
+ *t = '\0';
+ SvCUR_set(sv, t - SvPVX(sv));
+
+ return sv;
+}
+
+#ifdef UNICODE_HTML_PARSER
+static bool
+has_hibit(char *s, char *e)
+{
+ while (s < e) {
+ U8 ch = *s++;
+ if (!UTF8_IS_INVARIANT(ch)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+
+EXTERN bool
+probably_utf8_chunk(pTHX_ char *s, STRLEN len)
+{
+ char *e = s + len;
+ STRLEN clen;
+
+ /* ignore partial utf8 char at end of buffer */
+ while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1)))
+ e--;
+ if (s < e && UTF8_IS_START((U8)*(e - 1)))
+ e--;
+ clen = len - (e - s);
+ if (clen && UTF8SKIP(e) == clen) {
+ /* all promised continuation bytes are present */
+ e = s + len;
+ }
+
+ if (!has_hibit(s, e))
+ return 0;
+
+ return is_utf8_string((U8*)s, e - s);
+}
+#endif
--- /dev/null
+package HTML::Tagset;
+
+use strict;
+
+=head1 NAME
+
+HTML::Tagset - data tables useful in parsing HTML
+
+=head1 VERSION
+
+Version 3.20
+
+=cut
+
+use vars qw( $VERSION );
+
+$VERSION = '3.20';
+
+=head1 SYNOPSIS
+
+ use HTML::Tagset;
+ # Then use any of the items in the HTML::Tagset package
+ # as need arises
+
+=head1 DESCRIPTION
+
+This module contains several data tables useful in various kinds of
+HTML parsing operations.
+
+Note that all tag names used are lowercase.
+
+In the following documentation, a "hashset" is a hash being used as a
+set -- the hash conveys that its keys are there, and the actual values
+associated with the keys are not significant. (But what values are
+there, are always true.)
+
+=cut
+
+use vars qw(
+ $VERSION
+ %emptyElement %optionalEndTag %linkElements %boolean_attr
+ %isHeadElement %isBodyElement %isPhraseMarkup
+ %is_Possible_Strict_P_Content
+ %isHeadOrBodyElement
+ %isList %isTableElement %isFormElement
+ %isKnown %canTighten
+ @p_closure_barriers
+ %isCDATA_Parent
+);
+
+=head1 VARIABLES
+
+Note that none of these variables are exported.
+
+=head2 hashset %HTML::Tagset::emptyElement
+
+This hashset has as values the tag-names (GIs) of elements that cannot
+have content. (For example, "base", "br", "hr".) So
+C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
+C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
+
+=cut
+
+%emptyElement = map {; $_ => 1 } qw(base link meta isindex
+ img br hr wbr
+ input area param
+ embed bgsound spacer
+ basefont col frame
+ ~comment ~literal
+ ~declaration ~pi
+ );
+# The "~"-initial names are for pseudo-elements used by HTML::Entities
+# and TreeBuilder
+
+=head2 hashset %HTML::Tagset::optionalEndTag
+
+This hashset lists tag-names for elements that can have content, but whose
+end-tags are generally, "safely", omissible. Example:
+C<$HTML::Tagset::emptyElement{'li'}> exists and is true.
+
+=cut
+
+%optionalEndTag = map {; $_ => 1 } qw(p li dt dd); # option th tr td);
+
+=head2 hash %HTML::Tagset::linkElements
+
+Values in this hash are tagnames for elements that might contain
+links, and the value for each is a reference to an array of the names
+of attributes whose values can be links.
+
+=cut
+
+%linkElements =
+(
+ 'a' => ['href'],
+ 'applet' => ['archive', 'codebase', 'code'],
+ 'area' => ['href'],
+ 'base' => ['href'],
+ 'bgsound' => ['src'],
+ 'blockquote' => ['cite'],
+ 'body' => ['background'],
+ 'del' => ['cite'],
+ 'embed' => ['pluginspage', 'src'],
+ 'form' => ['action'],
+ 'frame' => ['src', 'longdesc'],
+ 'iframe' => ['src', 'longdesc'],
+ 'ilayer' => ['background'],
+ 'img' => ['src', 'lowsrc', 'longdesc', 'usemap'],
+ 'input' => ['src', 'usemap'],
+ 'ins' => ['cite'],
+ 'isindex' => ['action'],
+ 'head' => ['profile'],
+ 'layer' => ['background', 'src'],
+ 'link' => ['href'],
+ 'object' => ['classid', 'codebase', 'data', 'archive', 'usemap'],
+ 'q' => ['cite'],
+ 'script' => ['src', 'for'],
+ 'table' => ['background'],
+ 'td' => ['background'],
+ 'th' => ['background'],
+ 'tr' => ['background'],
+ 'xmp' => ['href'],
+);
+
+=head2 hash %HTML::Tagset::boolean_attr
+
+This hash (not hashset) lists what attributes of what elements can be
+printed without showing the value (for example, the "noshade" attribute
+of "hr" elements). For elements with only one such attribute, its value
+is simply that attribute name. For elements with many such attributes,
+the value is a reference to a hashset containing all such attributes.
+
+=cut
+
+%boolean_attr = (
+# TODO: make these all hashes
+ 'area' => 'nohref',
+ 'dir' => 'compact',
+ 'dl' => 'compact',
+ 'hr' => 'noshade',
+ 'img' => 'ismap',
+ 'input' => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 },
+ 'menu' => 'compact',
+ 'ol' => 'compact',
+ 'option' => 'selected',
+ 'select' => 'multiple',
+ 'td' => 'nowrap',
+ 'th' => 'nowrap',
+ 'ul' => 'compact',
+);
+
+#==========================================================================
+# List of all elements from Extensible HTML version 1.0 Transitional DTD:
+#
+# a abbr acronym address applet area b base basefont bdo big
+# blockquote body br button caption center cite code col colgroup
+# dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
+# head hr html i iframe img input ins isindex kbd label legend li
+# link map menu meta noframes noscript object ol optgroup option p
+# param pre q s samp script select small span strike strong style
+# sub sup table tbody td textarea tfoot th thead title tr tt u ul
+# var
+#
+# Varia from Mozilla source internal table of tags:
+# Implemented:
+# xmp listing wbr nobr frame frameset noframes ilayer
+# layer nolayer spacer embed multicol
+# But these are unimplemented:
+# sound?? keygen?? server??
+# Also seen here and there:
+# marquee?? app?? (both unimplemented)
+#==========================================================================
+
+=head2 hashset %HTML::Tagset::isPhraseMarkup
+
+This hashset contains all phrasal-level elements.
+
+=cut
+
+%isPhraseMarkup = map {; $_ => 1 } qw(
+ span abbr acronym q sub sup
+ cite code em kbd samp strong var dfn strike
+ b i u s tt small big
+ a img br
+ wbr nobr blink
+ font basefont bdo
+ spacer embed noembed
+); # had: center, hr, table
+
+
+=head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content
+
+This hashset contains all phrasal-level elements that be content of a
+P element, for a strict model of HTML.
+
+=cut
+
+%is_Possible_Strict_P_Content = (
+ %isPhraseMarkup,
+ %isFormElement,
+ map {; $_ => 1} qw( object script map )
+ # I've no idea why there's these latter exceptions.
+ # I'm just following the HTML4.01 DTD.
+);
+
+#from html4 strict:
+#<!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
+#
+#<!ENTITY % phrase "EM | STRONG | DFN | CODE |
+# SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
+#
+#<!ENTITY % special
+# "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
+#
+#<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
+#
+#<!-- %inline; covers inline or "text-level" elements -->
+#<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
+
+=head2 hashset %HTML::Tagset::isHeadElement
+
+This hashset contains all elements that elements that should be
+present only in the 'head' element of an HTML document.
+
+=cut
+
+%isHeadElement = map {; $_ => 1 }
+ qw(title base link meta isindex script style object bgsound);
+
+=head2 hashset %HTML::Tagset::isList
+
+This hashset contains all elements that can contain "li" elements.
+
+=cut
+
+%isList = map {; $_ => 1 } qw(ul ol dir menu);
+
+=head2 hashset %HTML::Tagset::isTableElement
+
+This hashset contains all elements that are to be found only in/under
+a "table" element.
+
+=cut
+
+%isTableElement = map {; $_ => 1 }
+ qw(tr td th thead tbody tfoot caption col colgroup);
+
+=head2 hashset %HTML::Tagset::isFormElement
+
+This hashset contains all elements that are to be found only in/under
+a "form" element.
+
+=cut
+
+%isFormElement = map {; $_ => 1 }
+ qw(input select option optgroup textarea button label);
+
+=head2 hashset %HTML::Tagset::isBodyMarkup
+
+This hashset contains all elements that are to be found only in/under
+the "body" element of an HTML document.
+
+=cut
+
+%isBodyElement = map {; $_ => 1 } qw(
+ h1 h2 h3 h4 h5 h6
+ p div pre plaintext address blockquote
+ xmp listing
+ center
+
+ multicol
+ iframe ilayer nolayer
+ bgsound
+
+ hr
+ ol ul dir menu li
+ dl dt dd
+ ins del
+
+ fieldset legend
+
+ map area
+ applet param object
+ isindex script noscript
+ table
+ center
+ form
+ ),
+ keys %isFormElement,
+ keys %isPhraseMarkup, # And everything phrasal
+ keys %isTableElement,
+;
+
+
+=head2 hashset %HTML::Tagset::isHeadOrBodyElement
+
+This hashset includes all elements that I notice can fall either in
+the head or in the body.
+
+=cut
+
+%isHeadOrBodyElement = map {; $_ => 1 }
+ qw(script isindex style object map area param noscript bgsound);
+ # i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
+
+
+=head2 hashset %HTML::Tagset::isKnown
+
+This hashset lists all known HTML elements.
+
+=cut
+
+%isKnown = (%isHeadElement, %isBodyElement,
+ map{; $_=>1 }
+ qw( head body html
+ frame frameset noframes
+ ~comment ~pi ~directive ~literal
+));
+ # that should be all known tags ever ever
+
+
+=head2 hashset %HTML::Tagset::canTighten
+
+This hashset lists elements that might have ignorable whitespace as
+children or siblings.
+
+=cut
+
+%canTighten = %isKnown;
+delete @canTighten{
+ keys(%isPhraseMarkup), 'input', 'select',
+ 'xmp', 'listing', 'plaintext', 'pre',
+};
+ # xmp, listing, plaintext, and pre are untightenable, and
+ # in a really special way.
+@canTighten{'hr','br'} = (1,1);
+ # exceptional 'phrasal' things that ARE subject to tightening.
+
+# The one case where I can think of my tightening rules failing is:
+# <p>foo bar<center> <em>baz quux</em> ...
+# ^-- that would get deleted.
+# But that's pretty gruesome code anyhow. You gets what you pays for.
+
+#==========================================================================
+
+=head2 array @HTML::Tagset::p_closure_barriers
+
+This array has a meaning that I have only seen a need for in
+C<HTML::TreeBuilder>, but I include it here on the off chance that someone
+might find it of use:
+
+When we see a "E<lt>pE<gt>" token, we go lookup up the lineage for a p
+element we might have to minimize. At first sight, we might say that
+if there's a p anywhere in the lineage of this new p, it should be
+closed. But that's wrong. Consider this document:
+
+ <html>
+ <head>
+ <title>foo</title>
+ </head>
+ <body>
+ <p>foo
+ <table>
+ <tr>
+ <td>
+ foo
+ <p>bar
+ </td>
+ </tr>
+ </table>
+ </p>
+ </body>
+ </html>
+
+The second p is quite legally inside a much higher p.
+
+My formalization of the reason why this is legal, but this:
+
+ <p>foo<p>bar</p></p>
+
+isn't, is that something about the table constitutes a "barrier" to
+the application of the rule about what p must minimize.
+
+So C<@HTML::Tagset::p_closure_barriers> is the list of all such
+barrier-tags.
+
+=cut
+
+@p_closure_barriers = qw(
+ li blockquote
+ ul ol menu dir
+ dl dt dd
+ td th tr table caption
+ div
+ );
+
+# In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this
+# monkey business of barriers to minimization!
+
+=head2 hashset %isCDATA_Parent
+
+This hashset includes all elements whose content is CDATA.
+
+=cut
+
+%isCDATA_Parent = map {; $_ => 1 }
+ qw(script style xmp listing plaintext);
+
+# TODO: there's nothing else that takes CDATA children, right?
+
+# As the HTML3 DTD (Raggett 1995-04-24) noted:
+# The XMP, LISTING and PLAINTEXT tags are incompatible with SGML
+# and derive from very early versions of HTML. They require non-
+# standard parsers and will cause problems for processing
+# documents with standard SGML tools.
+
+
+=head1 CAVEATS
+
+You may find it useful to alter the behavior of modules (like
+C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s
+data tables by altering the data tables themselves. You are welcome
+to try, but be careful; and be aware that different modules may or may
+react differently to the data tables being changed.
+
+Note that it may be inappropriate to use these tables for I<producing>
+HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames
+for all elements that can appear either in the head or in the body,
+such as "script". That doesn't mean that I am saying your code that
+produces HTML should feel free to put script elements in either place!
+If you are producing programs that spit out HTML, you should be
+I<intimately> familiar with the DTDs for HTML or XHTML (available at
+C<http://www.w3.org/>), and you should slavishly obey them, not
+the data tables in this document.
+
+=head1 SEE ALSO
+
+L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor>
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 1995-2000 Gisle Aas.
+
+Copyright 2000-2005 Sean M. Burke.
+
+Copyright 2005-2008 Andy Lester.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=head1 ACKNOWLEDGEMENTS
+
+Most of the code/data in this module was adapted from code written
+by Gisle Aas for C<HTML::Element>, C<HTML::TreeBuilder>, and
+C<HTML::LinkExtor>. Then it was maintained by Sean M. Burke.
+
+=head1 AUTHOR
+
+Current maintainer: Andy Lester, C<< <andy at petdance.com> >>
+
+=head1 BUGS
+
+Please report any bugs or feature requests to
+C<bug-html-tagset at rt.cpan.org>, or through the web interface at
+L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Tagset>. I will
+be notified, and then you'll automatically be notified of progress on
+your bug as I make changes.
+
+=cut
+
+1;
--- /dev/null
+
+require 5;
+# Time-stamp: "2004-12-29 20:55:15 AST"
+# Summary of, well, things.
+
+use Test;
+BEGIN {plan tests => 2};
+ok 1;
+
+use HTML::Tagset ();
+
+#chdir "t" if -e "t";
+
+{
+ my @out;
+ push @out,
+ "\n\nPerl v",
+ defined($^V) ? sprintf('%vd', $^V) : $],
+ " under $^O ",
+ (defined(&Win32::BuildNumber) and defined &Win32::BuildNumber())
+ ? ("(Win32::BuildNumber ", &Win32::BuildNumber(), ")") : (),
+ (defined $MacPerl::Version)
+ ? ("(MacPerl version $MacPerl::Version)") : (),
+ "\n"
+ ;
+
+ # Ugly code to walk the symbol tables:
+ my %v;
+ my @stack = (''); # start out in %::
+ my $this;
+ my $count = 0;
+ my $pref;
+ while(@stack) {
+ $this = shift @stack;
+ die "Too many packages?" if ++$count > 1000;
+ next if exists $v{$this};
+ next if $this eq 'main'; # %main:: is %::
+
+ #print "Peeking at $this => ${$this . '::VERSION'}\n";
+
+ if(defined ${$this . '::VERSION'} ) {
+ $v{$this} = ${$this . '::VERSION'}
+ } elsif(
+ defined *{$this . '::ISA'} or defined &{$this . '::import'}
+ or ($this ne '' and grep defined *{$_}{'CODE'}, values %{$this . "::"})
+ # If it has an ISA, an import, or any subs...
+ ) {
+ # It's a class/module with no version.
+ $v{$this} = undef;
+ } else {
+ # It's probably an unpopulated package.
+ ## $v{$this} = '...';
+ }
+
+ $pref = length($this) ? "$this\::" : '';
+ push @stack, map m/^(.+)::$/ ? "$pref$1" : (), keys %{$this . '::'};
+ #print "Stack: @stack\n";
+ }
+ push @out, " Modules in memory:\n";
+ delete @v{'', '[none]'};
+ foreach my $p (sort {lc($a) cmp lc($b)} keys %v) {
+ $indent = ' ' x (2 + ($p =~ tr/:/:/));
+ push @out, ' ', $indent, $p, defined($v{$p}) ? " v$v{$p};\n" : ";\n";
+ }
+ push @out, sprintf "[at %s (local) / %s (GMT)]\n",
+ scalar(gmtime), scalar(localtime);
+ my $x = join '', @out;
+ $x =~ s/^/#/mg;
+ print $x;
+}
+
+print "# Running",
+ (chr(65) eq 'A') ? " in an ASCII world.\n" : " in a non-ASCII world.\n",
+ "#\n",
+;
+
+print "# \@INC:\n", map("# [$_]\n", @INC), "#\n#\n";
+
+print "# \%INC:\n";
+foreach my $x (sort {lc($a) cmp lc($b)} keys %INC) {
+ print "# [$x] = [", $INC{$x} || '', "]\n";
+}
+
+ok 1;
+
--- /dev/null
+
+# Time-stamp: "2004-12-29 18:49:45 AST"
+
+BEGIN { $| = 1; print "1..1\n"; }
+END {print "not ok 1\n" unless $loaded;}
+use HTML::Tagset;
+$loaded = 1;
+print "ok 1\n";