From: Steve Peters Date: Thu, 5 Jun 2008 13:04:00 +0000 (+0000) Subject: A quick new release of Pod-Simple has removed the dependency on X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=8ba733f51b2432eafd952574f404d7f464786ad5;p=p5sagit%2Fp5-mst-13.2.git A quick new release of Pod-Simple has removed the dependency on HTML-Parser. HTML-Parser and HTML-Tagset will now be leaving us. p4raw-id: //depot/perl@34001 --- diff --git a/MANIFEST b/MANIFEST index 51c1d3d..7889c28 100644 --- a/MANIFEST +++ b/MANIFEST @@ -693,71 +693,6 @@ ext/Hash/Util/lib/Hash/Util.pm Hash::Util ext/Hash/Util/Makefile.PL Makefile for Hash::Util ext/Hash/Util/t/Util.t See if Hash::Util works ext/Hash/Util/Util.xs XS bits of Hash::Util -ext/HTML/Parser/hints/solaris.pl files for HTML::Parser -ext/HTML/Parser/hparser.c files for HTML::Parser -ext/HTML/Parser/hparser.h files for HTML::Parser -ext/HTML/Parser/lib/HTML/Entities.pm file for HTML::Entities -ext/HTML/Parser/lib/HTML/Filter.pm file for HTML::Filter -ext/HTML/Parser/lib/HTML/HeadParser.pm file for HTML::HeadParser -ext/HTML/Parser/lib/HTML/LinkExtor.pm file for HTML::LinkExtor -ext/HTML/Parser/lib/HTML/PullParser.pm file for HTML::PullParser -ext/HTML/Parser/lib/HTML/TokeParser.pm file for HTML::TokeParser -ext/HTML/Parser/Makefile.PL files for HTML::Parser -ext/HTML/Parser/mkhctype files for HTML::Parser -ext/HTML/Parser/mkpfunc files for HTML::Parser -ext/HTML/Parser/Parser.pm files for HTML::Parser -ext/HTML/Parser/Parser.xs files for HTML::Parser -ext/HTML/Parser/t/api_version.t test for HTML::Parser -ext/HTML/Parser/t/argspec2.t test for HTML::Parser -ext/HTML/Parser/t/argspec-bad.t test for HTML::Parser -ext/HTML/Parser/t/argspec.t test for HTML::Parser -ext/HTML/Parser/t/attr-encoded.t test for HTML::Parser -ext/HTML/Parser/t/callback.t test for HTML::Parser -ext/HTML/Parser/t/case-sensitive.t test for HTML::Parser -ext/HTML/Parser/t/cases.t test for HTML::Parser -ext/HTML/Parser/t/comment.t test for HTML::Parser -ext/HTML/Parser/t/crashme.t test for HTML::Parser -ext/HTML/Parser/t/declaration.t test for HTML::Parser -ext/HTML/Parser/t/default.t test for HTML::Parser -ext/HTML/Parser/t/document.t test for HTML::Parser -ext/HTML/Parser/t/dtext.t test for HTML::Parser -ext/HTML/Parser/t/entities2.t test for HTML::Parser -ext/HTML/Parser/t/entities.t test for HTML::Parser -ext/HTML/Parser/t/filter-methods.t test for HTML::Parser -ext/HTML/Parser/t/filter.t test for HTML::Parser -ext/HTML/Parser/t/handler-eof.t test for HTML::Parser -ext/HTML/Parser/t/handler.t test for HTML::Parser -ext/HTML/Parser/t/headparser-http.t test for HTML::Parser -ext/HTML/Parser/t/headparser.t test for HTML::Parser -ext/HTML/Parser/t/ignore.t test for HTML::Parser -ext/HTML/Parser/t/largetags.t test for HTML::Parser -ext/HTML/Parser/t/linkextor-base.t test for HTML::Parser -ext/HTML/Parser/t/linkextor-rel.t test for HTML::Parser -ext/HTML/Parser/t/magic.t test for HTML::Parser -ext/HTML/Parser/t/marked-sect.t test for HTML::Parser -ext/HTML/Parser/t/msie-compat.t test for HTML::Parser -ext/HTML/Parser/t/offset.t test for HTML::Parser -ext/HTML/Parser/tokenpos.h files for HTML::Parser -ext/HTML/Parser/t/options.t test for HTML::Parser -ext/HTML/Parser/t/parsefile.t test for HTML::Parser -ext/HTML/Parser/t/parser.t test for HTML::Parser -ext/HTML/Parser/t/plaintext.t test for HTML::Parser -ext/HTML/Parser/t/pod.t test for HTML::Parser -ext/HTML/Parser/t/process.t test for HTML::Parser -ext/HTML/Parser/t/pullparser.t test for HTML::Parser -ext/HTML/Parser/t/script.t test for HTML::Parser -ext/HTML/Parser/t/skipped-text.t test for HTML::Parser -ext/HTML/Parser/t/stack-realloc.t test for HTML::Parser -ext/HTML/Parser/t/textarea.t test for HTML::Parser -ext/HTML/Parser/t/threads.t test for HTML::Parser -ext/HTML/Parser/t/tokeparser.t test for HTML::Parser -ext/HTML/Parser/t/uentities.t test for HTML::Parser -ext/HTML/Parser/t/unbroken-text.t test for HTML::Parser -ext/HTML/Parser/t/unicode-bom.t test for HTML::Parser -ext/HTML/Parser/t/unicode.t test for HTML::Parser -ext/HTML/Parser/t/xml-mode.t test for HTML::Parser -ext/HTML/Parser/typemap files for HTML::Parser -ext/HTML/Parser/util.c files for HTML::Parser ext/I18N/Langinfo/fallback/const-c.inc I18N::Langinfo ext/I18N/Langinfo/fallback/const-xs.inc I18N::Langinfo ext/I18N/Langinfo/Langinfo.pm I18N::Langinfo @@ -2037,9 +1972,6 @@ lib/Getopt/Std.t See if Getopt::Std and Getopt::Long work lib/h2ph.t See if h2ph works like it should lib/h2xs.t See if h2xs produces expected lists of files lib/hostname.pl Old hostname code -lib/HTML/Tagset.pm HTML::Tagset -lib/HTML/Tagset/t/00_about_verbose.t HTML::Tagset -lib/HTML/Tagset/t/01_old_junk.t HTML::Tagset lib/I18N/Collate.pm Routines to do strxfrm-based collation lib/I18N/Collate.t See if I18N::Collate works lib/I18N/LangTags/ChangeLog I18N::LangTags diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl index 5cae67d..5b7486f 100644 --- a/Porting/Maintainers.pl +++ b/Porting/Maintainers.pl @@ -418,20 +418,6 @@ package Maintainers; 'CPAN' => 1, }, - 'HTML::Parser' => - { - 'MAINTAINER' => 'gaas', - 'FILES' => q[ext/HTML/Parser], - 'CPAN' => 1, - }, - - 'HTML::Tagset' => - { - 'MAINTAINER' => 'petdance', - 'FILES' => q[lib/HTML/Tagset.pm lib/HTML/Tagset], - 'CPAN' => 1, - }, - 'I18N::LangTags' => { 'MAINTAINER' => 'sburke', diff --git a/ext/HTML/Parser/Makefile.PL b/ext/HTML/Parser/Makefile.PL deleted file mode 100644 index 79081f7..0000000 --- a/ext/HTML/Parser/Makefile.PL +++ /dev/null @@ -1,30 +0,0 @@ -require 5.006; -use strict; -use ExtUtils::MakeMaker; - -WriteMakefile( - NAME => 'HTML::Parser', - VERSION_FROM => 'Parser.pm', - H => [ "hparser.h", "hctype.h", "tokenpos.h", "pfunc.h", - "hparser.c", "util.c", - ], - PREREQ_PM => { - 'HTML::Tagset' => 3, - 'Test::More' => 0, # only needed to run 'make test' - }, - DEFINE => "-DMARKED_SECTION", - dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, - clean => { FILES => 'hctype.h pfunc.h' }, -); - - -sub MY::postamble -{ - ' -pfunc.h : mkpfunc - $(PERL) mkpfunc >pfunc.h - -hctype.h : mkhctype - $(PERL) mkhctype >hctype.h -' -} diff --git a/ext/HTML/Parser/Parser.pm b/ext/HTML/Parser/Parser.pm deleted file mode 100644 index 72d5a98..0000000 --- a/ext/HTML/Parser/Parser.pm +++ /dev/null @@ -1,1233 +0,0 @@ -package HTML::Parser; - -# Copyright 1996-2007, Gisle Aas. -# Copyright 1999-2000, Michael A. Chase. -# -# This library is free software; you can redistribute it and/or -# modify it under the same terms as Perl itself. - -use strict; -use vars qw($VERSION @ISA); - -$VERSION = '3.56'; # $Date: 2007/01/12 09:18:31 $ - -require HTML::Entities; - -require XSLoader; -XSLoader::load('HTML::Parser', $VERSION); - -sub new -{ - my $class = shift; - my $self = bless {}, $class; - return $self->init(@_); -} - - -sub init -{ - my $self = shift; - $self->_alloc_pstate; - - my %arg = @_; - my $api_version = delete $arg{api_version} || (@_ ? 3 : 2); - if ($api_version >= 4) { - require Carp; - Carp::croak("API version $api_version not supported " . - "by HTML::Parser $VERSION"); - } - - if ($api_version < 3) { - # Set up method callbacks compatible with HTML-Parser-2.xx - $self->handler(text => "text", "self,text,is_cdata"); - $self->handler(end => "end", "self,tagname,text"); - $self->handler(process => "process", "self,token0,text"); - $self->handler(start => "start", - "self,tagname,attr,attrseq,text"); - - $self->handler(comment => - sub { - my($self, $tokens) = @_; - for (@$tokens) { - $self->comment($_); - } - }, "self,tokens"); - - $self->handler(declaration => - sub { - my $self = shift; - $self->declaration(substr($_[0], 2, -1)); - }, "self,text"); - } - - if (my $h = delete $arg{handlers}) { - $h = {@$h} if ref($h) eq "ARRAY"; - while (my($event, $cb) = each %$h) { - $self->handler($event => @$cb); - } - } - - # In the end we try to assume plain attribute or handler - while (my($option, $val) = each %arg) { - if ($option =~ /^(\w+)_h$/) { - $self->handler($1 => @$val); - } - elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) { - require Carp; - Carp::croak("Bad constructor option '$option'"); - } - else { - $self->$option($val); - } - } - - return $self; -} - - -sub parse_file -{ - my($self, $file) = @_; - my $opened; - if (!ref($file) && ref(\$file) ne "GLOB") { - # Assume $file is a filename - local(*F); - open(F, $file) || return undef; - binmode(F); # should we? good for byte counts - $opened++; - $file = *F; - } - my $chunk = ''; - while (read($file, $chunk, 512)) { - $self->parse($chunk) || last; - } - close($file) if $opened; - $self->eof; -} - - -sub netscape_buggy_comment # legacy -{ - my $self = shift; - require Carp; - Carp::carp("netscape_buggy_comment() is deprecated. " . - "Please use the strict_comment() method instead"); - my $old = !$self->strict_comment; - $self->strict_comment(!shift) if @_; - return $old; -} - -# set up method stubs -sub text { } -*start = \&text; -*end = \&text; -*comment = \&text; -*declaration = \&text; -*process = \&text; - -1; - -__END__ - - -=head1 NAME - -HTML::Parser - HTML parser class - -=head1 SYNOPSIS - - use HTML::Parser (); - - # Create parser object - $p = HTML::Parser->new( api_version => 3, - start_h => [\&start, "tagname, attr"], - end_h => [\&end, "tagname"], - marked_sections => 1, - ); - - # Parse document text chunk by chunk - $p->parse($chunk1); - $p->parse($chunk2); - #... - $p->eof; # signal end of document - - # Parse directly from file - $p->parse_file("foo.html"); - # or - open(my $fh, "<:utf8", "foo.html") || die; - $p->parse_file($fh); - -=head1 DESCRIPTION - -Objects of the C class will recognize markup and -separate it from plain text (alias data content) in HTML -documents. As different kinds of markup and text are recognized, the -corresponding event handlers are invoked. - -C is not a generic SGML parser. We have tried to -make it able to deal with the HTML that is actually "out there", and -it normally parses as closely as possible to the way the popular web -browsers do it instead of strictly following one of the many HTML -specifications from W3C. Where there is disagreement, there is often -an option that you can enable to get the official behaviour. - -The document to be parsed may be supplied in arbitrary chunks. This -makes on-the-fly parsing as documents are received from the network -possible. - -If event driven parsing does not feel right for your application, you -might want to use C. This is an C -subclass that allows a more conventional program structure. - - -=head1 METHODS - -The following method is used to construct a new C object: - -=over - -=item $p = HTML::Parser->new( %options_and_handlers ) - -This class method creates a new C object and -returns it. Key/value argument pairs may be provided to assign event -handlers or initialize parser options. The handlers and parser -options can also be set or modified later by the method calls described below. - -If a top level key is in the form "_h" (e.g., "text_h") then it -assigns a handler to that event, otherwise it initializes a parser -option. The event handler specification value must be an array -reference. Multiple handlers may also be assigned with the 'handlers -=> [%handlers]' option. See examples below. - -If new() is called without any arguments, it will create a parser that -uses callback methods compatible with version 2 of C. -See the section on "version 2 compatibility" below for details. - -The special constructor option 'api_version => 2' can be used to -initialize version 2 callbacks while still setting other options and -handlers. The 'api_version => 3' option can be used if you don't want -to set any options and don't want to fall back to v2 compatible -mode. - -Examples: - - $p = HTML::Parser->new(api_version => 3, - text_h => [ sub {...}, "dtext" ]); - -This creates a new parser object with a text event handler subroutine -that receives the original text with general entities decoded. - - $p = HTML::Parser->new(api_version => 3, - start_h => [ 'my_start', "self,tokens" ]); - -This creates a new parser object with a start event handler method -that receives the $p and the tokens array. - - $p = HTML::Parser->new(api_version => 3, - handlers => { text => [\@array, "event,text"], - comment => [\@array, "event,text"], - }); - -This creates a new parser object that stores the event type and the -original text in @array for text and comment events. - -=back - -The following methods feed the HTML document -to the C object: - -=over - -=item $p->parse( $string ) - -Parse $string as the next chunk of the HTML document. The return -value is normally a reference to the parser object (i.e. $p). -Handlers invoked should not attempt to modify the $string in-place until -$p->parse returns. - -If an invoked event handler aborts parsing by calling $p->eof, then -$p->parse() will return a FALSE value. - -=item $p->parse( $code_ref ) - -If a code reference is passed as the argument to be parsed, then the -chunks to be parsed are obtained by invoking this function repeatedly. -Parsing continues until the function returns an empty (or undefined) -result. When this happens $p->eof is automatically signaled. - -Parsing will also abort if one of the event handlers calls $p->eof. - -The effect of this is the same as: - - while (1) { - my $chunk = &$code_ref(); - if (!defined($chunk) || !length($chunk)) { - $p->eof; - return $p; - } - $p->parse($chunk) || return undef; - } - -But it is more efficient as this loop runs internally in XS code. - -=item $p->parse_file( $file ) - -Parse text directly from a file. The $file argument can be a -filename, an open file handle, or a reference to an open file -handle. - -If $file contains a filename and the file can't be opened, then the -method returns an undefined value and $! tells why it failed. -Otherwise the return value is a reference to the parser object. - -If a file handle is passed as the $file argument, then the file will -normally be read until EOF, but not closed. - -If an invoked event handler aborts parsing by calling $p->eof, -then $p->parse_file() may not have read the entire file. - -On systems with multi-byte line terminators, the values passed for the -offset and length argspecs may be too low if parse_file() is called on -a file handle that is not in binary mode. - -If a filename is passed in, then parse_file() will open the file in -binary mode. - -=item $p->eof - -Signals the end of the HTML document. Calling the $p->eof method -outside a handler callback will flush any remaining buffered text -(which triggers the C event if there is any remaining text). - -Calling $p->eof inside a handler will terminate parsing at that point -and cause $p->parse to return a FALSE value. This also terminates -parsing by $p->parse_file(). - -After $p->eof has been called, the parse() and parse_file() methods -can be invoked to feed new documents with the parser object. - -The return value from eof() is a reference to the parser object. - -=back - - -Most parser options are controlled by boolean attributes. -Each boolean attribute is enabled by calling the corresponding method -with a TRUE argument and disabled with a FALSE argument. The -attribute value is left unchanged if no argument is given. The return -value from each method is the old attribute value. - -Methods that can be used to get and/or set parser options are: - -=over - -=item $p->attr_encoded - -=item $p->attr_encoded( $bool ) - -By default, the C and C<@attr> argspecs will have general -entities for attribute values decoded. Enabling this attribute leaves -entities alone. - -=item $p->boolean_attribute_value( $val ) - -This method sets the value reported for boolean attributes inside HTML -start tags. By default, the name of the attribute is also used as its -value. This affects the values reported for C and C -argspecs. - -=item $p->case_sensitive - -=item $p->case_sensitive( $bool ) - -By default, tagnames and attribute names are down-cased. Enabling this -attribute leaves them as found in the HTML source document. - -=item $p->closing_plaintext - -=item $p->closing_plaintext( $bool ) - -By default, "plaintext" element can never be closed. Everything up to -the end of the document is parsed in CDATA mode. This historical -behaviour is what at least MSIE does. Enabling this attribute makes -closing "" tag effective and the parsing process will resume -after seeing this tag. This emulates gecko-based browsers. - -=item $p->empty_element_tags - -=item $p->empty_element_tags( $bool ) - -By default, empty element tags are not recognized as such and the "/" -before ">" is just treated like a normal name character (unless -C is enabled). Enabling this attribute make -C recognize these tags. - -Empty element tags look like start tags, but end with the character -sequence "/>" instead of ">". When recognized by C they -cause an artificial end event in addition to the start event. The -C for the artificial end event will be empty and the C -array will be undefined even though the the token array will have one -element containing the tag name. - -=item $p->marked_sections - -=item $p->marked_sections( $bool ) - -By default, section markings like are treated like -ordinary text. When this attribute is enabled section markings are -honoured. - -There are currently no events associated with the marked section -markup, but the text can be returned as C. - -=item $p->strict_comment - -=item $p->strict_comment( $bool ) - -By default, comments are terminated by the first occurrence of "-->". -This is the behaviour of most popular browsers (like Mozilla, Opera and -MSIE), but it is not correct according to the official HTML -standard. Officially, you need an even number of "--" tokens before -the closing ">" is recognized and there may not be anything but -whitespace between an even and an odd "--". - -The official behaviour is enabled by enabling this attribute. - -Enabling of 'strict_comment' also disables recognizing these forms as -comments: - - - - - -=item $p->strict_end - -=item $p->strict_end( $bool ) - -By default, attributes and other junk are allowed to be present on end tags in a -manner that emulates MSIE's behaviour. - -The official behaviour is enabled with this attribute. If enabled, -only whitespace is allowed between the tagname and the final ">". - -=item $p->strict_names - -=item $p->strict_names( $bool ) - -By default, almost anything is allowed in tag and attribute names. -This is the behaviour of most popular browsers and allows us to parse -some broken tags with invalid attribute values like: - - [PREV - -By default, "LIST]" is parsed as a boolean attribute, not as -part of the ALT value as was clearly intended. This is also what -Mozilla sees. - -The official behaviour is enabled by enabling this attribute. If -enabled, it will cause the tag above to be reported as text -since "LIST]" is not a legal attribute name. - -=item $p->unbroken_text - -=item $p->unbroken_text( $bool ) - -By default, blocks of text are given to the text handler as soon as -possible (but the parser takes care always to break text at a -boundary between whitespace and non-whitespace so single words and -entities can always be decoded safely). This might create breaks that -make it hard to do transformations on the text. When this attribute is -enabled, blocks of text are always reported in one piece. This will -delay the text event until the following (non-text) event has been -recognized by the parser. - -Note that the C argspec will give you the offset of the first -segment of text and C is the combined length of the segments. -Since there might be ignored tags in between, these numbers can't be -used to directly index in the original document file. - -=item $p->utf8_mode - -=item $p->utf8_mode( $bool ) - -Enable this option when parsing raw undecoded UTF-8. This tells the -parser that the entities expanded for strings reported by C, -C<@attr> and C should be expanded as decoded UTF-8 so they end -up compatible with the surrounding text. - -If C is enabled then it is an error to pass strings -containing characters with code above 255 to the parse() method, and -the parse() method will croak if you try. - -Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 -encoded. The character can also be represented by the entity -"♥" or "♥". If we feed the parser: - - $p->parse("\xE2\x99\xA5♥"); - -then C will be reported as "\xE2\x99\xA5\x{2665}" without -C enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. -The later string is what you want. - -This option is only available with perl-5.8 or better. - -=item $p->xml_mode - -=item $p->xml_mode( $bool ) - -Enabling this attribute changes the parser to allow some XML -constructs. This enables the behaviour controlled by individually by -the C, C, C and -C attributes and also suppresses special treatment of -elements that are parsed as CDATA for HTML. - -=item $p->xml_pic - -=item $p->xml_pic( $bool ) - -By default, I are terminated by ">". When -this attribute is enabled, processing instructions are terminated by -"?>" instead. - -=back - -As markup and text is recognized, handlers are invoked. The following -method is used to set up handlers for different events: - -=over - -=item $p->handler( event => \&subroutine, $argspec ) - -=item $p->handler( event => $method_name, $argspec ) - -=item $p->handler( event => \@accum, $argspec ) - -=item $p->handler( event => "" ); - -=item $p->handler( event => undef ); - -=item $p->handler( event ); - -This method assigns a subroutine, method, or array to handle an event. - -Event is one of C, C, C, C, C, -C, C, C or C. - -The C<\&subroutine> is a reference to a subroutine which is called to handle -the event. - -The C<$method_name> is the name of a method of $p which is called to handle -the event. - -The C<@accum> is an array that will hold the event information as -sub-arrays. - -If the second argument is "", the event is ignored. -If it is undef, the default handler is invoked for the event. - -The C<$argspec> is a string that describes the information to be reported -for the event. Any requested information that does not apply to a -specific event is passed as C. If argspec is omitted, then it -is left unchanged. - -The return value from $p->handler is the old callback routine or a -reference to the accumulator array. - -Any return values from handler callback routines/methods are always -ignored. A handler callback can request parsing to be aborted by -invoking the $p->eof method. A handler callback is not allowed to -invoke the $p->parse() or $p->parse_file() method. An exception will -be raised if it tries. - -Examples: - - $p->handler(start => "start", 'self, attr, attrseq, text' ); - -This causes the "start" method of object $p to be called for 'start' events. -The callback signature is $p->start(\%attr, \@attr_seq, $text). - - $p->handler(start => \&start, 'attr, attrseq, text' ); - -This causes subroutine start() to be called for 'start' events. -The callback signature is start(\%attr, \@attr_seq, $text). - - $p->handler(start => \@accum, '"S", attr, attrseq, text' ); - -This causes 'start' event information to be saved in @accum. -The array elements will be ['S', \%attr, \@attr_seq, $text]. - - $p->handler(start => ""); - -This causes 'start' events to be ignored. It also suppresses -invocations of any default handler for start events. It is in most -cases equivalent to $p->handler(start => sub {}), but is more -efficient. It is different from the empty-sub-handler in that -C is not reset by it. - - $p->handler(start => undef); - -This causes no handler to be associated with start events. -If there is a default handler it will be invoked. - -=back - -Filters based on tags can be set up to limit the number of events -reported. The main bottleneck during parsing is often the huge number -of callbacks made from the parser. Applying filters can improve -performance significantly. - -The following methods control filters: - -=over - -=item $p->ignore_elements( @tags ) - -Both the C event and the C event as well as any events that -would be reported in between are suppressed. The ignored elements can -contain nested occurrences of itself. Example: - - $p->ignore_elements(qw(script style)); - -The C - -å -EOT - -$p->parse($doc)->eof; - -is($text, $doc); -is($dtext, <<"EOT"); -å -ååAAAA - -foo\240bar -foo\240bar -&xyzzy -&xyzzy; - -\1 -\377 -\377 -\377G - -� -� -& -&# -&#x -&aring - - -å -EOT diff --git a/ext/HTML/Parser/t/entities.t b/ext/HTML/Parser/t/entities.t deleted file mode 100644 index b8342f5..0000000 --- a/ext/HTML/Parser/t/entities.t +++ /dev/null @@ -1,193 +0,0 @@ -use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric); - -use Test::More tests => 12; - -$a = "Våre norske tegn bør æres"; - -decode_entities($a); - -is($a, "Våre norske tegn bør æres"); - -encode_entities($a); - -is($a, "Våre norske tegn bør æres"); - -decode_entities($a); -encode_entities_numeric($a); - -is($a, "Våre norske tegn bør æres"); - -$a = "<&>\"'"; -is(encode_entities($a), "<&>"'"); -is(encode_entities_numeric($a), "<&>"'"); - -$a = "abcdef"; -is(encode_entities($a, 'a-c'), "abcdef"); - - -# See how well it does against rfc1866... -$ent = $plain = ""; -while () { - next unless /^\s* -# Subject: HTML entities problem with 5.11 -# To: libwww-perl@ics.uci.edu -# Date: Fri, 05 Sep 1997 16:56:55 +1000 -# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU> -# -# Hi. I've got a problem that has surfaced with the changes to -# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening -# in the process of encoding then decoding special entities. Eg, what goes -# in as "abc&def&ghi" comes out as "abc&def;&ghi;". - -is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;"); - -# Decoding of ' -is(decode_entities("'"), "'"); -is(encode_entities("'", "'"), "'"); - - -__END__ -# Quoted from rfc1866.txt - -14. Proposed Entities - - The HTML DTD references the "Added Latin 1" entity set, which only - supplies named entities for a subset of the non-ASCII characters in - [ISO-8859-1], namely the accented characters. The following entities - should be supported so that all ISO 8859-1 characters may only be - referenced symbolically. The names for these entities are taken from - the appendixes of [SGML]. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Berners-Lee & Connolly Standards Track [Page 75] - -RFC 1866 Hypertext Markup Language - 2.0 November 1995 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Berners-Lee & Connolly Standards Track [Page 76] - -RFC 1866 Hypertext Markup Language - 2.0 November 1995 - - - - - - - - - - - - - - - diff --git a/ext/HTML/Parser/t/entities2.t b/ext/HTML/Parser/t/entities2.t deleted file mode 100644 index 7840c71..0000000 --- a/ext/HTML/Parser/t/entities2.t +++ /dev/null @@ -1,57 +0,0 @@ -#!perl -w - -use strict; -use Test::More tests => 9; - -use HTML::Entities qw(_decode_entities); - -eval { - _decode_entities("<", undef); -}; -like($@, qr/^Can't inline decode readonly string/); - -eval { - my $a = ""; - _decode_entities($a, $a); -}; -like($@, qr/^2nd argument must be hash reference/); - -eval { - my $a = ""; - _decode_entities($a, []); -}; -like($@, qr/^2nd argument must be hash reference/); - -$a = "<"; -_decode_entities($a, undef); -is($a, "<"); - -_decode_entities($a, { "lt" => "<" }); -is($a, "<"); - -my $x = "x" x 20; - -my $err; -for (":", ":a", "a:", "a:a", "a:a:a", "a:::a") { - my $a = $_; - $a =~ s/:/&a;/g; - my $b = $_; - $b =~ s/:/$x/g; - _decode_entities($a, { "a" => $x }); - if ($a ne $b) { - diag "Something went wrong with '$_'"; - $err++; - } -} -ok(!$err); - -$a = "foo bar"; -_decode_entities($a, \%HTML::Entities::entity2char); -is($a, "foo\xA0bar"); - -$a = "foo bar"; -_decode_entities($a, \%HTML::Entities::entity2char); -is($a, "foo bar"); - -_decode_entities($a, \%HTML::Entities::entity2char, 1); -is($a, "foo\xA0bar"); diff --git a/ext/HTML/Parser/t/filter-methods.t b/ext/HTML/Parser/t/filter-methods.t deleted file mode 100644 index 9eccaf1..0000000 --- a/ext/HTML/Parser/t/filter-methods.t +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/perl -w - -use Test::More tests => 12; -use strict; - -use HTML::Parser; - -my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)]); -$p->ignore_elements("script"); -$p->unbroken_text(1); - -$p->handler(default => [], "event, text"); -$p->parse(<<"EOT")->eof; -foo -This is an italic and bold text. - - -EOT - -my $t = join("||", map join("|", @$_), @{$p->handler("default")}); -#diag $t; - -is($t, "start_document|||start|||start|||start|||text|foo||end|||start|||text| -This is an italic and bold text. -||end|||text| -||end|||text| -||end_document|", 'ignore_elements'); - - -#------------------------------------------------------ - -$p = HTML::Parser->new(api_version => 3); -$p->report_tags("a"); -$p->handler(start => sub { - my($tagname, %attr) = @_; - ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start'); - }, 'tagname, @attr'); -$p->handler(end => sub { - my $tagname = shift; - is($tagname, "a", 'report_tags end'); - }, 'tagname'); - -$p->parse(<eof; - -

Next example

- -This is very nice example. - -EOT - - -#------------------------------------------------------ - -my @tags; -$p = HTML::Parser->new(api_version => 3); -$p->report_tags(qw(a em)); -$p->ignore_tags(qw(em)); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<eof; - -

Next example

- -This is yet another very nice example. - -EOT -is(join('|', @tags), 'a', 'report_tags followed by ignore_tags'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->report_tags(qw(h1)); -$p->report_tags(); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<eof; - -

Next example

-

Next example

- -EOT -is(join('|', @tags), 'h1|h2', 'reset report_tags filter'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->report_tags(qw(h1 h2)); -$p->ignore_tags(qw(h2)); -$p->report_tags(qw(h1 h2)); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<eof; - -

Next example

-

Next example

- -EOT -is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->report_tags(qw(h1 h2)); -$p->ignore_tags(qw(h2)); -$p->report_tags(); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<eof; - -

Next example

-

Next example

- -EOT -is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->report_tags(qw(h1 h2)); -$p->report_tags(qw(h3)); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<eof; - -

Next example

-

Next example

-

Next example

- -EOT -is(join('|', @tags), 'h3', 'report_tags replaces filter'); - - -#------------------------------------------------------ - - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->ignore_tags(qw(h1 h2)); -$p->ignore_tags(qw(h3)); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<eof; - -

Next example

-

Next example

-

Next example

- -EOT -is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->ignore_tags(qw(h2)); -$p->ignore_tags(); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<eof; - -

Next example

-

Next example

- -EOT -is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->ignore_tags(qw(h2)); -$p->report_tags(qw(h1 h2)); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<eof; - -

Next example

-

Next example

- -EOT -is(join('|', @tags), 'h1', 'ignore_tags before report_tags'); -#------------------------------------------------------ - -$p = HTML::Parser->new(api_version => 3); -$p->ignore_elements("script"); -my $res=""; -$p->handler(default=> sub {$res.=$_[0];}, 'text'); -$p->parse(<<'EOT')->eof; -A C D F -EOT -is($res,"A C D F\n","ignore without " - ignore this - - - - - - - -Dette er vanlig tekst. Denne teksten definerer også slutten på -<head> delen av dokumentet. - -" - ignore this too - - - - - -Dette er også vanlig tekst som ikke skal blir parset i det hele tatt. - -EOT - -$| = 1; - -#$HTML::HeadParser::DEBUG = 1; -require HTML::HeadParser; -my $p = HTML::HeadParser->new( H->new ); - -if ($p->parse($HTML)) { - fail("Need more data which should not happen"); -} else { - #diag $p->as_string; - pass(); -} - -like($p->header('Title'), qr/Å være eller å ikke være/); -is($p->header('Expires'), 'Soon'); -is($p->header('Content-Base'), 'http://www.sn.no'); -like($p->header('Link'), qr//); - -# This header should not be present because the head ended -ok(!$p->header('Isindex')); - - -# Try feeding one char at a time -my $expected = $p->as_string; -my $nl = 1; -$p = HTML::HeadParser->new(H->new); -while ($HTML =~ /(.)/sg) { - #print STDERR '#' if $nl; - #print STDERR $1; - $nl = $1 eq "\n"; - $p->parse($1) or last; -} -is($p->as_string, $expected); - - -# Try reading it from a file -my $file = "hptest$$.html"; -die "$file already exists" if -e $file; - -open(FILE, ">$file") or die "Can't create $file: $!"; -binmode(FILE); -print FILE $HTML; -print FILE "

This is more content...

\n" x 2000; -print FILE "Buuuh!\n" x 200; -close FILE or die "Can't close $file: $!"; - -$p = HTML::HeadParser->new(H->new); -$p->parse_file($file); -unlink($file) or warn "Can't unlink $file: $!"; - -is($p->header("Title"), "Å være eller å ikke være"); - - -# We got into an infinite loop on data without tags and no EOL. -# This was actually a HTML::Parser bug. -open(FILE, ">$file") or die "Can't create $file: $!"; -print FILE "Foo"; -close(FILE); - -$p = HTML::HeadParser->new(H->new); -$p->parse_file($file); -unlink($file) or warn "Can't unlink $file: $!"; - -ok(!$p->as_string); - -SKIP: { - skip "Need Unicode support", 2 if $] < 5.008; - - # Test that the Unicode BOM does not confuse us? - $p = HTML::HeadParser->new(H->new); - ok($p->parse("\x{FEFF}\nHi <foo>")); - $p->eof; - - is($p->header("title"), "Hi "); -} diff --git a/ext/HTML/Parser/t/ignore.t b/ext/HTML/Parser/t/ignore.t deleted file mode 100644 index 008739e..0000000 --- a/ext/HTML/Parser/t/ignore.t +++ /dev/null @@ -1,27 +0,0 @@ - -use Test::More tests => 4; - -use strict; -use HTML::Parser (); - -my $html = 'text'; - -my $text = ''; -my $p = HTML::Parser->new(default_h => [sub {$text .= shift;}, 'text']); -$p->parse($html)->eof; -is($text, $html); - -$text = ''; -$p->handler(start => ""); -$p->parse($html)->eof; -is($text, 'text'); - -$text = ''; -$p->handler(end => 0); -$p->parse($html)->eof; -is($text, 'text'); - -$text = ''; -$p->handler(start => undef); -$p->parse($html)->eof; -is($text, 'text'); diff --git a/ext/HTML/Parser/t/largetags.t b/ext/HTML/Parser/t/largetags.t deleted file mode 100644 index a9ed3ff..0000000 --- a/ext/HTML/Parser/t/largetags.t +++ /dev/null @@ -1,38 +0,0 @@ -# Exercise the tokenpos buffer allocation routines by feeding it -# very large tags. - -use Test::More tests => 2; - -use strict; -use HTML::Parser (); - -my $p = HTML::Parser->new(api_version => 3); - -$p->handler("start" => - sub { - my $tp = shift; - #diag int(@$tp), " - ", join(", ", @$tp); - is(@$tp, 2 + 26 * 6 * 4); - }, "tokenpos"); - -$p->handler("declaration" => - sub { - my $t = shift; - #diag int(@$t), " - @$t"; - is(@$t, 26 * 6 * 2 + 1); - }, "tokens"); - -$p->parse("parse("$_=1 "); -} -$p->parse(">"); - -$p->parse("parse("$_ -- $_ -- "); -} -$p->parse(">"); -$p->eof; -exit; - diff --git a/ext/HTML/Parser/t/linkextor-base.t b/ext/HTML/Parser/t/linkextor-base.t deleted file mode 100644 index 7ef8f02..0000000 --- a/ext/HTML/Parser/t/linkextor-base.t +++ /dev/null @@ -1,41 +0,0 @@ -# This test that HTML::LinkExtor really absolutize links correctly -# when a base URL is given to the constructor. - -use Test::More tests => 5; -require HTML::LinkExtor; - -SKIP: { -eval { - require URI; -}; -skip $@, 5 if $@; - -# Try with base URL and the $p->links interface. -$p = HTML::LinkExtor->new(undef, "http://www.sn.no/foo/foo.html"); -$p->parse(<eof; - - - - - -This is link and an Image. -HTML - -@p = $p->links; - -# There should be 4 links in the document -is(@p, 4); - -for (@p) { - ($t, %attr) = @$_ if $_->[0] eq 'img'; -} - -is($t, 'img'); - -is(delete $attr{src}, "http://www.sn.no/foo/img.jpg"); - -is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif"); - -ok(!scalar(keys %attr)); # there should be no more attributes -} diff --git a/ext/HTML/Parser/t/linkextor-rel.t b/ext/HTML/Parser/t/linkextor-rel.t deleted file mode 100644 index 1190a96..0000000 --- a/ext/HTML/Parser/t/linkextor-rel.t +++ /dev/null @@ -1,36 +0,0 @@ -use Test::More tests => 4; - -require HTML::LinkExtor; - -$HTML = < - - - - -This is link and an Image. -HTML - - -# Try the callback interface -$links = ""; -$p = HTML::LinkExtor->new( - sub { - my($tag, %links) = @_; - #diag "$tag @{[%links]}"; - $links .= "$tag @{[%links]}\n"; - }); - -$p->parse($HTML); $p->eof; - -ok($links =~ m|^base href http://www\.sn\.no/$|m); -ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m); -ok($links =~ m|^a href link\.html$|m); - -# Used to be problems when using the links method on a document with -# no links it it. This is a test to prove that it works. -$p = new HTML::LinkExtor; -$p->parse("this is a document with no links"); $p->eof; -@a = $p->links; -is(@a, 0); diff --git a/ext/HTML/Parser/t/magic.t b/ext/HTML/Parser/t/magic.t deleted file mode 100644 index 366f275..0000000 --- a/ext/HTML/Parser/t/magic.t +++ /dev/null @@ -1,41 +0,0 @@ -# Check that the magic signature at the top of struct p_state works and that we -# catch modifications to _hparser_xs_state gracefully - -use Test::More tests => 5; - -use HTML::Parser; - -$p = HTML::Parser->new(api_version => 3); - -$p->xml_mode(1); - -# We should not be able to simply modify this stuff -eval { - ${$p->{_hparser_xs_state}} += 4; -}; -like($@, qr/^Modification of a read-only value attempted/); - - -my $x = delete $p->{_hparser_xs_state}; - -eval { - $p->xml_mode(1); -}; -like($@, qr/^Can't find '_hparser_xs_state'/); - -$p->{_hparser_xs_state} = \($$x + 16); - -eval { - $p->xml_mode(1); -}; -like($@, $] >= 5.008 ? qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/); - -$p->{_hparser_xs_state} = 33; -eval { - $p->xml_mode(1); -}; -like($@, qr/^_hparser_xs_state element is not a reference/); - -$p->{_hparser_xs_state} = $x; - -ok($p->xml_mode(0)); diff --git a/ext/HTML/Parser/t/marked-sect.t b/ext/HTML/Parser/t/marked-sect.t deleted file mode 100644 index 6a63478..0000000 --- a/ext/HTML/Parser/t/marked-sect.t +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -my $tag; -my $text; - -use HTML::Parser (); -my $p = HTML::Parser->new(start_h => [sub { $tag = shift }, "tagname"], - text_h => [sub { $text .= shift }, "dtext"], - ); - - -use Test::More tests => 14; - -SKIP: { -eval { - $p->marked_sections(1); -}; -skip $@, 14 if $@; - -$p->parse(""); -is($text, "foo"); - -$p->parse(""); -is($text, "foobar"); - -$p->parse("]]>\n
"); -is($text, "foobarfoo\n"); - -$text = ""; -$p->parse("parse(",bar>]]>
"); -is($text, "<foo]]>"); - -$text = ""; -$p->parse("]]>]]>å
"); -is($text, "å
åå"); -is($tag, "br"); - -$text = ""; -$p->parse("]]>
"); -is($text, ""); - -$text = ""; -$p->parse("]]>
"); -is($text, "fooå
"); - -$text = ""; -$p->parse("]]>
"); -is($text, "fooå
"); - -$text = ""; -$p->parse("]]>
"); -is($text, "fooå"); - -$text = ""; -$p->parse("]]>
"); -is($text, "fooå"); - -# offsets/line/column numbers -$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"], - marked_sections => 1, - ); -$p->parse(<<'EOT')->eof; -Test - -]]> - -

Test

-EOT - -my @x; -sub x { - my($line, $col, $offset, $event, $text) = @_; - $text =~ s/\n/\\n/g; - $text =~ s/ /./g; - push(@x, "$line.$col:$offset $event \"$text\"\n"); -} - -#diag @x; -is(join("", @x), <<'EOT'); -1.0:0 start_document "" -1.0:0 start "" -1.7:7 text "Test" -1.11:11 end "" -1.19:19 text "\n" -3.3:32 text "fooå
\n" -4.3:49 text "\n" -5.4:54 text "\nINCLUDE\nSTUFF\n" -8.3:72 text "\n.." -9.2:75 start "

" -9.6:79 text "Test" -9.10:83 end "

" -9.15:88 text "\n" -10.0:89 end_document "" -EOT - -my $doc = ""; -my $result = ""; -$p = HTML::Parser->new( - marked_sections => 1, - handlers => { - default => [ sub { $result .= join("",@_); }, "skipped_text,text" ] - } -)->parse($doc)->eof; -is($doc, $result); - -$text = ""; -$p = HTML::Parser->new( - text_h => [sub { $text .= shift }, "dtext"], - marked_sections => 1, -); - -$p->parse(""); -is($text, "foo [1]", "CDATA text ending in square bracket"); - -} # SKIP diff --git a/ext/HTML/Parser/t/msie-compat.t b/ext/HTML/Parser/t/msie-compat.t deleted file mode 100644 index 90d4b7e..0000000 --- a/ext/HTML/Parser/t/msie-compat.t +++ /dev/null @@ -1,58 +0,0 @@ -#!perl -w - -use strict; -use HTML::Parser; - -use Test::More tests => 2; - -my $TEXT = ""; -sub h -{ - my($event, $tagname, $text) = @_; - for ($event, $tagname, $text) { - if (defined) { - s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge; - } - else { - $_ = ""; - } - } - - $TEXT .= "[$event,$tagname,$text]\n"; -} - -my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"]); -$p->parse("
"); -$p->parse(""); -$p->parse("' 'bar>' x>"); -$p->parse("\""); -$p->parse(" \"bar>\" x>"); -$p->parse(""); -$p->parse("\" >"); -$p->parse(" -xmp - -EOT - -my $p = HTML::Parser->new(api_version => 3); - -my $sum_len = 0; -my $count = 0; -my $err; - -$p->handler(default => - sub { - my($offset, $length, $offset_end, $line, $col, $text) = @_; - my $copy = $text; - $copy =~ s/\n/\\n/g; - substr($copy, 30) = "..." if length($copy) > 32; - #diag sprintf ">>> %d.%d %s", $line, $col, $copy; - if ($offset != $sum_len) { - diag "offset mismatch $offset vs $sum_len"; - $err++; - } - if ($offset_end != $offset + $length) { - diag "offset_end $offset_end wrong"; - $err++; - } - if ($length != length($text)) { - diag "length mismatch"; - $err++; - } - if (substr($HTML, $offset, $length) ne $text) { - diag "content mismatch"; - $err++; - } - $sum_len += $length; - $count++; - }, - 'offset,length,offset_end,line,column,text'); - -for (split(//, $HTML)) { - $p->parse($_); -} -$p->eof; - -ok($count > 5 && !$err); - - diff --git a/ext/HTML/Parser/t/options.t b/ext/HTML/Parser/t/options.t deleted file mode 100644 index ff5f7db..0000000 --- a/ext/HTML/Parser/t/options.t +++ /dev/null @@ -1,36 +0,0 @@ -# Test option setting methods - -use Test::More tests => 10; - -use strict; -use HTML::Parser (); - -my $p = HTML::Parser->new(api_version => 3, - xml_mode => 1); -my $old; - -$old = $p->boolean_attribute_value("foo"); -ok(!defined $old); - -$old = $p->boolean_attribute_value(); -is($old, "foo"); - -$old = $p->boolean_attribute_value(undef); -is($old, "foo"); -ok(!defined($p->boolean_attribute_value)); - -ok($p->xml_mode(0)); -ok(!$p->xml_mode); - -my $seen_buggy_comment_warning; -$SIG{__WARN__} = - sub { - local $_ = shift; - $seen_buggy_comment_warning++ - if /^netscape_buggy_comment\(\) is deprecated/; - }; - -ok(!$p->strict_comment(1)); -ok($p->strict_comment); -ok(!$p->netscape_buggy_comment); -ok($seen_buggy_comment_warning); diff --git a/ext/HTML/Parser/t/parsefile.t b/ext/HTML/Parser/t/parsefile.t deleted file mode 100644 index f373f06..0000000 --- a/ext/HTML/Parser/t/parsefile.t +++ /dev/null @@ -1,45 +0,0 @@ -use Test::More tests => 6; - -my $filename = "file$$.htm"; -die "$filename is already there" if -e $filename; -open(FILE, ">$filename") || die "Can't create $filename: $!"; -print FILE <<'EOT'; close(FILE); -Heisan -EOT - -{ - package MyParser; - require HTML::Parser; - @ISA=qw(HTML::Parser); - - sub start - { - my($self, $tag, $attr) = @_; - Test::More::is($tag, "title"); - } -} - -MyParser->new->parse_file($filename); -open(FILE, $filename) || die; -MyParser->new->parse_file(*FILE); -seek(FILE, 0, 0) || die; -MyParser->new->parse_file(\*FILE); -close(FILE); - -require IO::File; -my $io = IO::File->new($filename) || die; -MyParser->new->parse_file($io); -$io->seek(0, 0) || die; -MyParser->new->parse_file(*$io); - -my $text = ''; -$io->seek(0, 0) || die; -MyParser->new( - start_h => [ sub{ shift->eof; }, "self" ], - text_h => [ sub{ $text = shift; }, "text" ])->parse_file(*$io); -ok(!$text); - -close($io); # needed because of bug in perl -undef($io); - -unlink($filename) or warn "Can't unlink $filename: $!"; diff --git a/ext/HTML/Parser/t/parser.t b/ext/HTML/Parser/t/parser.t deleted file mode 100644 index 0ce4d95..0000000 --- a/ext/HTML/Parser/t/parser.t +++ /dev/null @@ -1,184 +0,0 @@ -use Test::More tests => 7; - -$HTML = <<'HTML'; - - - - - -Various entities. The parser must never break them in the middle: - -/ -/ -È -௖ -￿ -å-Å - - - -

- - and this is not. - - that Netscape hates --> - -< this > was not a tag. - - - -HTML - -#------------------------------------------------------------------- - -{ - package P; - require HTML::Parser; - @ISA=qw(HTML::Parser); - $OUT=''; - $COUNT=0; - - sub new - { - my $class = shift; - my $self = $class->SUPER::new; - $OUT = ''; - die "Can only have one" if $COUNT++; - $self; - } - - sub DESTROY - { - my $self = shift; - eval { $self->SUPER::DESTROY; }; - $COUNT--; - } - - sub declaration - { - my($self, $decl) = @_; - $OUT .= "[[$decl]]|"; - } - - sub start - { - my($self, $tag, $attr) = @_; - $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr); - $attr = "/$attr" if length $attr; - $OUT .= "<<$tag$attr>>|"; - } - - sub end - { - my($self, $tag) = @_; - $OUT .= ">>$tag<<|"; - } - - sub comment - { - my($self, $comment) = @_; - $OUT .= "##$comment##|"; - } - - sub text - { - my($self, $text) = @_; - #$text =~ s/\n/\\n/g; - #$text =~ s/\t/\\t/g; - #$text =~ s/ /·/g; - $OUT .= "$text|"; - } - - sub result - { - $OUT; - } -} - -for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") { -#for $chunksize (1) { - if ($chunksize =~ /^file/) { - #print "Parsing from $chunksize"; - } else { - #print "Parsing using $chunksize byte chunks"; - } - my $p = P->new; - - if ($chunksize =~ /^file/) { - # First we must create the file - my $tmpfile = "tmp-$$.html"; - my $file = $tmpfile; - die "$file already exists" if -e $file; - open(FILE, ">$file") or die "Can't create $file: $!"; - binmode FILE; - print FILE $HTML; - close(FILE); - - if ($chunksize eq "filehandle") { - require FileHandle; - my $fh = FileHandle->new($file) || die "Can't open $file: $!"; - $file = $fh; - } - - # then we can parse it. - $p->parse_file($file); - close $file if $chunksize eq "filehandle"; - unlink($tmpfile) || warn "Can't unlink $tmpfile: $!"; - } else { - my $copy = $HTML; - while (length $copy) { - my $chunk = substr($copy, 0, $chunksize); - substr($copy, 0, $chunksize) = ''; - $p->parse($chunk); - } - $p->eof; - } - - my $res = $p->result; - my $bad; - - # Then we start looking for things that should not happen - if ($res =~ /\s\|\s/) { - diag "broken space"; - $bad++; - } - for ( - # Make sure entities are not broken - '/', '/', 'È', '௖', '￿', 'å', 'Å', - - # Some elements that should be produced - "|[[DOCTYPE HTML]]|", - "|## this is\na comment ##|", - "|<