inc/HTML/TokeParser.pm

   1 #line 1 "inc/HTML/TokeParser.pm - /System/Library/Perl/Extras/5.8.6/darwin-thread-multi-2level/HTML/TokeParser.pm"
   2 package HTML::TokeParser;
   3
   4 # $Id: TokeParser.pm,v 2.28 2003/10/14 10:11:05 gisle Exp $
   5
   6 require HTML::PullParser;
   7 @ISA=qw(HTML::PullParser);
   8 $VERSION = sprintf("%d.%02d", q$Revision: 2.28 $ =~ /(\d+)\.(\d+)/);
   9
  10 use strict;
  11 use Carp ();
  12 use HTML::Entities qw(decode_entities);
  13 use HTML::Tagset ();
  14
  15 my %ARGS =
  16 (
  17  start       => "'S',tagname,attr,attrseq,text",
  18  end         => "'E',tagname,text",
  19  text        => "'T',text,is_cdata",
  20  process     => "'PI',token0,text",
  21  comment     => "'C',text",
  22  declaration => "'D',text",
  23 );
  24
  25
  26 sub new
  27 {
  28     my $class = shift;
  29     my %cnf;
  30     if (@_ == 1) {
  31         my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
  32         %cnf = ($type => $_[0]);
  33     }
  34     else {
  35         %cnf = @_;
  36     }
  37
  38     my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
  39
  40     my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
  41
  42     $self->{textify} = $textify;
  43     $self;
  44 }
  45
  46
  47 sub get_tag
  48 {
  49     my $self = shift;
  50     my $token;
  51     while (1) {
  52         $token = $self->get_token || return undef;
  53         my $type = shift @$token;
  54         next unless $type eq "S" || $type eq "E";
  55         substr($token->[0], 0, 0) = "/" if $type eq "E";
  56         return $token unless @_;
  57         for (@_) {
  58             return $token if $token->[0] eq $_;
  59         }
  60     }
  61 }
  62
  63
  64 sub _textify {
  65     my($self, $token) = @_;
  66     my $tag = $token->[1];
  67     return undef unless exists $self->{textify}{$tag};
  68
  69     my $alt = $self->{textify}{$tag};
  70     my $text;
  71     if (ref($alt)) {
  72         $text = &$alt(@$token);
  73     } else {
  74         $text = $token->[2]{$alt || "alt"};
  75         $text = "[\U$tag]" unless defined $text;
  76     }
  77     return $text;
  78 }
  79
  80
  81 sub get_text
  82 {
  83     my $self = shift;
  84     my @text;
  85     while (my $token = $self->get_token) {
  86         my $type = $token->[0];
  87         if ($type eq "T") {
  88             my $text = $token->[1];
  89             decode_entities($text) unless $token->[2];
  90             push(@text, $text);
  91         } elsif ($type =~ /^[SE]$/) {
  92             my $tag = $token->[1];
  93             if ($type eq "S") {
  94                 if (defined(my $text = _textify($self, $token))) {
  95                     push(@text, $text);
  96                     next;
  97                 }
  98             } else {
  99                 $tag = "/$tag";
 100             }
 101             if (!@_ || grep $_ eq $tag, @_) {
 102                  $self->unget_token($token);
 103                  last;
 104             }
 105             push(@text, " ")
 106                 if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
 107         }
 108     }
 109     join("", @text);
 110 }
 111
 112
 113 sub get_trimmed_text
 114 {
 115     my $self = shift;
 116     my $text = $self->get_text(@_);
 117     $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
 118     $text;
 119 }
 120
 121 sub get_phrase {
 122     my $self = shift;
 123     my @text;
 124     while (my $token = $self->get_token) {
 125         my $type = $token->[0];
 126         if ($type eq "T") {
 127             my $text = $token->[1];
 128             decode_entities($text) unless $token->[2];
 129             push(@text, $text);
 130         } elsif ($type =~ /^[SE]$/) {
 131             my $tag = $token->[1];
 132             if ($type eq "S") {
 133                 if (defined(my $text = _textify($self, $token))) {
 134                     push(@text, $text);
 135                     next;
 136                 }
 137             }
 138             if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
 139                 $self->unget_token($token);
 140                 last;
 141             }
 142             push(@text, " ") if $tag eq "br";
 143         }
 144     }
 145     my $text = join("", @text);
 146     $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
 147     $text;
 148 }
 149
 150 1;
 151
 152
 153 __END__
 154
 155 #line 341