From: Joe Higton Date: Thu, 27 May 2010 14:51:31 +0000 (+0100) Subject: Improved selector parsing. X-Git-Tag: release_0.009004~53 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=catagits%2FHTML-Zoom.git;a=commitdiff_plain;h=682fa876be5265211f777cf85fac6528d4dd4d41 Improved selector parsing. Mostly: treating each selection clause seperately instead of having special cases for each el + attribute combination --- diff --git a/lib/HTML/Zoom/SelectorParser.pm b/lib/HTML/Zoom/SelectorParser.pm index 9f72b0a..4e4f11c 100644 --- a/lib/HTML/Zoom/SelectorParser.pm +++ b/lib/HTML/Zoom/SelectorParser.pm @@ -7,26 +7,10 @@ use Carp qw(confess); my $sel_char = '-\w_'; my $sel_re = qr/([$sel_char]+)/; +my $match_value_re = qr/"?$sel_re"?/; -sub new { bless({}, shift) } - -my $match_attr_on_regex = sub { - my ($self, $name, $attr, $regex) = @_; - - sub { - $_[0]->{name} && $_[0]->{name} eq $name and - $_[0]->{attrs}{$attr} && $_[0]->{attrs}{$attr} =~ $regex - } -}; - -my $match_attr_on_eq = sub { - my ($self, $name, $attr, $val) = @_; - sub { - $_[0]->{name} && $_[0]->{name} eq $name and - $_[0]->{attrs}{$attr} && $_[0]->{attrs}{$attr} eq $val - } -}; +sub new { bless({}, shift) } sub _raw_parse_simple_selector { for ($_[1]) { # same pos() as outside @@ -36,65 +20,6 @@ sub _raw_parse_simple_selector { /\G\*/gc and return sub { 1 }; - # 'el[attr~="foo"] - - /\G$sel_re\[$sel_re~="$sel_re"\]/gc and - return do { - my $name = $1; - my $attr = $2; - my $val = $3; - sub { - if ( - $_[0]->{name} && $_[0]->{name} eq $name and - $_[0]->{attrs}{$attr} - ) { - my %vals = map { $_ => 1 } split /\s+/, $_[0]->{attrs}{$attr}; - return $vals{$val} - } - return undef - } - }; - - # 'el[attr^="foo"] - - /\G$sel_re\[$sel_re\^="$sel_re"\]/gc and - return do { $_[0]->$match_attr_on_regex($1, $2, qr/^\Q$3\E/) }; - - # 'el[attr$="foo"] - - /\G$sel_re\[$sel_re\$="$sel_re"\]/gc and - return do { $_[0]->$match_attr_on_regex($1, $2, qr/\Q$3\E$/) }; - - # 'el[attr*="foo"] - - /\G$sel_re\[$sel_re\*="$sel_re"\]/gc and - return do { $_[0]->$match_attr_on_regex($1, $2, qr/\Q$3\E/) }; - - # 'el[attr="foo"] - - /\G$sel_re\[$sel_re="$sel_re"\]/gc and - return do { $_[0]->$match_attr_on_eq($1, $2, $3) }; - - # 'el[attr] - - /\G$sel_re\[$sel_re\]/gc and - return do { - my $name = $1; - my $attr = $2; - sub { - $_[0]->{name} && $_[0]->{name} eq $name && $_[0]->{attrs}{$attr} - } - }; - - # 'el.class1' - element + class - - /\G$sel_re\.$sel_re/gc and - return do { $_[0]->$match_attr_on_eq($1, 'class', $2) }; - - # 'el#id' - element + id - /\G$sel_re#$sel_re/gc and - return do { $_[0]->$match_attr_on_eq($1, 'id', $2) }; - # 'element' - match on tag name /\G$sel_re/gc and @@ -123,7 +48,50 @@ sub _raw_parse_simple_selector { } }; - confess "Couldn't parse $_ as starting with simple selector"; + # '[attr^=foo]' - match attribute with ^ anchored regex + /\G\[$sel_re\^=$match_value_re\]/gc and + return do{ + my $attribute = $1; + my $value = $2; + $_[0]->{attrs}{$attribute} + && $_[0]->{attrs}{$attribute} =~ qr/^\Q$value\E/; + }; + + # '[attr$=foo]' - match attribute with $ anchored regex + /\G\[$sel_re\$=$match_value_re\]/gc and + return do{ + my $attribute = $1; + my $value = $2; + $_[0]->{attrs}{$attribute} + && $_[0]->{attrs}{$attribute} =~ qr/\Q$value\E$/; + }; + + # '[attr*=foo] - match attribute with regex: + /\G\[$sel_re\*=$match_value_re\]/gc and + return do{ + my $attribute = $1; + my $value = $2; + $_[0]->{attrs}{$attribute} + && $_[0]->{attrs}{$attribute} =~ qr/\Q$value\E$/; + }; + + # '[attr=bar]' - match attributes + /\G\[$sel_re=$match_value_re\]/gc and + return do { + my $attribute = $1; + my $value = $2; + sub{ + $_[0]->{attrs}{$attribute} + && $_[0]->{attrs}{$attribute} eq $value; + } + }; + + # '[attr] - match attribute being present: + /\G\[$sel_re\]/gc and + return do { + my $attribute = $1; + $_[0]->{attrs}{$attribute}; + } } } @@ -135,10 +103,40 @@ sub parse_selector { for ($sel) { my @sub; PARSE: { do { - push(@sub, $self->_raw_parse_simple_selector($_)); - last PARSE if (pos == length); - /\G\s*,\s*/gc or confess "Selectors not comma separated"; - } until (pos == length) }; + + my @this_chain; + + # slurp selectors until we find something else: + while( my $sel = $self->_raw_parse_simple_selector($_) ){ + push @this_chain, $sel; + } + + if( @this_chain == 1 ) + { + push @sub, @this_chain; + } + else{ + # make a compound match closure of everything + # in this chain of selectors: + push @sub, sub{ + my $r; + for my $inner ( @this_chain ){ + if( ! ($r = $inner->( @_ )) ){ + return $r; + } + } + return $r; + } + } + + # now we're at the end or a delimiter: + last PARSE if( pos == length ); + /\G\s*,\s*/gc or do { + /\G(.*)/; + $self->_blam( "Selectors not comma separated." ); + } + + } until (pos == length) }; return $sub[0] if (@sub == 1); return sub { foreach my $inner (@sub) { @@ -149,4 +147,12 @@ sub parse_selector { } +sub _blam { + my ($self, $error) = @_; + my $hat = (' ' x (pos||0)).'^'; + die "Error parsing dispatch specification: ${error}\n +${_} +${hat} here\n"; +} + 1; diff --git a/t/selectors.t b/t/selectors.t index 01e11cf..59d9c7c 100644 --- a/t/selectors.t +++ b/t/selectors.t @@ -4,6 +4,16 @@ use Test::More; use HTML::Zoom; +my $tmpl = < +
+ Bob + Builder +
+
+ +END + # el#id is( HTML::Zoom->from_html('
') ->select('div#yo') @@ -36,6 +46,15 @@ is( HTML::Zoom->from_html('
') '
grg
', 'E[attr="val"] works' ); +# el[attr=foo] +is( HTML::Zoom->from_html('
') + ->select('div[frew=yo]') + ->replace_content('grg') + ->to_html, + '
grg
', + 'E[attr=val] works' ); + + # el[attr*="foo"] is( HTML::Zoom->from_html('
') ->select('div[f*="oo"]') @@ -68,6 +87,14 @@ is( HTML::Zoom->from_html('
') '
grg
', 'E[attr*="val"] works' ); +# [attr=bar] +ok( check_select( '[prop=moo]'), '[attr=bar]' ); + +# el[attr=bar],[prop=foo] +is( check_select('span[class=career],[prop=moo]'), 2, + 'Multiple selectors: el[attr=bar],[attr=foo]'); + + # sel1 sel2 is( HTML::Zoom->from_html('
') ->select('table tr') @@ -86,3 +113,16 @@ is( HTML::Zoom->from_html('
') 'sel1 sel2 sel3 works' ); done_testing; + + +sub check_select{ + # less crude?: + my $output = HTML::Zoom + ->from_html($tmpl) + ->select(shift)->replace("the monkey")->to_html; + my $count = 0; + while ( $output =~ /\G?.*the monkey/gc ){ + $count++; + } + return $count; +}