X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2FQ-Branch.git;a=blobdiff_plain;f=lib%2FSQL%2FAbstract%2FTree.pm;h=3791fe9d2e94b69ccf025fb1cf4dd9d18b407327;hp=81a360d24df2a5e36ab30c8ef71d99b943fdc08d;hb=cf5b7ab163f8ac123ebc9bb1156e79646cd5bd2f;hpb=0c2de280869928d9ff1ee95f36a9a45318766990 diff --git a/lib/SQL/Abstract/Tree.pm b/lib/SQL/Abstract/Tree.pm index 81a360d..3791fe9 100644 --- a/lib/SQL/Abstract/Tree.pm +++ b/lib/SQL/Abstract/Tree.pm @@ -1,38 +1,10 @@ package SQL::Abstract::Tree; -use strict; -use warnings; +use Moo; no warnings 'qw'; -use Carp; - -use Hash::Merge qw//; - -use base 'Class::Accessor::Grouped'; - -__PACKAGE__->mk_group_accessors( simple => qw( - newline indent_string indent_amount colormap indentmap fill_in_placeholders - placeholder_surround -)); - -my $merger = Hash::Merge->new; -$merger->specify_behavior({ - SCALAR => { - SCALAR => sub { $_[1] }, - ARRAY => sub { [ $_[0], @{$_[1]} ] }, - HASH => sub { $_[1] }, - }, - ARRAY => { - SCALAR => sub { $_[1] }, - ARRAY => sub { $_[1] }, - HASH => sub { $_[1] }, - }, - HASH => { - SCALAR => sub { $_[1] }, - ARRAY => sub { [ values %{$_[0]}, @{$_[1]} ] }, - HASH => sub { Hash::Merge::_merge_hashes( $_[0], $_[1] ) }, - }, -}, 'SQLA::Tree Behavior' ); +use Carp; +use Sub::Quote 'quote_sub'; my $op_look_ahead = '(?: (?= [\s\)\(\;] ) | \z)'; my $op_look_behind = '(?: (?<= [\,\s\)\(] ) | \A )'; @@ -68,6 +40,7 @@ my @expression_start_keywords = ( 'HAVING', 'ORDER \s+ BY', 'SKIP', + 'FETCH', 'FIRST', 'LIMIT', 'OFFSET', @@ -82,7 +55,6 @@ my @expression_start_keywords = ( 'SAVEPOINT', 'RELEASE \s+ SAVEPOINT', 'RETURNING', - 'ROW_NUMBER \s* \( \s* \) \s+ OVER', ); my $expr_start_re = join ("\n\t|\n", @expression_start_keywords ); @@ -91,30 +63,32 @@ $expr_start_re = qr/ $op_look_behind (?i: $expr_start_re ) $op_look_ahead /x; # These are binary operator keywords always a single LHS and RHS # * AND/OR are handled separately as they are N-ary # * so is NOT as being unary -# * BETWEEN without paranthesis around the ANDed arguments (which -# makes it a non-binary op) is detected and accomodated in +# * BETWEEN without parentheses around the ANDed arguments (which +# makes it a non-binary op) is detected and accommodated in # _recurse_parse() # * AS is not really an operator but is handled here as it's also LHS/RHS # this will be included in the $binary_op_re, the distinction is interesting during -# testing as one is tighter than the other, plus mathops have different look -# ahead/behind (e.g. "x"="y" ) -my @math_op_keywords = (qw/ - + < > != <> = <= >= /); -my $math_op_re = join ("\n\t|\n", map +# testing as one is tighter than the other, plus alphanum cmp ops have different +# look ahead/behind (e.g. "x"="y" ) +my @alphanum_cmp_op_keywords = (qw/< > != <> = <= >= /); +my $alphanum_cmp_op_re = join ("\n\t|\n", map { "(?: (?<= [\\w\\s] | $quote_right ) | \\A )" . quotemeta ($_) . "(?: (?= [\\w\\s] | $quote_left ) | \\z )" } - @math_op_keywords + @alphanum_cmp_op_keywords ); -$math_op_re = qr/$math_op_re/x; +$alphanum_cmp_op_re = qr/$alphanum_cmp_op_re/x; -my $binary_op_re = '(?: NOT \s+)? (?:' . join ('|', qw/IN BETWEEN R?LIKE/) . ')'; +my $binary_op_re = '(?: NOT \s+)? (?:' . join ('|', qw/IN BETWEEN [RI]?LIKE REGEXP/) . ')'; $binary_op_re = join "\n\t|\n", "$op_look_behind (?i: $binary_op_re | AS ) $op_look_ahead", - $math_op_re, + $alphanum_cmp_op_re, $op_look_behind . 'IS (?:\s+ NOT)?' . "(?= \\s+ NULL \\b | $op_look_ahead )", ; $binary_op_re = qr/$binary_op_re/x; -my $unary_op_re = '(?: NOT \s+ EXISTS | NOT )'; +my $rno_re = qr/ROW_NUMBER \s* \( \s* \) \s+ OVER/ix; + +my $unary_op_re = 'NOT \s+ EXISTS | NOT | ' . $rno_re; $unary_op_re = join "\n\t|\n", "$op_look_behind (?i: $unary_op_re ) $op_look_ahead", ; @@ -129,14 +103,14 @@ my $tokenizer_re = join("\n\t|\n", $unary_op_re, $asc_desc_re, $and_or_re, - "$op_look_behind \\* $op_look_ahead", + $op_look_behind . ' \* ' . $op_look_ahead, (map { quotemeta $_ } qw/, ( )/), $placeholder_re, ); # this one *is* capturing for the split below # splits on whitespace if all else fails -# has to happen before the composiign qr's are anchored (below) +# has to happen before the composing qr's are anchored (below) $tokenizer_re = qr/ \s* ( $tokenizer_re ) \s* | \s+ /x; # Parser states for _recurse_parse() @@ -149,8 +123,7 @@ use constant PARSE_LIST_ELT => 5; my $expr_term_re = qr/$expr_start_re | \)/x; my $rhs_term_re = qr/ $expr_term_re | $binary_op_re | $unary_op_re | $asc_desc_re | $and_or_re | \, /x; -my $common_single_args_re = qr/ \* | $placeholder_re /x; -my $all_std_keywords_re = qr/ $rhs_term_re | \( | $common_single_args_re /x; +my $all_std_keywords_re = qr/ $rhs_term_re | \( | $placeholder_re /x; # anchor everything - even though keywords are separated by the tokenizer, leakage may occur for ( @@ -158,20 +131,20 @@ for ( $quote_right, $placeholder_re, $expr_start_re, - $math_op_re, + $alphanum_cmp_op_re, $binary_op_re, $unary_op_re, $asc_desc_re, $and_or_re, $expr_term_re, $rhs_term_re, - $common_single_args_re, $all_std_keywords_re, ) { $_ = qr/ \A $_ \z /x; } - +# what can be bunched together under one MISC in an AST +my $compressable_node_re = qr/^ \- (?: MISC | LITERAL | PLACEHOLDER ) $/x; my %indents = ( select => 0, @@ -195,18 +168,33 @@ my %indents = ( first => 1, ); -my %profiles = ( - console => { - fill_in_placeholders => 1, - placeholder_surround => ['?/', ''], - indent_string => ' ', - indent_amount => 2, - newline => "\n", - colormap => {}, - indentmap => \%indents, - - eval { require Term::ANSIColor } - ? do { + +has [qw( + newline indent_string indent_amount fill_in_placeholders placeholder_surround +)] => (is => 'ro'); + +has [qw( indentmap colormap )] => ( is => 'ro', default => quote_sub('{}') ); + +# class global is in fact desired +my $merger; + +sub BUILDARGS { + my $class = shift; + my $args = ref $_[0] eq 'HASH' ? $_[0] : {@_}; + + if (my $p = delete $args->{profile}) { + my %extra_args; + if ($p eq 'console') { + %extra_args = ( + fill_in_placeholders => 1, + placeholder_surround => ['?/', ''], + indent_string => ' ', + indent_amount => 2, + newline => "\n", + colormap => {}, + indentmap => \%indents, + + ! ( eval { require Term::ANSIColor } ) ? () : do { my $c = \&Term::ANSIColor::color; my $red = [$c->('red') , $c->('reset')]; @@ -251,79 +239,86 @@ my %profiles = ( offset => $green, } ); - } : (), - }, - console_monochrome => { - fill_in_placeholders => 1, - placeholder_surround => ['?/', ''], - indent_string => ' ', - indent_amount => 2, - newline => "\n", - colormap => {}, - indentmap => \%indents, - }, - html => { - fill_in_placeholders => 1, - placeholder_surround => ['', ''], - indent_string => ' ', - indent_amount => 2, - newline => "
\n", - colormap => { - select => ['' , ''], - 'insert into' => ['' , ''], - update => ['' , ''], - 'delete from' => ['' , ''], - - set => ['', ''], - from => ['' , ''], - - where => ['' , ''], - values => ['', ''], - - join => ['' , ''], - 'left join' => ['',''], - on => ['' , ''], - - 'group by' => ['', ''], - having => ['', ''], - 'order by' => ['', ''], - - skip => ['', ''], - first => ['', ''], - limit => ['', ''], - offset => ['', ''], - - 'begin work' => ['', ''], - commit => ['', ''], - rollback => ['', ''], - savepoint => ['', ''], - 'rollback to savepoint' => ['', ''], - 'release savepoint' => ['', ''], - }, - indentmap => \%indents, - }, - none => { - colormap => {}, - indentmap => {}, - }, -); - -sub new { - my $class = shift; - my $args = shift || {}; - - my $profile = delete $args->{profile} || 'none'; + }, + ); + } + elsif ($p eq 'console_monochrome') { + %extra_args = ( + fill_in_placeholders => 1, + placeholder_surround => ['?/', ''], + indent_string => ' ', + indent_amount => 2, + newline => "\n", + indentmap => \%indents, + ); + } + elsif ($p eq 'html') { + %extra_args = ( + fill_in_placeholders => 1, + placeholder_surround => ['', ''], + indent_string => ' ', + indent_amount => 2, + newline => "
\n", + colormap => { map { + (my $class = $_) =~ s/\s+/-/g; + ( $_ => [ qq||, '' ] ) + } ( + keys %indents, + qw(commit rollback savepoint), + 'begin work', 'rollback to savepoint', 'release savepoint', + ) }, + indentmap => \%indents, + ); + } + elsif ($p eq 'none') { + # nada + } + else { + croak "No such profile '$p'"; + } - die "No such profile '$profile'!" unless exists $profiles{$profile}; + # see if we got any duplicates and merge if needed + if (scalar grep { exists $args->{$_} } keys %extra_args) { + # heavy-duty merge + $args = ($merger ||= do { + require Hash::Merge; + my $m = Hash::Merge->new; + + $m->specify_behavior({ + SCALAR => { + SCALAR => sub { $_[1] }, + ARRAY => sub { [ $_[0], @{$_[1]} ] }, + HASH => sub { $_[1] }, + }, + ARRAY => { + SCALAR => sub { $_[1] }, + ARRAY => sub { $_[1] }, + HASH => sub { $_[1] }, + }, + HASH => { + SCALAR => sub { $_[1] }, + ARRAY => sub { [ values %{$_[0]}, @{$_[1]} ] }, + HASH => sub { Hash::Merge::_merge_hashes( $_[0], $_[1] ) }, + }, + }, 'SQLA::Tree Behavior' ); + + $m; + })->merge(\%extra_args, $args ); - my $data = $merger->merge( $profiles{$profile}, $args ); + } + else { + $args = { %extra_args, %$args }; + } + } - bless $data, $class + $args; } sub parse { my ($self, $s) = @_; + return [] unless defined $s; + # tokenize string, and remove all optional whitespace my $tokens = []; foreach my $token (split $tokenizer_re, $s) { @@ -345,7 +340,7 @@ sub _recurse_parse { my @left; while (1) { # left-associative parsing - if ( ! @$tokens + if (! @$tokens or ($state == PARSE_IN_PARENS && $tokens->[0] eq ')') or @@ -419,19 +414,23 @@ sub _recurse_parse { @right = $self->_recurse_parse($tokens, PARSE_IN_EXPR); } - @left = [$op => [ @left, @right ]]; + push @left, [$op => [ (@left ? pop @left : ''), @right ]]; } # unary op keywords - elsif ( $token =~ $unary_op_re ) { + elsif ($token =~ $unary_op_re) { my $op = uc $token; - my @right = $self->_recurse_parse ($tokens, PARSE_RHS); + + # normalize RNO explicitly + $op = 'ROW_NUMBER() OVER' if $op =~ /^$rno_re$/; + + my @right = $self->_recurse_parse($tokens, PARSE_RHS); push @left, [ $op => \@right ]; } # expression terminator keywords - elsif ( $token =~ $expr_start_re ) { + elsif ($token =~ $expr_start_re) { my $op = uc $token; my @right = $self->_recurse_parse($tokens, PARSE_IN_EXPR); @@ -439,49 +438,66 @@ sub _recurse_parse { } # a '?' - elsif ( $token =~ $placeholder_re) { + elsif ($token =~ $placeholder_re) { push @left, [ -PLACEHOLDER => [ $token ] ]; } # check if the current token is an unknown op-start - elsif (@$tokens and ($tokens->[0] eq '(' or $tokens->[0] =~ $common_single_args_re ) ) { + elsif (@$tokens and ($tokens->[0] eq '(' or $tokens->[0] =~ $placeholder_re ) ) { push @left, [ $token => [ $self->_recurse_parse($tokens, PARSE_RHS) ] ]; } # we're now in "unknown token" land - start eating tokens until - # we see something familiar + # we see something familiar, OR in the case of RHS (binop) stop + # after the first token + # Also stop processing when we could end up with an unknown func else { my @lits = [ -LITERAL => [$token] ]; - while (@$tokens and $tokens->[0] !~ $all_std_keywords_re) { - push @lits, [ -LITERAL => [ shift @$tokens ] ]; - } + unshift @lits, pop @left if @left == 1; - if (@left == 1) { - unshift @lits, pop @left; - } + unless ( $state == PARSE_RHS ) { + while ( + @$tokens + and + $tokens->[0] !~ $all_std_keywords_re + and + ! (@$tokens > 1 and $tokens->[1] eq '(') + ) { + push @lits, [ -LITERAL => [ shift @$tokens ] ]; + } + } @lits = [ -MISC => [ @lits ] ] if @lits > 1; push @left, @lits; } - # deal with post-fix operators (only when sql is sane - i.e. we have one element to apply to) - if (@left == 1 and @$tokens) { - - # asc/desc - if ($tokens->[0] =~ $asc_desc_re) { - my $op = shift @$tokens; - - # if -MISC - this is a literal collection, do not promote asc/desc to an op - if ($left[0][0] eq '-MISC') { - push @{$left[0][1]}, [ -LITERAL => [ $op ] ]; + # compress -LITERAL -MISC and -PLACEHOLDER pieces into a single + # -MISC container + if (@left > 1) { + my $i = 0; + while ($#left > $i) { + if ($left[$i][0] =~ $compressable_node_re and $left[$i+1][0] =~ $compressable_node_re) { + splice @left, $i, 2, [ -MISC => [ + map { $_->[0] eq '-MISC' ? @{$_->[1]} : $_ } (@left[$i, $i+1]) + ]]; } else { - @left = [ ('-' . uc ($op)) => [ @left ] ]; + $i++; } } } + + return @left if $state == PARSE_RHS; + + # deal with post-fix operators + if (@$tokens) { + # asc/desc + if ($tokens->[0] =~ $asc_desc_re) { + @left = [ ('-' . uc (shift @$tokens)) => [ @left ] ]; + } + } } } @@ -596,14 +612,17 @@ sub _unparse { } else { my ($l, $r) = @{$self->pad_keyword($op, $depth)}; - return sprintf "$l%s%s%s$r", - $self->format_keyword($op), + + my $rhs = $self->_unparse($args, $bindargs, $depth); + + return sprintf "$l%s$r", join( ( ref $args eq 'ARRAY' and @{$args} == 1 and $args->[0][0] eq '-PAREN' ) ? '' # mysql-- : ' ' , - $self->_unparse($args, $bindargs, $depth), - ; + $self->format_keyword($op), + (length $rhs ? $rhs : () ), + ); } } @@ -638,25 +657,38 @@ sub _parenthesis_unroll { next; } + my $parent_op = $ast->[0]; + # unroll nested parenthesis - while ( @{$child->[1]} == 1 and $child->[1][0][0] eq '-PAREN') { + while ( $parent_op ne 'IN' and @{$child->[1]} == 1 and $child->[1][0][0] eq '-PAREN') { $child = $child->[1][0]; $changes++; } - # if the parent operator explcitly allows it nuke the parenthesis - if ( $ast->[0] =~ $unrollable_ops_re ) { + # set to CHILD in the case of PARENT ( CHILD ) + # but NOT in the case of PARENT( CHILD1, CHILD2 ) + my $single_child_op = (@{$child->[1]} == 1) ? $child->[1][0][0] : ''; + + my $child_op_argc = $single_child_op ? scalar @{$child->[1][0][1]} : undef; + + my $single_grandchild_op + = ( $child_op_argc||0 == 1 and ref $child->[1][0][1][0] eq 'ARRAY' ) + ? $child->[1][0][1][0][0] + : '' + ; + + # if the parent operator explicitly allows it AND the child isn't a subselect + # nuke the parenthesis + if ($parent_op =~ $unrollable_ops_re and $single_child_op ne 'SELECT') { push @children, @{$child->[1]}; $changes++; } # if the parenthesis are wrapped around an AND/OR matching the parent AND/OR - open the parenthesis up and merge the list elsif ( - @{$child->[1]} == 1 - and - ( $ast->[0] eq 'AND' or $ast->[0] eq 'OR') - and - $child->[1][0][0] eq $ast->[0] + $single_child_op eq $parent_op + and + ( $parent_op eq 'AND' or $parent_op eq 'OR') ) { push @children, @{$child->[1][0][1]}; $changes++; @@ -665,13 +697,9 @@ sub _parenthesis_unroll { # only *ONE* LITERAL or placeholder element # as an AND/OR/NOT argument elsif ( - @{$child->[1]} == 1 && ( - $child->[1][0][0] eq '-LITERAL' - or - $child->[1][0][0] eq '-PLACEHOLDER' - ) && ( - $ast->[0] eq 'AND' or $ast->[0] eq 'OR' or $ast->[0] eq 'NOT' - ) + ( $single_child_op eq '-LITERAL' or $single_child_op eq '-PLACEHOLDER' ) + and + ( $parent_op eq 'AND' or $parent_op eq 'OR' or $parent_op eq 'NOT' ) ) { push @children, @{$child->[1]}; $changes++; @@ -684,20 +712,18 @@ sub _parenthesis_unroll { # break precedence) or when the child is BETWEEN (special # case) elsif ( - @{$child->[1]} == 1 - and - ($ast->[0] eq 'AND' or $ast->[0] eq 'OR') + ($parent_op eq 'AND' or $parent_op eq 'OR') and - $child->[1][0][0] =~ $binary_op_re + $single_child_op =~ $binary_op_re and - $child->[1][0][0] ne 'BETWEEN' + $single_child_op ne 'BETWEEN' and - @{$child->[1][0][1]} == 2 + $child_op_argc == 2 and ! ( - $child->[1][0][0] =~ $math_op_re + $single_child_op =~ $alphanum_cmp_op_re and - $ast->[0] =~ $math_op_re + $parent_op =~ $alphanum_cmp_op_re ) ) { push @children, @{$child->[1]}; @@ -711,20 +737,20 @@ sub _parenthesis_unroll { # or a single non-mathop with a single LITERAL ( nonmathop foo ) # or a single non-mathop with a single PLACEHOLDER ( nonmathop ? ) elsif ( - @{$child->[1]} == 1 + $single_child_op and - @{$child->[1][0][1]} == 1 + $parent_op =~ $alphanum_cmp_op_re and - $ast->[0] =~ $math_op_re + $single_child_op !~ $alphanum_cmp_op_re and - $child->[1][0][0] !~ $math_op_re + $child_op_argc == 1 and ( - $child->[1][0][1][0][0] eq '-PAREN' + $single_grandchild_op eq '-PAREN' or - $child->[1][0][1][0][0] eq '-LITERAL' + $single_grandchild_op eq '-LITERAL' or - $child->[1][0][1][0][0] eq '-PLACEHOLDER' + $single_grandchild_op eq '-PLACEHOLDER' ) ) { push @children, @{$child->[1]}; @@ -733,16 +759,17 @@ sub _parenthesis_unroll { # a construct of ... ( somefunc ( ... ) ) ... can safely lose the outer parens # except for the case of ( NOT ( ... ) ) which has already been handled earlier + # and except for the case of RNO, where the double are explicit syntax elsif ( - @{$child->[1]} == 1 + $parent_op ne 'ROW_NUMBER() OVER' and - @{$child->[1][0][1]} == 1 + $single_child_op and - $child->[1][0][0] ne 'NOT' + $single_child_op ne 'NOT' and - ref $child->[1][0][1][0] eq 'ARRAY' + $child_op_argc == 1 and - $child->[1][0][1][0][0] eq '-PAREN' + $single_grandchild_op eq '-PAREN' ) { push @children, @{$child->[1]}; $changes++;