X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FSQL%2FAbstract%2FTree.pm;h=1a4560c66c058e4bb73d6941d58f663c1125c634;hb=007f08535b6449047aa3bbc02d88a41b70d2e74c;hp=90d3052a4e0f3fa3b2416ece99c63c3c7803b2c7;hpb=a1e204f4ac8bc478a6821aca8df5cfa4af18a628;p=dbsrgits%2FSQL-Abstract.git diff --git a/lib/SQL/Abstract/Tree.pm b/lib/SQL/Abstract/Tree.pm index 90d3052..1a4560c 100644 --- a/lib/SQL/Abstract/Tree.pm +++ b/lib/SQL/Abstract/Tree.pm @@ -2,6 +2,7 @@ package SQL::Abstract::Tree; use strict; use warnings; +no warnings 'qw'; use Carp; use Hash::Merge qw//; @@ -33,19 +34,14 @@ $merger->specify_behavior({ }, }, 'SQLA::Tree Behavior' ); - -# Parser states for _recurse_parse() -use constant PARSE_TOP_LEVEL => 0; -use constant PARSE_IN_EXPR => 1; -use constant PARSE_IN_PARENS => 2; -use constant PARSE_RHS => 3; -use constant PARSE_IN_FUNC => 4; - my $op_look_ahead = '(?: (?= [\s\)\(\;] ) | \z)'; -my $op_look_behind = '(?: (?<= [\s\)\(] ) | \A )'; +my $op_look_behind = '(?: (?<= [\,\s\)\(] ) | \A )'; + my $quote_left = qr/[\`\'\"\[]/; my $quote_right = qr/[\`\'\"\]]/; +my $placeholder_re = qr/(?: \? | \$\d+ )/x; + # These SQL keywords always signal end of the current expression (except inside # of a parenthesized subexpression). # Format: A list of strings that will be compiled to extended syntax ie. @@ -54,10 +50,10 @@ my $quote_right = qr/[\`\'\"\]]/; my @expression_start_keywords = ( 'SELECT', 'UPDATE', + 'SET', 'INSERT \s+ INTO', 'DELETE \s+ FROM', 'FROM', - 'SET', '(?: (?: (?: (?: LEFT | RIGHT | FULL ) \s+ )? @@ -67,23 +63,31 @@ my @expression_start_keywords = ( )', 'ON', 'WHERE', - 'VALUES', - 'EXISTS', + '(?: DEFAULT \s+ )? VALUES', + '(?: NOT \s+)? EXISTS', 'GROUP \s+ BY', 'HAVING', 'ORDER \s+ BY', + 'SKIP', + 'FIRST', 'LIMIT', 'OFFSET', 'FOR', 'UNION', 'INTERSECT', 'EXCEPT', + 'BEGIN \s+ WORK', + 'COMMIT', + 'ROLLBACK \s+ TO \s+ SAVEPOINT', + 'ROLLBACK', + 'SAVEPOINT', + 'RELEASE \s+ SAVEPOINT', 'RETURNING', 'ROW_NUMBER \s* \( \s* \) \s+ OVER', ); -my $exp_start_re = join ("\n\t|\n", @expression_start_keywords ); -$exp_start_re = qr/ $op_look_behind (?i: $exp_start_re ) $op_look_ahead /xo; +my $expr_start_re = join ("\n\t|\n", @expression_start_keywords ); +$expr_start_re = qr/ $op_look_behind (?i: $expr_start_re ) $op_look_ahead /x; # These are binary operator keywords always a single LHS and RHS # * AND/OR are handled separately as they are N-ary @@ -91,6 +95,7 @@ $exp_start_re = qr/ $op_look_behind (?i: $exp_start_re ) $op_look_ahead /xo; # * BETWEEN without paranthesis around the ANDed arguments (which # makes it a non-binary op) is detected and accomodated in # _recurse_parse() +# * AS is not really an operator but is handled here as it's also LHS/RHS # this will be included in the $binary_op_re, the distinction is interesting during # testing as one is tighter than the other, plus mathops have different look @@ -100,27 +105,45 @@ my $math_re = join ("\n\t|\n", map { "(?: (?<= [\\w\\s] | $quote_right ) | \\A )" . quotemeta ($_) . "(?: (?= [\\w\\s] | $quote_left ) | \\z )" } @math_op_keywords ); -$math_re = qr/$math_re/xo; +$math_re = qr/$math_re/x; sub _math_op_re { $math_re } my $binary_op_re = '(?: NOT \s+)? (?:' . join ('|', qw/IN BETWEEN R?LIKE/) . ')'; -$binary_op_re = "(?: $op_look_behind (?i: $binary_op_re ) $op_look_ahead ) \n\t|\n $math_re"; -$binary_op_re = qr/$binary_op_re/xo; +$binary_op_re = join "\n\t|\n", + "$op_look_behind (?i: $binary_op_re | AS ) $op_look_ahead", + $math_re, + $op_look_behind . 'IS (?:\s+ NOT)?' . "(?= \\s+ NULL \\b | $op_look_ahead )", +; +$binary_op_re = qr/$binary_op_re/x; sub _binary_op_re { $binary_op_re } - -my $tokenizer_re = join("\n\t|\n", - $exp_start_re, +my $all_known_re = join("\n\t|\n", + $expr_start_re, $binary_op_re, - "$op_look_behind (?i: AND|OR|NOT ) $op_look_ahead", - (map { quotemeta $_ } qw/( ) ? */), + "$op_look_behind (?i: AND|OR|NOT|\\* ) $op_look_ahead", + (map { quotemeta $_ } qw/, ( )/), + $placeholder_re, ); -#this one *is* capturing -$tokenizer_re = qr/ \s* ( $tokenizer_re ) \s* /x; +$all_known_re = qr/$all_known_re/x; + +#this one *is* capturing for the split below +# splits on whitespace if all else fails +my $tokenizer_re = qr/ \s* ( $all_known_re ) \s* | \s+ /x; + +# Parser states for _recurse_parse() +use constant PARSE_TOP_LEVEL => 0; +use constant PARSE_IN_EXPR => 1; +use constant PARSE_IN_PARENS => 2; +use constant PARSE_IN_FUNC => 3; +use constant PARSE_RHS => 4; + +my $expr_term_re = qr/ ^ (?: $expr_start_re | \) ) $/x; +my $rhs_term_re = qr/ ^ (?: $expr_term_re | $binary_op_re | (?i: AND | OR | NOT | \, ) ) $/x; +my $func_start_re = qr/^ (?: \* | $placeholder_re | \( ) $/x; my %indents = ( select => 0, @@ -132,11 +155,16 @@ my %indents = ( join => 1, 'left join' => 1, on => 2, + having => 0, 'group by' => 0, 'order by' => 0, set => 1, into => 1, values => 1, + limit => 1, + offset => 1, + skip => 1, + first => 1, ); my %profiles = ( @@ -147,31 +175,52 @@ my %profiles = ( indent_amount => 2, newline => "\n", colormap => {}, - indentmap => { %indents }, + indentmap => \%indents, eval { require Term::ANSIColor } ? do { my $c = \&Term::ANSIColor::color; + + my $red = [$c->('red') , $c->('reset')]; + my $cyan = [$c->('cyan') , $c->('reset')]; + my $green = [$c->('green') , $c->('reset')]; + my $yellow = [$c->('yellow') , $c->('reset')]; + my $blue = [$c->('blue') , $c->('reset')]; + my $magenta = [$c->('magenta'), $c->('reset')]; + my $b_o_w = [$c->('black on_white'), $c->('reset')]; ( - placeholder_surround => [$c->('black on_cyan'), $c->('reset')], + placeholder_surround => [$c->('black on_magenta'), $c->('reset')], colormap => { - select => [$c->('red'), $c->('reset')], - 'insert into' => [$c->('red'), $c->('reset')], - update => [$c->('red'), $c->('reset')], - 'delete from' => [$c->('red'), $c->('reset')], - - set => [$c->('cyan'), $c->('reset')], - from => [$c->('cyan'), $c->('reset')], - - where => [$c->('green'), $c->('reset')], - values => [$c->('yellow'), $c->('reset')], - - join => [$c->('magenta'), $c->('reset')], - 'left join' => [$c->('magenta'), $c->('reset')], - on => [$c->('blue'), $c->('reset')], - - 'group by' => [$c->('yellow'), $c->('reset')], - 'order by' => [$c->('yellow'), $c->('reset')], + 'begin work' => $b_o_w, + commit => $b_o_w, + rollback => $b_o_w, + savepoint => $b_o_w, + 'rollback to savepoint' => $b_o_w, + 'release savepoint' => $b_o_w, + + select => $red, + 'insert into' => $red, + update => $red, + 'delete from' => $red, + + set => $cyan, + from => $cyan, + + where => $green, + values => $yellow, + + join => $magenta, + 'left join' => $magenta, + on => $blue, + + 'group by' => $yellow, + having => $yellow, + 'order by' => $yellow, + + skip => $green, + first => $green, + limit => $green, + offset => $green, } ); } : (), @@ -183,7 +232,7 @@ my %profiles = ( indent_amount => 2, newline => "\n", colormap => {}, - indentmap => { %indents }, + indentmap => \%indents, }, html => { fill_in_placeholders => 1, @@ -196,17 +245,34 @@ my %profiles = ( 'insert into' => ['' , ''], update => ['' , ''], 'delete from' => ['' , ''], - where => ['' , ''], + + set => ['', ''], from => ['' , ''], + + where => ['' , ''], + values => ['', ''], + join => ['' , ''], + 'left join' => ['',''], on => ['' , ''], + 'group by' => ['', ''], + having => ['', ''], 'order by' => ['', ''], - set => ['', ''], - into => ['', ''], - values => ['', ''], + + skip => ['', ''], + first => ['', ''], + limit => ['', ''], + offset => ['', ''], + + 'begin work' => ['', ''], + commit => ['', ''], + rollback => ['', ''], + savepoint => ['', ''], + 'rollback to savepoint' => ['', ''], + 'release savepoint' => ['', ''], }, - indentmap => { %indents }, + indentmap => \%indents, }, none => { colormap => {}, @@ -219,6 +285,9 @@ sub new { my $args = shift || {}; my $profile = delete $args->{profile} || 'none'; + + die "No such profile '$profile'!" unless exists $profiles{$profile}; + my $data = $merger->merge( $profiles{$profile}, $args ); bless $data, $class @@ -230,13 +299,21 @@ sub parse { # tokenize string, and remove all optional whitespace my $tokens = []; foreach my $token (split $tokenizer_re, $s) { - push @$tokens, $token if (length $token) && ($token =~ /\S/); + push @$tokens, $token if ( + defined $token + and + length $token + and + $token =~ /\S/ + ); } - - my $tree = $self->_recurse_parse($tokens, PARSE_TOP_LEVEL); - return $tree; + $self->_recurse_parse($tokens, PARSE_TOP_LEVEL); } +{ +# this is temporary, lists can be parsed *without* recursing, but +# it requires a massive rewrite of the AST generator +no warnings qw/recursion/; sub _recurse_parse { my ($self, $tokens, $state) = @_; @@ -248,11 +325,11 @@ sub _recurse_parse { or ($state == PARSE_IN_PARENS && $lookahead eq ')') or - ($state == PARSE_IN_EXPR && $lookahead =~ qr/ ^ (?: $exp_start_re | \) ) $ /x ) + ($state == PARSE_IN_EXPR && $lookahead =~ $expr_term_re ) or - ($state == PARSE_RHS && $lookahead =~ qr/ ^ (?: $exp_start_re | $binary_op_re | (?i: AND | OR | NOT ) | \) ) $ /x ) + ($state == PARSE_RHS && $lookahead =~ $rhs_term_re ) or - ($state == PARSE_IN_FUNC && $lookahead ne '(') + ($state == PARSE_IN_FUNC && $lookahead !~ $func_start_re) # if there are multiple values - the parenthesis will switch the $state ) { return $left||(); } @@ -268,17 +345,18 @@ sub _recurse_parse { $left = $left ? [$left, [PAREN => [$right||()] ]] : [PAREN => [$right||()] ]; } - # AND/OR - elsif ($token =~ /^ (?: OR | AND ) $/xi ) { - my $op = uc $token; - my $right = $self->_recurse_parse($tokens, PARSE_IN_EXPR); + # AND/OR and LIST (,) + elsif ($token =~ /^ (?: OR | AND | \, ) $/xi ) { + my $op = ($token eq ',') ? 'LIST' : uc $token; + + my $right = $self->_recurse_parse($tokens, PARSE_IN_EXPR) || []; # Merge chunks if logic matches - if (ref $right and $op eq $right->[0]) { - $left = [ (shift @$right ), [$left, map { @$_ } @$right] ]; + if (ref $right and @$right and $op eq $right->[0]) { + $left = [ (shift @$right ), [$left||[], map { @$_ } @$right] ]; } else { - $left = [$op => [$left, $right]]; + $left = [$op => [ $left||[], $right ]]; } } # binary operator keywords @@ -296,35 +374,43 @@ sub _recurse_parse { $left = [$op => [$left, $right] ]; } # expression terminator keywords (as they start a new expression) - elsif ( $token =~ / ^ $exp_start_re $ /x ) { + elsif ( $token =~ / ^ $expr_start_re $ /x ) { my $op = uc $token; my $right = $self->_recurse_parse($tokens, PARSE_IN_EXPR); - $left = $left ? [ $left, [$op => [$right] ]] - : [ $op => [$right] ]; + $left = $left ? [ $left, [$op => [$right||()] ]] + : [ $op => [$right||()] ]; } # NOT elsif ( $token =~ /^ NOT $/ix ) { my $op = uc $token; my $right = $self->_recurse_parse ($tokens, PARSE_RHS); - $left = $left ? [ @$left, [$op => [$right] ]] - : [ $op => [$right] ]; + $left = $left ? [ @$left, [$op => [$right||()] ]] + : [ $op => [$right||()] ]; } - # generic function - elsif (@$tokens && $tokens->[0] eq '(') { - my $right = $self->_recurse_parse($tokens, PARSE_IN_FUNC); - - $left = $left ? [ $left, [ $token => [$right||()] ]] - : [ $token => [$right||()] ]; + elsif ( $token =~ $placeholder_re) { + $left = $left ? [ $left, [ PLACEHOLDER => [ $token ] ] ] + : [ PLACEHOLDER => [ $token ] ]; } - # literal (eat everything on the right until RHS termination) + # we're now in "unknown token" land - start eating tokens until + # we see something familiar else { - my $right = $self->_recurse_parse ($tokens, PARSE_RHS); - $left = $left ? [ $left, [LITERAL => [join ' ', $token, $self->unparse($right)||()] ] ] - : [ LITERAL => [join ' ', $token, $self->unparse($right)||()] ]; + my $right; + + # check if the current token is an unknown op-start + if (@$tokens and $tokens->[0] =~ $func_start_re) { + $right = [ $token => [ $self->_recurse_parse($tokens, PARSE_IN_FUNC) || () ] ]; + } + else { + $right = [ LITERAL => [ $token ] ]; + } + + $left = $left ? [ $left, $right ] + : $right; } } } +} sub format_keyword { my ($self, $keyword) = @_; @@ -351,7 +437,7 @@ sub pad_keyword { $before = $self->newline . $self->indent($depth + $self->indentmap->{lc $keyword}); } $before = '' if $depth == 0 and defined $starters{lc $keyword}; - return [$before, ' ']; + return [$before, '']; } sub indent { ($_[0]->indent_string||'') x ( ( $_[0]->indent_amount || 0 ) * $_[1] ) } @@ -367,24 +453,33 @@ sub fill_in_placeholder { my ($self, $bindargs) = @_; if ($self->fill_in_placeholders) { - my $val = pop @{$bindargs} || ''; + my $val = shift @{$bindargs} || ''; + my $quoted = $val =~ s/^(['"])(.*)\1$/$2/; my ($left, $right) = @{$self->placeholder_surround}; $val =~ s/\\/\\\\/g; $val =~ s/'/\\'/g; - return qq('$left$val$right') + $val = qq('$val') if $quoted; + return qq($left$val$right) } return '?' } +# FIXME - terrible name for a user facing API sub unparse { - my ($self, $tree, $bindargs, $depth) = @_; + my ($self, $tree, $bindargs) = @_; + $self->_unparse($tree, [@{$bindargs||[]}], 0); +} - $depth ||= 0; +sub _unparse { + my ($self, $tree, $bindargs, $depth) = @_; if (not $tree or not @$tree) { return ''; } + # FIXME - needs a config switch to disable + $self->_parenthesis_unroll($tree); + my ($car, $cdr) = @{$tree}[0,1]; if (! defined $car or (! ref $car and ! defined $cdr) ) { @@ -395,35 +490,189 @@ sub unparse { } if (ref $car) { - return join ('', map $self->unparse($_, $bindargs, $depth), @$tree); + return join (' ', map $self->_unparse($_, $bindargs, $depth), @$tree); } elsif ($car eq 'LITERAL') { - if ($cdr->[0] eq '?') { - return $self->fill_in_placeholder($bindargs) - } return $cdr->[0]; } + elsif ($car eq 'PLACEHOLDER') { + return $self->fill_in_placeholder($bindargs); + } elsif ($car eq 'PAREN') { - return '(' . - join(' ', - map $self->unparse($_, $bindargs, $depth + 2), @{$cdr}) . - ($self->_is_key($cdr)?( $self->newline||'' ).$self->indent($depth + 1):'') . ') '; + return sprintf ('( %s )', + join (' ', map { $self->_unparse($_, $bindargs, $depth + 2) } @{$cdr} ) + . + ($self->_is_key($cdr) + ? ( $self->newline||'' ) . $self->indent($depth + 1) + : '' + ) + ); } elsif ($car eq 'AND' or $car eq 'OR' or $car =~ / ^ $binary_op_re $ /x ) { - return join (" $car ", map $self->unparse($_, $bindargs, $depth), @{$cdr}); + return join (" $car ", map $self->_unparse($_, $bindargs, $depth), @{$cdr}); + } + elsif ($car eq 'LIST' ) { + return join (', ', map $self->_unparse($_, $bindargs, $depth), @{$cdr}); } else { my ($l, $r) = @{$self->pad_keyword($car, $depth)}; - return sprintf "$l%s %s$r", $self->format_keyword($car), $self->unparse($cdr, $bindargs, $depth); + + return sprintf "$l%s%s%s$r", + $self->format_keyword($car), + ( ref $cdr eq 'ARRAY' and ref $cdr->[0] eq 'ARRAY' and $cdr->[0][0] and $cdr->[0][0] eq 'PAREN' ) + ? '' # mysql-- + : ' ' + , + $self->_unparse($cdr, $bindargs, $depth), + ; } } +# All of these keywords allow their parameters to be specified with or without parenthesis without changing the semantics +my @unrollable_ops = ( + 'ON', + 'WHERE', + 'GROUP \s+ BY', + 'HAVING', + 'ORDER \s+ BY', + 'I?LIKE', +); +my $unrollable_ops_re = join ' | ', @unrollable_ops; +$unrollable_ops_re = qr/$unrollable_ops_re/xi; + +sub _parenthesis_unroll { + my $self = shift; + my $ast = shift; + + return unless (ref $ast and ref $ast->[1]); + + my $changes; + do { + my @children; + $changes = 0; + + for my $child (@{$ast->[1]}) { + + # the current node in this loop is *always* a PAREN + if (! ref $child or ! @$child or $child->[0] ne 'PAREN') { + push @children, $child; + next; + } + + # unroll nested parenthesis + while ( @{$child->[1]} && $child->[1][0][0] eq 'PAREN') { + $child = $child->[1][0]; + $changes++; + } + + # if the parenthesis are wrapped around an AND/OR matching the parent AND/OR - open the parenthesis up and merge the list + if ( + ( $ast->[0] eq 'AND' or $ast->[0] eq 'OR') + and + $child->[1][0][0] eq $ast->[0] + ) { + push @children, @{$child->[1][0][1]}; + $changes++; + } + + # if the parent operator explcitly allows it nuke the parenthesis + elsif ( $ast->[0] =~ $unrollable_ops_re ) { + push @children, $child->[1][0]; + $changes++; + } + + # only *ONE* LITERAL or placeholder element + # as an AND/OR/NOT argument + elsif ( + @{$child->[1]} == 1 && ( + $child->[1][0][0] eq 'LITERAL' + or + $child->[1][0][0] eq 'PLACEHOLDER' + ) && ( + $ast->[0] eq 'AND' or $ast->[0] eq 'OR' or $ast->[0] eq 'NOT' + ) + ) { + push @children, $child->[1][0]; + $changes++; + } + + # an AND/OR expression with only one binop in the parenthesis + # with exactly two grandchildren + # the only time when we can *not* unroll this is when both + # the parent and the child are mathops (in which case we'll + # break precedence) or when the child is BETWEEN (special + # case) + elsif ( + @{$child->[1]} == 1 + and + ($ast->[0] eq 'AND' or $ast->[0] eq 'OR') + and + $child->[1][0][0] =~ SQL::Abstract::Tree::_binary_op_re() + and + $child->[1][0][0] ne 'BETWEEN' + and + @{$child->[1][0][1]} == 2 + and + ! ( + $child->[1][0][0] =~ SQL::Abstract::Tree::_math_op_re() + and + $ast->[0] =~ SQL::Abstract::Tree::_math_op_re() + ) + ) { + push @children, $child->[1][0]; + $changes++; + } + + # a function binds tighter than a mathop - see if our ancestor is a + # mathop, and our content is: + # a single non-mathop child with a single PAREN grandchild which + # would indicate mathop ( nonmathop ( ... ) ) + # or a single non-mathop with a single LITERAL ( nonmathop foo ) + # or a single non-mathop with a single PLACEHOLDER ( nonmathop ? ) + elsif ( + @{$child->[1]} == 1 + and + @{$child->[1][0][1]} == 1 + and + $ast->[0] =~ SQL::Abstract::Tree::_math_op_re() + and + $child->[1][0][0] !~ SQL::Abstract::Tree::_math_op_re + and + ( + $child->[1][0][1][0][0] eq 'PAREN' + or + $child->[1][0][1][0][0] eq 'LITERAL' + or + $child->[1][0][1][0][0] eq 'PLACEHOLDER' + ) + ) { + push @children, $child->[1][0]; + $changes++; + } + + + # otherwise no more mucking for this pass + else { + push @children, $child; + } + } + + $ast->[1] = \@children; + + } while ($changes); + +} + sub format { my $self = shift; $self->unparse($self->parse($_[0]), $_[1]) } 1; =pod +=head1 NAME + +SQL::Abstract::Tree - Represent SQL as an AST + =head1 SYNOPSIS my $sqla_tree = SQL::Abstract::Tree->new({ profile => 'console' });