Commit | Line | Data |
01dd4e4f |
1 | package SQL::Abstract::Tree; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Carp; |
6 | |
7 | # Parser states for _recurse_parse() |
8 | use constant PARSE_TOP_LEVEL => 0; |
9 | use constant PARSE_IN_EXPR => 1; |
10 | use constant PARSE_IN_PARENS => 2; |
11 | use constant PARSE_RHS => 3; |
12 | |
13 | # These SQL keywords always signal end of the current expression (except inside |
14 | # of a parenthesized subexpression). |
15 | # Format: A list of strings that will be compiled to extended syntax (ie. |
16 | # /.../x) regexes, without capturing parentheses. They will be automatically |
17 | # anchored to word boundaries to match the whole token). |
18 | my @expression_terminator_sql_keywords = ( |
19 | 'SELECT', |
20 | 'FROM', |
21 | '(?: |
22 | (?: |
23 | (?: \b (?: LEFT | RIGHT | FULL ) \s+ )? |
24 | (?: \b (?: CROSS | INNER | OUTER ) \s+ )? |
25 | )? |
26 | JOIN |
27 | )', |
28 | 'ON', |
29 | 'WHERE', |
30 | 'EXISTS', |
31 | 'GROUP \s+ BY', |
32 | 'HAVING', |
33 | 'ORDER \s+ BY', |
34 | 'LIMIT', |
35 | 'OFFSET', |
36 | 'FOR', |
37 | 'UNION', |
38 | 'INTERSECT', |
39 | 'EXCEPT', |
40 | 'RETURNING', |
41 | ); |
42 | |
43 | # These are binary operator keywords always a single LHS and RHS |
44 | # * AND/OR are handled separately as they are N-ary |
45 | # * so is NOT as being unary |
46 | # * BETWEEN without paranthesis around the ANDed arguments (which |
47 | # makes it a non-binary op) is detected and accomodated in |
48 | # _recurse_parse() |
49 | my $stuff_around_mathops = qr/[\w\s\`\'\"\)]/; |
50 | my @binary_op_keywords = ( |
51 | ( map |
52 | { |
53 | ' ^ ' . quotemeta ($_) . "(?= \$ | $stuff_around_mathops ) ", |
54 | " (?<= $stuff_around_mathops)" . quotemeta ($_) . "(?= \$ | $stuff_around_mathops ) ", |
55 | } |
56 | (qw/< > != <> = <= >=/) |
57 | ), |
58 | ( map |
59 | { '\b (?: NOT \s+)?' . $_ . '\b' } |
60 | (qw/IN BETWEEN LIKE/) |
61 | ), |
62 | ); |
63 | |
64 | my $tokenizer_re_str = join("\n\t|\n", |
65 | ( map { '\b' . $_ . '\b' } @expression_terminator_sql_keywords, 'AND', 'OR', 'NOT'), |
66 | @binary_op_keywords, |
67 | ); |
68 | |
69 | my $tokenizer_re = qr/ \s* ( $tokenizer_re_str | \( | \) | \? ) \s* /xi; |
70 | |
71 | sub _binary_op_keywords { @binary_op_keywords } |
72 | |
d695b0ad |
73 | sub new { bless sub {}, shift } |
74 | |
01dd4e4f |
75 | sub parse { |
d695b0ad |
76 | my ($self, $s) = @_; |
01dd4e4f |
77 | |
78 | # tokenize string, and remove all optional whitespace |
79 | my $tokens = []; |
80 | foreach my $token (split $tokenizer_re, $s) { |
81 | push @$tokens, $token if (length $token) && ($token =~ /\S/); |
82 | } |
83 | |
d695b0ad |
84 | my $tree = $self->_recurse_parse($tokens, PARSE_TOP_LEVEL); |
01dd4e4f |
85 | return $tree; |
86 | } |
87 | |
88 | sub _recurse_parse { |
d695b0ad |
89 | my ($self, $tokens, $state) = @_; |
01dd4e4f |
90 | |
91 | my $left; |
92 | while (1) { # left-associative parsing |
93 | |
94 | my $lookahead = $tokens->[0]; |
95 | if ( not defined($lookahead) |
96 | or |
97 | ($state == PARSE_IN_PARENS && $lookahead eq ')') |
98 | or |
99 | ($state == PARSE_IN_EXPR && grep { $lookahead =~ /^ $_ $/xi } ('\)', @expression_terminator_sql_keywords ) ) |
100 | or |
101 | ($state == PARSE_RHS && grep { $lookahead =~ /^ $_ $/xi } ('\)', @expression_terminator_sql_keywords, @binary_op_keywords, 'AND', 'OR', 'NOT' ) ) |
102 | ) { |
103 | return $left; |
104 | } |
105 | |
106 | my $token = shift @$tokens; |
107 | |
108 | # nested expression in () |
109 | if ($token eq '(' ) { |
d695b0ad |
110 | my $right = $self->_recurse_parse($tokens, PARSE_IN_PARENS); |
111 | $token = shift @$tokens or croak "missing closing ')' around block " . $self->unparse($right); |
112 | $token eq ')' or croak "unexpected token '$token' terminating block " . $self->unparse($right); |
01dd4e4f |
113 | |
114 | $left = $left ? [@$left, [PAREN => [$right] ]] |
115 | : [PAREN => [$right] ]; |
116 | } |
117 | # AND/OR |
118 | elsif ($token =~ /^ (?: OR | AND ) $/xi ) { |
119 | my $op = uc $token; |
d695b0ad |
120 | my $right = $self->_recurse_parse($tokens, PARSE_IN_EXPR); |
01dd4e4f |
121 | |
122 | # Merge chunks if logic matches |
123 | if (ref $right and $op eq $right->[0]) { |
124 | $left = [ (shift @$right ), [$left, map { @$_ } @$right] ]; |
125 | } |
126 | else { |
127 | $left = [$op => [$left, $right]]; |
128 | } |
129 | } |
130 | # binary operator keywords |
131 | elsif (grep { $token =~ /^ $_ $/xi } @binary_op_keywords ) { |
132 | my $op = uc $token; |
d695b0ad |
133 | my $right = $self->_recurse_parse($tokens, PARSE_RHS); |
01dd4e4f |
134 | |
135 | # A between with a simple LITERAL for a 1st RHS argument needs a |
136 | # rerun of the search to (hopefully) find the proper AND construct |
137 | if ($op eq 'BETWEEN' and $right->[0] eq 'LITERAL') { |
138 | unshift @$tokens, $right->[1][0]; |
d695b0ad |
139 | $right = $self->_recurse_parse($tokens, PARSE_IN_EXPR); |
01dd4e4f |
140 | } |
141 | |
142 | $left = [$op => [$left, $right] ]; |
143 | } |
144 | # expression terminator keywords (as they start a new expression) |
145 | elsif (grep { $token =~ /^ $_ $/xi } @expression_terminator_sql_keywords ) { |
146 | my $op = uc $token; |
d695b0ad |
147 | my $right = $self->_recurse_parse($tokens, PARSE_IN_EXPR); |
01dd4e4f |
148 | $left = $left ? [ $left, [$op => [$right] ]] |
149 | : [ $op => [$right] ]; |
150 | } |
151 | # NOT (last as to allow all other NOT X pieces first) |
152 | elsif ( $token =~ /^ not $/ix ) { |
153 | my $op = uc $token; |
d695b0ad |
154 | my $right = $self->_recurse_parse ($tokens, PARSE_RHS); |
01dd4e4f |
155 | $left = $left ? [ @$left, [$op => [$right] ]] |
156 | : [ $op => [$right] ]; |
157 | |
158 | } |
159 | # literal (eat everything on the right until RHS termination) |
160 | else { |
d695b0ad |
161 | my $right = $self->_recurse_parse ($tokens, PARSE_RHS); |
162 | $left = $left ? [ $left, [LITERAL => [join ' ', $token, $self->unparse($right)||()] ] ] |
163 | : [ LITERAL => [join ' ', $token, $self->unparse($right)||()] ]; |
01dd4e4f |
164 | } |
165 | } |
166 | } |
167 | |
d695b0ad |
168 | use Term::ANSIColor 'color'; |
169 | |
170 | my %ghetto_colormap = ( |
171 | select => [color('red'), color('reset')], |
172 | where => [color('green'), color('reset')], |
173 | from => [color('cyan'), color('reset')], |
174 | ); |
175 | |
176 | sub format_keyword { |
177 | my ($self, $keyword) = @_; |
178 | |
179 | if (my $around = $ghetto_colormap{lc $keyword}) { |
180 | $keyword = "$around->[0]$keyword$around->[1]"; |
181 | } |
182 | |
183 | return $keyword |
184 | } |
185 | |
e171c446 |
186 | |
187 | my %ghetto_whitespacemap = ( |
188 | select => 0, |
189 | where => 1, |
190 | from => 1, |
191 | ); |
192 | |
a24cc3a0 |
193 | sub whitespace { |
194 | my ($self, $keyword, $depth) = @_; |
e171c446 |
195 | |
196 | my $before = ''; |
197 | my $after = ''; |
198 | if (defined $ghetto_whitespacemap{lc $keyword}) { |
199 | $before = $self->newline . $self->indent($depth + $ghetto_whitespacemap{lc $keyword}); |
a24cc3a0 |
200 | } |
e171c446 |
201 | return [$before, $after]; |
a24cc3a0 |
202 | } |
203 | |
204 | sub newline { "\n" } |
205 | |
206 | sub indent { ' ' x $_[1] } |
207 | |
0569a14f |
208 | sub _is_select { |
209 | my $tree = shift; |
210 | $tree = $tree->[0] while ref $tree; |
211 | |
212 | lc $tree eq 'select'; |
213 | } |
214 | |
01dd4e4f |
215 | sub unparse { |
a24cc3a0 |
216 | my ($self, $tree, $depth) = @_; |
217 | |
e171c446 |
218 | $depth ||= 0; |
01dd4e4f |
219 | |
220 | if (not $tree ) { |
221 | return ''; |
222 | } |
a24cc3a0 |
223 | |
224 | my $car = $tree->[0]; |
225 | my $cdr = $tree->[1]; |
226 | |
227 | if (ref $car) { |
e171c446 |
228 | return join ('', map $self->unparse($_, $depth), @$tree); |
01dd4e4f |
229 | } |
a24cc3a0 |
230 | elsif ($car eq 'LITERAL') { |
231 | return $cdr->[0]; |
01dd4e4f |
232 | } |
a24cc3a0 |
233 | elsif ($car eq 'PAREN') { |
e171c446 |
234 | return '(' . |
a24cc3a0 |
235 | join(' ', |
0569a14f |
236 | map $self->unparse($_, $depth + 2), @{$cdr}) . |
237 | (_is_select($cdr)?$self->newline.$self->indent($depth + 1):'') . ')'; |
01dd4e4f |
238 | } |
a24cc3a0 |
239 | elsif ($car eq 'OR' or $car eq 'AND' or (grep { $car =~ /^ $_ $/xi } @binary_op_keywords ) ) { |
e171c446 |
240 | return join (" $car ", map $self->unparse($_, $depth), @{$cdr}); |
01dd4e4f |
241 | } |
242 | else { |
a24cc3a0 |
243 | my ($l, $r) = @{$self->whitespace($car, $depth)}; |
e171c446 |
244 | return sprintf "$l%s %s$r", $self->format_keyword($car), $self->unparse($cdr, $depth); |
01dd4e4f |
245 | } |
246 | } |
247 | |
d695b0ad |
248 | sub format { my $self = shift; $self->unparse($self->parse(@_)) } |
01dd4e4f |
249 | |
250 | 1; |
251 | |