Commit | Line | Data |
01dd4e4f |
1 | package SQL::Abstract::Tree; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Carp; |
6 | |
7 | # Parser states for _recurse_parse() |
8 | use constant PARSE_TOP_LEVEL => 0; |
9 | use constant PARSE_IN_EXPR => 1; |
10 | use constant PARSE_IN_PARENS => 2; |
11 | use constant PARSE_RHS => 3; |
12 | |
13 | # These SQL keywords always signal end of the current expression (except inside |
14 | # of a parenthesized subexpression). |
15 | # Format: A list of strings that will be compiled to extended syntax (ie. |
16 | # /.../x) regexes, without capturing parentheses. They will be automatically |
17 | # anchored to word boundaries to match the whole token). |
18 | my @expression_terminator_sql_keywords = ( |
19 | 'SELECT', |
20 | 'FROM', |
21 | '(?: |
22 | (?: |
23 | (?: \b (?: LEFT | RIGHT | FULL ) \s+ )? |
24 | (?: \b (?: CROSS | INNER | OUTER ) \s+ )? |
25 | )? |
26 | JOIN |
27 | )', |
28 | 'ON', |
29 | 'WHERE', |
30 | 'EXISTS', |
31 | 'GROUP \s+ BY', |
32 | 'HAVING', |
33 | 'ORDER \s+ BY', |
34 | 'LIMIT', |
35 | 'OFFSET', |
36 | 'FOR', |
37 | 'UNION', |
38 | 'INTERSECT', |
39 | 'EXCEPT', |
40 | 'RETURNING', |
41 | ); |
42 | |
43 | # These are binary operator keywords always a single LHS and RHS |
44 | # * AND/OR are handled separately as they are N-ary |
45 | # * so is NOT as being unary |
46 | # * BETWEEN without paranthesis around the ANDed arguments (which |
47 | # makes it a non-binary op) is detected and accomodated in |
48 | # _recurse_parse() |
49 | my $stuff_around_mathops = qr/[\w\s\`\'\"\)]/; |
50 | my @binary_op_keywords = ( |
51 | ( map |
52 | { |
53 | ' ^ ' . quotemeta ($_) . "(?= \$ | $stuff_around_mathops ) ", |
54 | " (?<= $stuff_around_mathops)" . quotemeta ($_) . "(?= \$ | $stuff_around_mathops ) ", |
55 | } |
56 | (qw/< > != <> = <= >=/) |
57 | ), |
58 | ( map |
59 | { '\b (?: NOT \s+)?' . $_ . '\b' } |
60 | (qw/IN BETWEEN LIKE/) |
61 | ), |
62 | ); |
63 | |
64 | my $tokenizer_re_str = join("\n\t|\n", |
65 | ( map { '\b' . $_ . '\b' } @expression_terminator_sql_keywords, 'AND', 'OR', 'NOT'), |
66 | @binary_op_keywords, |
67 | ); |
68 | |
69 | my $tokenizer_re = qr/ \s* ( $tokenizer_re_str | \( | \) | \? ) \s* /xi; |
70 | |
71 | sub _binary_op_keywords { @binary_op_keywords } |
72 | |
73 | sub parse { |
74 | my $s = shift; |
75 | |
76 | # tokenize string, and remove all optional whitespace |
77 | my $tokens = []; |
78 | foreach my $token (split $tokenizer_re, $s) { |
79 | push @$tokens, $token if (length $token) && ($token =~ /\S/); |
80 | } |
81 | |
82 | my $tree = _recurse_parse($tokens, PARSE_TOP_LEVEL); |
83 | return $tree; |
84 | } |
85 | |
86 | sub _recurse_parse { |
87 | my ($tokens, $state) = @_; |
88 | |
89 | my $left; |
90 | while (1) { # left-associative parsing |
91 | |
92 | my $lookahead = $tokens->[0]; |
93 | if ( not defined($lookahead) |
94 | or |
95 | ($state == PARSE_IN_PARENS && $lookahead eq ')') |
96 | or |
97 | ($state == PARSE_IN_EXPR && grep { $lookahead =~ /^ $_ $/xi } ('\)', @expression_terminator_sql_keywords ) ) |
98 | or |
99 | ($state == PARSE_RHS && grep { $lookahead =~ /^ $_ $/xi } ('\)', @expression_terminator_sql_keywords, @binary_op_keywords, 'AND', 'OR', 'NOT' ) ) |
100 | ) { |
101 | return $left; |
102 | } |
103 | |
104 | my $token = shift @$tokens; |
105 | |
106 | # nested expression in () |
107 | if ($token eq '(' ) { |
108 | my $right = _recurse_parse($tokens, PARSE_IN_PARENS); |
109 | $token = shift @$tokens or croak "missing closing ')' around block " . unparse ($right); |
110 | $token eq ')' or croak "unexpected token '$token' terminating block " . unparse ($right); |
111 | |
112 | $left = $left ? [@$left, [PAREN => [$right] ]] |
113 | : [PAREN => [$right] ]; |
114 | } |
115 | # AND/OR |
116 | elsif ($token =~ /^ (?: OR | AND ) $/xi ) { |
117 | my $op = uc $token; |
118 | my $right = _recurse_parse($tokens, PARSE_IN_EXPR); |
119 | |
120 | # Merge chunks if logic matches |
121 | if (ref $right and $op eq $right->[0]) { |
122 | $left = [ (shift @$right ), [$left, map { @$_ } @$right] ]; |
123 | } |
124 | else { |
125 | $left = [$op => [$left, $right]]; |
126 | } |
127 | } |
128 | # binary operator keywords |
129 | elsif (grep { $token =~ /^ $_ $/xi } @binary_op_keywords ) { |
130 | my $op = uc $token; |
131 | my $right = _recurse_parse($tokens, PARSE_RHS); |
132 | |
133 | # A between with a simple LITERAL for a 1st RHS argument needs a |
134 | # rerun of the search to (hopefully) find the proper AND construct |
135 | if ($op eq 'BETWEEN' and $right->[0] eq 'LITERAL') { |
136 | unshift @$tokens, $right->[1][0]; |
137 | $right = _recurse_parse($tokens, PARSE_IN_EXPR); |
138 | } |
139 | |
140 | $left = [$op => [$left, $right] ]; |
141 | } |
142 | # expression terminator keywords (as they start a new expression) |
143 | elsif (grep { $token =~ /^ $_ $/xi } @expression_terminator_sql_keywords ) { |
144 | my $op = uc $token; |
145 | my $right = _recurse_parse($tokens, PARSE_IN_EXPR); |
146 | $left = $left ? [ $left, [$op => [$right] ]] |
147 | : [ $op => [$right] ]; |
148 | } |
149 | # NOT (last as to allow all other NOT X pieces first) |
150 | elsif ( $token =~ /^ not $/ix ) { |
151 | my $op = uc $token; |
152 | my $right = _recurse_parse ($tokens, PARSE_RHS); |
153 | $left = $left ? [ @$left, [$op => [$right] ]] |
154 | : [ $op => [$right] ]; |
155 | |
156 | } |
157 | # literal (eat everything on the right until RHS termination) |
158 | else { |
159 | my $right = _recurse_parse ($tokens, PARSE_RHS); |
160 | $left = $left ? [ $left, [LITERAL => [join ' ', $token, unparse($right)||()] ] ] |
161 | : [ LITERAL => [join ' ', $token, unparse($right)||()] ]; |
162 | } |
163 | } |
164 | } |
165 | |
166 | sub unparse { |
167 | my $tree = shift; |
168 | |
169 | if (not $tree ) { |
170 | return ''; |
171 | } |
172 | elsif (ref $tree->[0]) { |
173 | return join (" ", map { unparse ($_) } @$tree); |
174 | } |
175 | elsif ($tree->[0] eq 'LITERAL') { |
176 | return $tree->[1][0]; |
177 | } |
178 | elsif ($tree->[0] eq 'PAREN') { |
179 | return sprintf '(%s)', join (" ", map {unparse($_)} @{$tree->[1]}); |
180 | } |
181 | elsif ($tree->[0] eq 'OR' or $tree->[0] eq 'AND' or (grep { $tree->[0] =~ /^ $_ $/xi } @binary_op_keywords ) ) { |
182 | return join (" $tree->[0] ", map {unparse($_)} @{$tree->[1]}); |
183 | } |
184 | else { |
d49e5323 |
185 | return sprintf "%s %s\n", $tree->[0], unparse ($tree->[1]); |
01dd4e4f |
186 | } |
187 | } |
188 | |
189 | |
190 | 1; |
191 | |