document hopes and dreams
[scpubgit/Q-Branch.git] / lib / SQL / Abstract / Tree.pm
CommitLineData
01dd4e4f 1package SQL::Abstract::Tree;
2
3use strict;
4use warnings;
5use Carp;
6
7# Parser states for _recurse_parse()
8use constant PARSE_TOP_LEVEL => 0;
9use constant PARSE_IN_EXPR => 1;
10use constant PARSE_IN_PARENS => 2;
11use constant PARSE_RHS => 3;
12
13# These SQL keywords always signal end of the current expression (except inside
14# of a parenthesized subexpression).
15# Format: A list of strings that will be compiled to extended syntax (ie.
16# /.../x) regexes, without capturing parentheses. They will be automatically
17# anchored to word boundaries to match the whole token).
18my @expression_terminator_sql_keywords = (
19 'SELECT',
20 'FROM',
21 '(?:
22 (?:
23 (?: \b (?: LEFT | RIGHT | FULL ) \s+ )?
24 (?: \b (?: CROSS | INNER | OUTER ) \s+ )?
25 )?
26 JOIN
27 )',
28 'ON',
29 'WHERE',
30 'EXISTS',
31 'GROUP \s+ BY',
32 'HAVING',
33 'ORDER \s+ BY',
34 'LIMIT',
35 'OFFSET',
36 'FOR',
37 'UNION',
38 'INTERSECT',
39 'EXCEPT',
40 'RETURNING',
41);
42
43# These are binary operator keywords always a single LHS and RHS
44# * AND/OR are handled separately as they are N-ary
45# * so is NOT as being unary
46# * BETWEEN without paranthesis around the ANDed arguments (which
47# makes it a non-binary op) is detected and accomodated in
48# _recurse_parse()
49my $stuff_around_mathops = qr/[\w\s\`\'\"\)]/;
50my @binary_op_keywords = (
51 ( map
52 {
53 ' ^ ' . quotemeta ($_) . "(?= \$ | $stuff_around_mathops ) ",
54 " (?<= $stuff_around_mathops)" . quotemeta ($_) . "(?= \$ | $stuff_around_mathops ) ",
55 }
56 (qw/< > != <> = <= >=/)
57 ),
58 ( map
59 { '\b (?: NOT \s+)?' . $_ . '\b' }
60 (qw/IN BETWEEN LIKE/)
61 ),
62);
63
64my $tokenizer_re_str = join("\n\t|\n",
65 ( map { '\b' . $_ . '\b' } @expression_terminator_sql_keywords, 'AND', 'OR', 'NOT'),
66 @binary_op_keywords,
67);
68
69my $tokenizer_re = qr/ \s* ( $tokenizer_re_str | \( | \) | \? ) \s* /xi;
70
71sub _binary_op_keywords { @binary_op_keywords }
72
73sub parse {
74 my $s = shift;
75
76 # tokenize string, and remove all optional whitespace
77 my $tokens = [];
78 foreach my $token (split $tokenizer_re, $s) {
79 push @$tokens, $token if (length $token) && ($token =~ /\S/);
80 }
81
82 my $tree = _recurse_parse($tokens, PARSE_TOP_LEVEL);
83 return $tree;
84}
85
86sub _recurse_parse {
87 my ($tokens, $state) = @_;
88
89 my $left;
90 while (1) { # left-associative parsing
91
92 my $lookahead = $tokens->[0];
93 if ( not defined($lookahead)
94 or
95 ($state == PARSE_IN_PARENS && $lookahead eq ')')
96 or
97 ($state == PARSE_IN_EXPR && grep { $lookahead =~ /^ $_ $/xi } ('\)', @expression_terminator_sql_keywords ) )
98 or
99 ($state == PARSE_RHS && grep { $lookahead =~ /^ $_ $/xi } ('\)', @expression_terminator_sql_keywords, @binary_op_keywords, 'AND', 'OR', 'NOT' ) )
100 ) {
101 return $left;
102 }
103
104 my $token = shift @$tokens;
105
106 # nested expression in ()
107 if ($token eq '(' ) {
108 my $right = _recurse_parse($tokens, PARSE_IN_PARENS);
109 $token = shift @$tokens or croak "missing closing ')' around block " . unparse ($right);
110 $token eq ')' or croak "unexpected token '$token' terminating block " . unparse ($right);
111
112 $left = $left ? [@$left, [PAREN => [$right] ]]
113 : [PAREN => [$right] ];
114 }
115 # AND/OR
116 elsif ($token =~ /^ (?: OR | AND ) $/xi ) {
117 my $op = uc $token;
118 my $right = _recurse_parse($tokens, PARSE_IN_EXPR);
119
120 # Merge chunks if logic matches
121 if (ref $right and $op eq $right->[0]) {
122 $left = [ (shift @$right ), [$left, map { @$_ } @$right] ];
123 }
124 else {
125 $left = [$op => [$left, $right]];
126 }
127 }
128 # binary operator keywords
129 elsif (grep { $token =~ /^ $_ $/xi } @binary_op_keywords ) {
130 my $op = uc $token;
131 my $right = _recurse_parse($tokens, PARSE_RHS);
132
133 # A between with a simple LITERAL for a 1st RHS argument needs a
134 # rerun of the search to (hopefully) find the proper AND construct
135 if ($op eq 'BETWEEN' and $right->[0] eq 'LITERAL') {
136 unshift @$tokens, $right->[1][0];
137 $right = _recurse_parse($tokens, PARSE_IN_EXPR);
138 }
139
140 $left = [$op => [$left, $right] ];
141 }
142 # expression terminator keywords (as they start a new expression)
143 elsif (grep { $token =~ /^ $_ $/xi } @expression_terminator_sql_keywords ) {
144 my $op = uc $token;
145 my $right = _recurse_parse($tokens, PARSE_IN_EXPR);
146 $left = $left ? [ $left, [$op => [$right] ]]
147 : [ $op => [$right] ];
148 }
149 # NOT (last as to allow all other NOT X pieces first)
150 elsif ( $token =~ /^ not $/ix ) {
151 my $op = uc $token;
152 my $right = _recurse_parse ($tokens, PARSE_RHS);
153 $left = $left ? [ @$left, [$op => [$right] ]]
154 : [ $op => [$right] ];
155
156 }
157 # literal (eat everything on the right until RHS termination)
158 else {
159 my $right = _recurse_parse ($tokens, PARSE_RHS);
160 $left = $left ? [ $left, [LITERAL => [join ' ', $token, unparse($right)||()] ] ]
161 : [ LITERAL => [join ' ', $token, unparse($right)||()] ];
162 }
163 }
164}
165
166sub unparse {
167 my $tree = shift;
168
169 if (not $tree ) {
170 return '';
171 }
172 elsif (ref $tree->[0]) {
173 return join (" ", map { unparse ($_) } @$tree);
174 }
175 elsif ($tree->[0] eq 'LITERAL') {
176 return $tree->[1][0];
177 }
178 elsif ($tree->[0] eq 'PAREN') {
179 return sprintf '(%s)', join (" ", map {unparse($_)} @{$tree->[1]});
180 }
181 elsif ($tree->[0] eq 'OR' or $tree->[0] eq 'AND' or (grep { $tree->[0] =~ /^ $_ $/xi } @binary_op_keywords ) ) {
182 return join (" $tree->[0] ", map {unparse($_)} @{$tree->[1]});
183 }
184 else {
d49e5323 185 return sprintf "%s %s\n", $tree->[0], unparse ($tree->[1]);
01dd4e4f 186 }
187}
188
189
1901;
191