Commit | Line | Data |
3fea05b9 |
1 | package TAP::Parser::Grammar; |
2 | |
3 | use strict; |
4 | use vars qw($VERSION @ISA); |
5 | |
6 | use TAP::Object (); |
7 | use TAP::Parser::ResultFactory (); |
8 | use TAP::Parser::YAMLish::Reader (); |
9 | |
10 | @ISA = qw(TAP::Object); |
11 | |
12 | =head1 NAME |
13 | |
14 | TAP::Parser::Grammar - A grammar for the Test Anything Protocol. |
15 | |
16 | =head1 VERSION |
17 | |
18 | Version 3.17 |
19 | |
20 | =cut |
21 | |
22 | $VERSION = '3.17'; |
23 | |
24 | =head1 SYNOPSIS |
25 | |
26 | use TAP::Parser::Grammar; |
27 | my $grammar = $self->make_grammar({ |
28 | stream => $tap_parser_stream, |
29 | parser => $tap_parser, |
30 | version => 12, |
31 | }); |
32 | |
33 | my $result = $grammar->tokenize; |
34 | |
35 | =head1 DESCRIPTION |
36 | |
37 | C<TAP::Parser::Grammar> tokenizes lines from a TAP stream and constructs |
38 | L<TAP::Parser::Result> subclasses to represent the tokens. |
39 | |
40 | Do not attempt to use this class directly. It won't make sense. It's mainly |
41 | here to ensure that we will be able to have pluggable grammars when TAP is |
42 | expanded at some future date (plus, this stuff was really cluttering the |
43 | parser). |
44 | |
45 | =head1 METHODS |
46 | |
47 | =head2 Class Methods |
48 | |
49 | =head3 C<new> |
50 | |
51 | my $grammar = TAP::Parser::Grammar->new({ |
52 | stream => $stream, |
53 | parser => $parser, |
54 | version => $version, |
55 | }); |
56 | |
57 | Returns L<TAP::Parser> grammar object that will parse the specified stream. |
58 | Both C<stream> and C<parser> are required arguments. If C<version> is not set |
59 | it defaults to C<12> (see L</set_version> for more details). |
60 | |
61 | =cut |
62 | |
63 | # new() implementation supplied by TAP::Object |
64 | sub _initialize { |
65 | my ( $self, $args ) = @_; |
66 | $self->{stream} = $args->{stream}; # TODO: accessor |
67 | $self->{parser} = $args->{parser}; # TODO: accessor |
68 | $self->set_version( $args->{version} || 12 ); |
69 | return $self; |
70 | } |
71 | |
72 | my %language_for; |
73 | |
74 | { |
75 | |
76 | # XXX the 'not' and 'ok' might be on separate lines in VMS ... |
77 | my $ok = qr/(?:not )?ok\b/; |
78 | my $num = qr/\d+/; |
79 | |
80 | my %v12 = ( |
81 | version => { |
82 | syntax => qr/^TAP\s+version\s+(\d+)\s*\z/i, |
83 | handler => sub { |
84 | my ( $self, $line ) = @_; |
85 | my $version = $1; |
86 | return $self->_make_version_token( $line, $version, ); |
87 | }, |
88 | }, |
89 | plan => { |
90 | syntax => qr/^1\.\.(\d+)\s*(.*)\z/, |
91 | handler => sub { |
92 | my ( $self, $line ) = @_; |
93 | my ( $tests_planned, $tail ) = ( $1, $2 ); |
94 | my $explanation = undef; |
95 | my $skip = ''; |
96 | |
97 | if ( $tail =~ /^todo((?:\s+\d+)+)/ ) { |
98 | my @todo = split /\s+/, _trim($1); |
99 | return $self->_make_plan_token( |
100 | $line, $tests_planned, 'TODO', |
101 | '', \@todo |
102 | ); |
103 | } |
104 | elsif ( 0 == $tests_planned ) { |
105 | $skip = 'SKIP'; |
106 | |
107 | # If we can't match # SKIP the directive should be undef. |
108 | ($explanation) = $tail =~ /^#\s*SKIP\S*\s+(.*)/i; |
109 | } |
110 | elsif ( $tail !~ /^\s*$/ ) { |
111 | return $self->_make_unknown_token($line); |
112 | } |
113 | |
114 | $explanation = '' unless defined $explanation; |
115 | |
116 | return $self->_make_plan_token( |
117 | $line, $tests_planned, $skip, |
118 | $explanation, [] |
119 | ); |
120 | |
121 | }, |
122 | }, |
123 | |
124 | # An optimization to handle the most common test lines without |
125 | # directives. |
126 | simple_test => { |
127 | syntax => qr/^($ok) \ ($num) (?:\ ([^#]+))? \z/x, |
128 | handler => sub { |
129 | my ( $self, $line ) = @_; |
130 | my ( $ok, $num, $desc ) = ( $1, $2, $3 ); |
131 | |
132 | return $self->_make_test_token( |
133 | $line, $ok, $num, |
134 | $desc |
135 | ); |
136 | }, |
137 | }, |
138 | test => { |
139 | syntax => qr/^($ok) \s* ($num)? \s* (.*) \z/x, |
140 | handler => sub { |
141 | my ( $self, $line ) = @_; |
142 | my ( $ok, $num, $desc ) = ( $1, $2, $3 ); |
143 | my ( $dir, $explanation ) = ( '', '' ); |
144 | if ($desc =~ m/^ ( [^\\\#]* (?: \\. [^\\\#]* )* ) |
145 | \# \s* (SKIP|TODO) \b \s* (.*) $/ix |
146 | ) |
147 | { |
148 | ( $desc, $dir, $explanation ) = ( $1, $2, $3 ); |
149 | } |
150 | return $self->_make_test_token( |
151 | $line, $ok, $num, $desc, |
152 | $dir, $explanation |
153 | ); |
154 | }, |
155 | }, |
156 | comment => { |
157 | syntax => qr/^#(.*)/, |
158 | handler => sub { |
159 | my ( $self, $line ) = @_; |
160 | my $comment = $1; |
161 | return $self->_make_comment_token( $line, $comment ); |
162 | }, |
163 | }, |
164 | bailout => { |
165 | syntax => qr/^Bail out!\s*(.*)/, |
166 | handler => sub { |
167 | my ( $self, $line ) = @_; |
168 | my $explanation = $1; |
169 | return $self->_make_bailout_token( |
170 | $line, |
171 | $explanation |
172 | ); |
173 | }, |
174 | }, |
175 | ); |
176 | |
177 | my %v13 = ( |
178 | %v12, |
179 | plan => { |
180 | syntax => qr/^1\.\.(\d+)(?:\s*#\s*SKIP\b(.*))?\z/i, |
181 | handler => sub { |
182 | my ( $self, $line ) = @_; |
183 | my ( $tests_planned, $explanation ) = ( $1, $2 ); |
184 | my $skip |
185 | = ( 0 == $tests_planned || defined $explanation ) |
186 | ? 'SKIP' |
187 | : ''; |
188 | $explanation = '' unless defined $explanation; |
189 | return $self->_make_plan_token( |
190 | $line, $tests_planned, $skip, |
191 | $explanation, [] |
192 | ); |
193 | }, |
194 | }, |
195 | yaml => { |
196 | syntax => qr/^ (\s+) (---.*) $/x, |
197 | handler => sub { |
198 | my ( $self, $line ) = @_; |
199 | my ( $pad, $marker ) = ( $1, $2 ); |
200 | return $self->_make_yaml_token( $pad, $marker ); |
201 | }, |
202 | }, |
203 | pragma => { |
204 | syntax => |
205 | qr/^ pragma \s+ ( [-+] \w+ \s* (?: , \s* [-+] \w+ \s* )* ) $/x, |
206 | handler => sub { |
207 | my ( $self, $line ) = @_; |
208 | my $pragmas = $1; |
209 | return $self->_make_pragma_token( $line, $pragmas ); |
210 | }, |
211 | }, |
212 | ); |
213 | |
214 | %language_for = ( |
215 | '12' => { |
216 | tokens => \%v12, |
217 | }, |
218 | '13' => { |
219 | tokens => \%v13, |
220 | setup => sub { |
221 | shift->{stream}->handle_unicode; |
222 | }, |
223 | }, |
224 | ); |
225 | } |
226 | |
227 | ############################################################################## |
228 | |
229 | =head2 Instance Methods |
230 | |
231 | =head3 C<set_version> |
232 | |
233 | $grammar->set_version(13); |
234 | |
235 | Tell the grammar which TAP syntax version to support. The lowest |
236 | supported version is 12. Although 'TAP version' isn't valid version 12 |
237 | syntax it is accepted so that higher version numbers may be parsed. |
238 | |
239 | =cut |
240 | |
241 | sub set_version { |
242 | my $self = shift; |
243 | my $version = shift; |
244 | |
245 | if ( my $language = $language_for{$version} ) { |
246 | $self->{version} = $version; |
247 | $self->{tokens} = $language->{tokens}; |
248 | |
249 | if ( my $setup = $language->{setup} ) { |
250 | $self->$setup(); |
251 | } |
252 | |
253 | $self->_order_tokens; |
254 | } |
255 | else { |
256 | require Carp; |
257 | Carp::croak("Unsupported syntax version: $version"); |
258 | } |
259 | } |
260 | |
261 | # Optimization to put the most frequent tokens first. |
262 | sub _order_tokens { |
263 | my $self = shift; |
264 | |
265 | my %copy = %{ $self->{tokens} }; |
266 | my @ordered_tokens = grep {defined} |
267 | map { delete $copy{$_} } qw( simple_test test comment plan ); |
268 | push @ordered_tokens, values %copy; |
269 | |
270 | $self->{ordered_tokens} = \@ordered_tokens; |
271 | } |
272 | |
273 | ############################################################################## |
274 | |
275 | =head3 C<tokenize> |
276 | |
277 | my $token = $grammar->tokenize; |
278 | |
279 | This method will return a L<TAP::Parser::Result> object representing the |
280 | current line of TAP. |
281 | |
282 | =cut |
283 | |
284 | sub tokenize { |
285 | my $self = shift; |
286 | |
287 | my $line = $self->{stream}->next; |
288 | unless ( defined $line ) { |
289 | delete $self->{parser}; # break circular ref |
290 | return; |
291 | } |
292 | |
293 | my $token; |
294 | |
295 | foreach my $token_data ( @{ $self->{ordered_tokens} } ) { |
296 | if ( $line =~ $token_data->{syntax} ) { |
297 | my $handler = $token_data->{handler}; |
298 | $token = $self->$handler($line); |
299 | last; |
300 | } |
301 | } |
302 | |
303 | $token = $self->_make_unknown_token($line) unless $token; |
304 | |
305 | return $self->{parser}->make_result($token); |
306 | } |
307 | |
308 | ############################################################################## |
309 | |
310 | =head3 C<token_types> |
311 | |
312 | my @types = $grammar->token_types; |
313 | |
314 | Returns the different types of tokens which this grammar can parse. |
315 | |
316 | =cut |
317 | |
318 | sub token_types { |
319 | my $self = shift; |
320 | return keys %{ $self->{tokens} }; |
321 | } |
322 | |
323 | ############################################################################## |
324 | |
325 | =head3 C<syntax_for> |
326 | |
327 | my $syntax = $grammar->syntax_for($token_type); |
328 | |
329 | Returns a pre-compiled regular expression which will match a chunk of TAP |
330 | corresponding to the token type. For example (not that you should really pay |
331 | attention to this, C<< $grammar->syntax_for('comment') >> will return |
332 | C<< qr/^#(.*)/ >>. |
333 | |
334 | =cut |
335 | |
336 | sub syntax_for { |
337 | my ( $self, $type ) = @_; |
338 | return $self->{tokens}->{$type}->{syntax}; |
339 | } |
340 | |
341 | ############################################################################## |
342 | |
343 | =head3 C<handler_for> |
344 | |
345 | my $handler = $grammar->handler_for($token_type); |
346 | |
347 | Returns a code reference which, when passed an appropriate line of TAP, |
348 | returns the lexed token corresponding to that line. As a result, the basic |
349 | TAP parsing loop looks similar to the following: |
350 | |
351 | my @tokens; |
352 | my $grammar = TAP::Grammar->new; |
353 | LINE: while ( defined( my $line = $parser->_next_chunk_of_tap ) ) { |
354 | foreach my $type ( $grammar->token_types ) { |
355 | my $syntax = $grammar->syntax_for($type); |
356 | if ( $line =~ $syntax ) { |
357 | my $handler = $grammar->handler_for($type); |
358 | push @tokens => $grammar->$handler($line); |
359 | next LINE; |
360 | } |
361 | } |
362 | push @tokens => $grammar->_make_unknown_token($line); |
363 | } |
364 | |
365 | =cut |
366 | |
367 | sub handler_for { |
368 | my ( $self, $type ) = @_; |
369 | return $self->{tokens}->{$type}->{handler}; |
370 | } |
371 | |
372 | sub _make_version_token { |
373 | my ( $self, $line, $version ) = @_; |
374 | return { |
375 | type => 'version', |
376 | raw => $line, |
377 | version => $version, |
378 | }; |
379 | } |
380 | |
381 | sub _make_plan_token { |
382 | my ( $self, $line, $tests_planned, $directive, $explanation, $todo ) = @_; |
383 | |
384 | if ( $directive eq 'SKIP' |
385 | && 0 != $tests_planned |
386 | && $self->{version} < 13 ) |
387 | { |
388 | warn |
389 | "Specified SKIP directive in plan but more than 0 tests ($line)\n"; |
390 | } |
391 | |
392 | return { |
393 | type => 'plan', |
394 | raw => $line, |
395 | tests_planned => $tests_planned, |
396 | directive => $directive, |
397 | explanation => _trim($explanation), |
398 | todo_list => $todo, |
399 | }; |
400 | } |
401 | |
402 | sub _make_test_token { |
403 | my ( $self, $line, $ok, $num, $desc, $dir, $explanation ) = @_; |
404 | return { |
405 | ok => $ok, |
406 | test_num => $num, |
407 | description => _trim($desc), |
408 | directive => ( defined $dir ? uc $dir : '' ), |
409 | explanation => _trim($explanation), |
410 | raw => $line, |
411 | type => 'test', |
412 | }; |
413 | } |
414 | |
415 | sub _make_unknown_token { |
416 | my ( $self, $line ) = @_; |
417 | return { |
418 | raw => $line, |
419 | type => 'unknown', |
420 | }; |
421 | } |
422 | |
423 | sub _make_comment_token { |
424 | my ( $self, $line, $comment ) = @_; |
425 | return { |
426 | type => 'comment', |
427 | raw => $line, |
428 | comment => _trim($comment) |
429 | }; |
430 | } |
431 | |
432 | sub _make_bailout_token { |
433 | my ( $self, $line, $explanation ) = @_; |
434 | return { |
435 | type => 'bailout', |
436 | raw => $line, |
437 | bailout => _trim($explanation) |
438 | }; |
439 | } |
440 | |
441 | sub _make_yaml_token { |
442 | my ( $self, $pad, $marker ) = @_; |
443 | |
444 | my $yaml = TAP::Parser::YAMLish::Reader->new; |
445 | |
446 | my $stream = $self->{stream}; |
447 | |
448 | # Construct a reader that reads from our input stripping leading |
449 | # spaces from each line. |
450 | my $leader = length($pad); |
451 | my $strip = qr{ ^ (\s{$leader}) (.*) $ }x; |
452 | my @extra = ($marker); |
453 | my $reader = sub { |
454 | return shift @extra if @extra; |
455 | my $line = $stream->next; |
456 | return $2 if $line =~ $strip; |
457 | return; |
458 | }; |
459 | |
460 | my $data = $yaml->read($reader); |
461 | |
462 | # Reconstitute input. This is convoluted. Maybe we should just |
463 | # record it on the way in... |
464 | chomp( my $raw = $yaml->get_raw ); |
465 | $raw =~ s/^/$pad/mg; |
466 | |
467 | return { |
468 | type => 'yaml', |
469 | raw => $raw, |
470 | data => $data |
471 | }; |
472 | } |
473 | |
474 | sub _make_pragma_token { |
475 | my ( $self, $line, $pragmas ) = @_; |
476 | return { |
477 | type => 'pragma', |
478 | raw => $line, |
479 | pragmas => [ split /\s*,\s*/, _trim($pragmas) ], |
480 | }; |
481 | } |
482 | |
483 | sub _trim { |
484 | my $data = shift; |
485 | |
486 | return '' unless defined $data; |
487 | |
488 | $data =~ s/^\s+//; |
489 | $data =~ s/\s+$//; |
490 | return $data; |
491 | } |
492 | |
493 | 1; |
494 | |
495 | =head1 TAP GRAMMAR |
496 | |
497 | B<NOTE:> This grammar is slightly out of date. There's still some discussion |
498 | about it and a new one will be provided when we have things better defined. |
499 | |
500 | The L<TAP::Parser> does not use a formal grammar because TAP is essentially a |
501 | stream-based protocol. In fact, it's quite legal to have an infinite stream. |
502 | For the same reason that we don't apply regexes to streams, we're not using a |
503 | formal grammar here. Instead, we parse the TAP in lines. |
504 | |
505 | For purposes for forward compatability, any result which does not match the |
506 | following grammar is currently referred to as |
507 | L<TAP::Parser::Result::Unknown>. It is I<not> a parse error. |
508 | |
509 | A formal grammar would look similar to the following: |
510 | |
511 | (* |
512 | For the time being, I'm cheating on the EBNF by allowing |
513 | certain terms to be defined by POSIX character classes by |
514 | using the following syntax: |
515 | |
516 | digit ::= [:digit:] |
517 | |
518 | As far as I am aware, that's not valid EBNF. Sue me. I |
519 | didn't know how to write "char" otherwise (Unicode issues). |
520 | Suggestions welcome. |
521 | *) |
522 | |
523 | tap ::= version? { comment | unknown } leading_plan lines |
524 | | |
525 | lines trailing_plan {comment} |
526 | |
527 | version ::= 'TAP version ' positiveInteger {positiveInteger} "\n" |
528 | |
529 | leading_plan ::= plan skip_directive? "\n" |
530 | |
531 | trailing_plan ::= plan "\n" |
532 | |
533 | plan ::= '1..' nonNegativeInteger |
534 | |
535 | lines ::= line {line} |
536 | |
537 | line ::= (comment | test | unknown | bailout ) "\n" |
538 | |
539 | test ::= status positiveInteger? description? directive? |
540 | |
541 | status ::= 'not '? 'ok ' |
542 | |
543 | description ::= (character - (digit | '#')) {character - '#'} |
544 | |
545 | directive ::= todo_directive | skip_directive |
546 | |
547 | todo_directive ::= hash_mark 'TODO' ' ' {character} |
548 | |
549 | skip_directive ::= hash_mark 'SKIP' ' ' {character} |
550 | |
551 | comment ::= hash_mark {character} |
552 | |
553 | hash_mark ::= '#' {' '} |
554 | |
555 | bailout ::= 'Bail out!' {character} |
556 | |
557 | unknown ::= { (character - "\n") } |
558 | |
559 | (* POSIX character classes and other terminals *) |
560 | |
561 | digit ::= [:digit:] |
562 | character ::= ([:print:] - "\n") |
563 | positiveInteger ::= ( digit - '0' ) {digit} |
564 | nonNegativeInteger ::= digit {digit} |
565 | |
566 | =head1 SUBCLASSING |
567 | |
568 | Please see L<TAP::Parser/SUBCLASSING> for a subclassing overview. |
569 | |
570 | If you I<really> want to subclass L<TAP::Parser>'s grammar the best thing to |
571 | do is read through the code. There's no easy way of summarizing it here. |
572 | |
573 | =head1 SEE ALSO |
574 | |
575 | L<TAP::Object>, |
576 | L<TAP::Parser>, |
577 | L<TAP::Parser::Iterator>, |
578 | L<TAP::Parser::Result>, |
579 | |
580 | =cut |