lib/TAP/Parser/Grammar.pm

   1 package TAP::Parser::Grammar;
   2
   3 use strict;
   4 use vars qw($VERSION @ISA);
   5
   6 use TAP::Object                  ();
   7 use TAP::Parser::ResultFactory   ();
   8 use TAP::Parser::YAMLish::Reader ();
   9
  10 @ISA = qw(TAP::Object);
  11
  12 =head1 NAME
  13
  14 TAP::Parser::Grammar - A grammar for the Test Anything Protocol.
  15
  16 =head1 VERSION
  17
  18 Version 3.13
  19
  20 =cut
  21
  22 $VERSION = '3.13';
  23
  24 =head1 SYNOPSIS
  25
  26   use TAP::Parser::Grammar;
  27   my $grammar = $self->make_grammar({
  28     stream  => $tap_parser_stream,
  29     parser  => $tap_parser,
  30     version => 12,
  31   });
  32
  33   my $result = $grammar->tokenize;
  34
  35 =head1 DESCRIPTION
  36
  37 C<TAP::Parser::Grammar> tokenizes lines from a TAP stream and constructs
  38 L<TAP::Parser::Result> subclasses to represent the tokens.
  39
  40 Do not attempt to use this class directly.  It won't make sense.  It's mainly
  41 here to ensure that we will be able to have pluggable grammars when TAP is
  42 expanded at some future date (plus, this stuff was really cluttering the
  43 parser).
  44
  45 =head1 METHODS
  46
  47 =head2 Class Methods
  48
  49 =head3 C<new>
  50
  51   my $grammar = TAP::Parser::Grammar->new({
  52       stream  => $stream,
  53       parser  => $parser,
  54       version => $version,
  55   });
  56
  57 Returns L<TAP::Parser> grammar object that will parse the specified stream.
  58 Both C<stream> and C<parser> are required arguments.  If C<version> is not set
  59 it defaults to C<12> (see L</set_version> for more details).
  60
  61 =cut
  62
  63 # new() implementation supplied by TAP::Object
  64 sub _initialize {
  65     my ( $self, $args ) = @_;
  66     $self->{stream} = $args->{stream};    # TODO: accessor
  67     $self->{parser} = $args->{parser};    # TODO: accessor
  68     $self->set_version( $args->{version} || 12 );
  69     return $self;
  70 }
  71
  72 my %language_for;
  73
  74 {
  75
  76     # XXX the 'not' and 'ok' might be on separate lines in VMS ...
  77     my $ok  = qr/(?:not )?ok\b/;
  78     my $num = qr/\d+/;
  79
  80     my %v12 = (
  81         version => {
  82             syntax  => qr/^TAP\s+version\s+(\d+)\s*\z/i,
  83             handler => sub {
  84                 my ( $self, $line ) = @_;
  85                 my $version = $1;
  86                 return $self->_make_version_token( $line, $version, );
  87             },
  88         },
  89         plan => {
  90             syntax  => qr/^1\.\.(\d+)\s*(.*)\z/,
  91             handler => sub {
  92                 my ( $self, $line ) = @_;
  93                 my ( $tests_planned, $tail ) = ( $1, $2 );
  94                 my $explanation = undef;
  95                 my $skip        = '';
  96
  97                 if ( $tail =~ /^todo((?:\s+\d+)+)/ ) {
  98                     my @todo = split /\s+/, _trim($1);
  99                     return $self->_make_plan_token(
 100                         $line, $tests_planned, 'TODO',
 101                         '',    \@todo
 102                     );
 103                 }
 104                 elsif ( 0 == $tests_planned ) {
 105                     $skip = 'SKIP';
 106
 107                     # If we can't match # SKIP the directive should be undef.
 108                     ($explanation) = $tail =~ /^#\s*SKIP\s+(.*)/i;
 109                 }
 110                 elsif ( $tail !~ /^\s*$/ ) {
 111                     return $self->_make_unknown_token($line);
 112                 }
 113
 114                 $explanation = '' unless defined $explanation;
 115
 116                 return $self->_make_plan_token(
 117                     $line, $tests_planned, $skip,
 118                     $explanation, []
 119                 );
 120
 121             },
 122         },
 123
 124         # An optimization to handle the most common test lines without
 125         # directives.
 126         simple_test => {
 127             syntax  => qr/^($ok) \ ($num) (?:\ ([^#]+))? \z/x,
 128             handler => sub {
 129                 my ( $self, $line ) = @_;
 130                 my ( $ok, $num, $desc ) = ( $1, $2, $3 );
 131
 132                 return $self->_make_test_token(
 133                     $line, $ok, $num,
 134                     $desc
 135                 );
 136             },
 137         },
 138         test => {
 139             syntax  => qr/^($ok) \s* ($num)? \s* (.*) \z/x,
 140             handler => sub {
 141                 my ( $self, $line ) = @_;
 142                 my ( $ok, $num, $desc ) = ( $1, $2, $3 );
 143                 my ( $dir, $explanation ) = ( '', '' );
 144                 if ($desc =~ m/^ ( [^\\\#]* (?: \\. [^\\\#]* )* )
 145                        \# \s* (SKIP|TODO) \b \s* (.*) $/ix
 146                   )
 147                 {
 148                     ( $desc, $dir, $explanation ) = ( $1, $2, $3 );
 149                 }
 150                 return $self->_make_test_token(
 151                     $line,   $ok, $num, $desc,
 152                     $dir, $explanation
 153                 );
 154             },
 155         },
 156         comment => {
 157             syntax  => qr/^#(.*)/,
 158             handler => sub {
 159                 my ( $self, $line ) = @_;
 160                 my $comment = $1;
 161                 return $self->_make_comment_token( $line, $comment );
 162             },
 163         },
 164         bailout => {
 165             syntax  => qr/^Bail out!\s*(.*)/,
 166             handler => sub {
 167                 my ( $self, $line ) = @_;
 168                 my $explanation = $1;
 169                 return $self->_make_bailout_token(
 170                     $line,
 171                     $explanation
 172                 );
 173             },
 174         },
 175     );
 176
 177     my %v13 = (
 178         %v12,
 179         plan => {
 180             syntax  => qr/^1\.\.(\d+)(?:\s*#\s*SKIP\b(.*))?\z/i,
 181             handler => sub {
 182                 my ( $self, $line ) = @_;
 183                 my ( $tests_planned, $explanation ) = ( $1, $2 );
 184                 my $skip
 185                   = ( 0 == $tests_planned || defined $explanation )
 186                   ? 'SKIP'
 187                   : '';
 188                 $explanation = '' unless defined $explanation;
 189                 return $self->_make_plan_token(
 190                     $line, $tests_planned, $skip,
 191                     $explanation, []
 192                 );
 193             },
 194         },
 195         yaml => {
 196             syntax  => qr/^ (\s+) (---.*) $/x,
 197             handler => sub {
 198                 my ( $self, $line ) = @_;
 199                 my ( $pad, $marker ) = ( $1, $2 );
 200                 return $self->_make_yaml_token( $pad, $marker );
 201             },
 202         },
 203         pragma => {
 204             syntax =>
 205               qr/^ pragma \s+ ( [-+] \w+ \s* (?: , \s* [-+] \w+ \s* )* ) $/x,
 206             handler => sub {
 207                 my ( $self, $line ) = @_;
 208                 my $pragmas = $1;
 209                 return $self->_make_pragma_token( $line, $pragmas );
 210             },
 211         },
 212     );
 213
 214     %language_for = (
 215         '12' => {
 216             tokens => \%v12,
 217         },
 218         '13' => {
 219             tokens => \%v13,
 220             setup  => sub {
 221                 shift->{stream}->handle_unicode;
 222             },
 223         },
 224     );
 225 }
 226
 227 ##############################################################################
 228
 229 =head2 Instance Methods
 230
 231 =head3 C<set_version>
 232
 233   $grammar->set_version(13);
 234
 235 Tell the grammar which TAP syntax version to support. The lowest
 236 supported version is 12. Although 'TAP version' isn't valid version 12
 237 syntax it is accepted so that higher version numbers may be parsed.
 238
 239 =cut
 240
 241 sub set_version {
 242     my $self    = shift;
 243     my $version = shift;
 244
 245     if ( my $language = $language_for{$version} ) {
 246         $self->{version} = $version;
 247         $self->{tokens}  = $language->{tokens};
 248
 249         if ( my $setup = $language->{setup} ) {
 250             $self->$setup();
 251         }
 252
 253         $self->_order_tokens;
 254     }
 255     else {
 256         require Carp;
 257         Carp::croak("Unsupported syntax version: $version");
 258     }
 259 }
 260
 261 # Optimization to put the most frequent tokens first.
 262 sub _order_tokens {
 263     my $self = shift;
 264
 265     my %copy = %{ $self->{tokens} };
 266     my @ordered_tokens = grep {defined}
 267       map { delete $copy{$_} } qw( simple_test test comment plan );
 268     push @ordered_tokens, values %copy;
 269
 270     $self->{ordered_tokens} = \@ordered_tokens;
 271 }
 272
 273 ##############################################################################
 274
 275 =head3 C<tokenize>
 276
 277   my $token = $grammar->tokenize;
 278
 279 This method will return a L<TAP::Parser::Result> object representing the
 280 current line of TAP.
 281
 282 =cut
 283
 284 sub tokenize {
 285     my $self = shift;
 286
 287     my $line = $self->{stream}->next;
 288     unless ( defined $line ) {
 289         delete $self->{parser};    # break circular ref
 290         return;
 291     }
 292
 293     my $token;
 294
 295     foreach my $token_data ( @{ $self->{ordered_tokens} } ) {
 296         if ( $line =~ $token_data->{syntax} ) {
 297             my $handler = $token_data->{handler};
 298             $token = $self->$handler($line);
 299             last;
 300         }
 301     }
 302
 303     $token = $self->_make_unknown_token($line) unless $token;
 304
 305     return $self->{parser}->make_result($token);
 306 }
 307
 308 ##############################################################################
 309
 310 =head3 C<token_types>
 311
 312   my @types = $grammar->token_types;
 313
 314 Returns the different types of tokens which this grammar can parse.
 315
 316 =cut
 317
 318 sub token_types {
 319     my $self = shift;
 320     return keys %{ $self->{tokens} };
 321 }
 322
 323 ##############################################################################
 324
 325 =head3 C<syntax_for>
 326
 327   my $syntax = $grammar->syntax_for($token_type);
 328
 329 Returns a pre-compiled regular expression which will match a chunk of TAP
 330 corresponding to the token type.  For example (not that you should really pay
 331 attention to this, C<< $grammar->syntax_for('comment') >> will return
 332 C<< qr/^#(.*)/ >>.
 333
 334 =cut
 335
 336 sub syntax_for {
 337     my ( $self, $type ) = @_;
 338     return $self->{tokens}->{$type}->{syntax};
 339 }
 340
 341 ##############################################################################
 342
 343 =head3 C<handler_for>
 344
 345   my $handler = $grammar->handler_for($token_type);
 346
 347 Returns a code reference which, when passed an appropriate line of TAP,
 348 returns the lexed token corresponding to that line.  As a result, the basic
 349 TAP parsing loop looks similar to the following:
 350
 351  my @tokens;
 352  my $grammar = TAP::Grammar->new;
 353  LINE: while ( defined( my $line = $parser->_next_chunk_of_tap ) ) {
 354      foreach my $type ( $grammar->token_types ) {
 355          my $syntax  = $grammar->syntax_for($type);
 356          if ( $line =~ $syntax ) {
 357              my $handler = $grammar->handler_for($type);
 358              push @tokens => $grammar->$handler($line);
 359              next LINE;
 360          }
 361      }
 362      push @tokens => $grammar->_make_unknown_token($line);
 363  }
 364
 365 =cut
 366
 367 sub handler_for {
 368     my ( $self, $type ) = @_;
 369     return $self->{tokens}->{$type}->{handler};
 370 }
 371
 372 sub _make_version_token {
 373     my ( $self, $line, $version ) = @_;
 374     return {
 375         type    => 'version',
 376         raw     => $line,
 377         version => $version,
 378     };
 379 }
 380
 381 sub _make_plan_token {
 382     my ( $self, $line, $tests_planned, $directive, $explanation, $todo ) = @_;
 383
 384     if (   $directive eq 'SKIP'
 385         && 0 != $tests_planned
 386         && $self->{version} < 13 )
 387     {
 388         warn
 389           "Specified SKIP directive in plan but more than 0 tests ($line)\n";
 390     }
 391
 392     return {
 393         type          => 'plan',
 394         raw           => $line,
 395         tests_planned => $tests_planned,
 396         directive     => $directive,
 397         explanation   => _trim($explanation),
 398         todo_list     => $todo,
 399     };
 400 }
 401
 402 sub _make_test_token {
 403     my ( $self, $line, $ok, $num, $desc, $dir, $explanation ) = @_;
 404     my %test = (
 405         ok          => $ok,
 406         test_num    => $num,
 407         description => _trim($desc),
 408         directive   => uc( defined $dir ? $dir : '' ),
 409         explanation => _trim($explanation),
 410         raw         => $line,
 411         type        => 'test',
 412     );
 413     return \%test;
 414 }
 415
 416 sub _make_unknown_token {
 417     my ( $self, $line ) = @_;
 418     return {
 419         raw  => $line,
 420         type => 'unknown',
 421     };
 422 }
 423
 424 sub _make_comment_token {
 425     my ( $self, $line, $comment ) = @_;
 426     return {
 427         type    => 'comment',
 428         raw     => $line,
 429         comment => _trim($comment)
 430     };
 431 }
 432
 433 sub _make_bailout_token {
 434     my ( $self, $line, $explanation ) = @_;
 435     return {
 436         type    => 'bailout',
 437         raw     => $line,
 438         bailout => _trim($explanation)
 439     };
 440 }
 441
 442 sub _make_yaml_token {
 443     my ( $self, $pad, $marker ) = @_;
 444
 445     my $yaml = TAP::Parser::YAMLish::Reader->new;
 446
 447     my $stream = $self->{stream};
 448
 449     # Construct a reader that reads from our input stripping leading
 450     # spaces from each line.
 451     my $leader = length($pad);
 452     my $strip  = qr{ ^ (\s{$leader}) (.*) $ }x;
 453     my @extra  = ($marker);
 454     my $reader = sub {
 455         return shift @extra if @extra;
 456         my $line = $stream->next;
 457         return $2 if $line =~ $strip;
 458         return;
 459     };
 460
 461     my $data = $yaml->read($reader);
 462
 463     # Reconstitute input. This is convoluted. Maybe we should just
 464     # record it on the way in...
 465     chomp( my $raw = $yaml->get_raw );
 466     $raw =~ s/^/$pad/mg;
 467
 468     return {
 469         type => 'yaml',
 470         raw  => $raw,
 471         data => $data
 472     };
 473 }
 474
 475 sub _make_pragma_token {
 476     my ( $self, $line, $pragmas ) = @_;
 477     return {
 478         type    => 'pragma',
 479         raw     => $line,
 480         pragmas => [ split /\s*,\s*/, _trim($pragmas) ],
 481     };
 482 }
 483
 484 sub _trim {
 485     my $data = shift;
 486
 487     return '' unless defined $data;
 488
 489     $data =~ s/^\s+//;
 490     $data =~ s/\s+$//;
 491     return $data;
 492 }
 493
 494 1;
 495
 496 =head1 TAP GRAMMAR
 497
 498 B<NOTE:>  This grammar is slightly out of date.  There's still some discussion
 499 about it and a new one will be provided when we have things better defined.
 500
 501 The L<TAP::Parser> does not use a formal grammar because TAP is essentially a
 502 stream-based protocol.  In fact, it's quite legal to have an infinite stream.
 503 For the same reason that we don't apply regexes to streams, we're not using a
 504 formal grammar here.  Instead, we parse the TAP in lines.
 505
 506 For purposes for forward compatability, any result which does not match the
 507 following grammar is currently referred to as
 508 L<TAP::Parser::Result::Unknown>.  It is I<not> a parse error.
 509
 510 A formal grammar would look similar to the following:
 511
 512  (*
 513      For the time being, I'm cheating on the EBNF by allowing
 514      certain terms to be defined by POSIX character classes by
 515      using the following syntax:
 516
 517        digit ::= [:digit:]
 518
 519      As far as I am aware, that's not valid EBNF.  Sue me.  I
 520      didn't know how to write "char" otherwise (Unicode issues).
 521      Suggestions welcome.
 522  *)
 523
 524  tap            ::= version? { comment | unknown } leading_plan lines
 525                     |
 526                     lines trailing_plan {comment}
 527
 528  version        ::= 'TAP version ' positiveInteger {positiveInteger} "\n"
 529
 530  leading_plan   ::= plan skip_directive? "\n"
 531
 532  trailing_plan  ::= plan "\n"
 533
 534  plan           ::= '1..' nonNegativeInteger
 535
 536  lines          ::= line {line}
 537
 538  line           ::= (comment | test | unknown | bailout ) "\n"
 539
 540  test           ::= status positiveInteger? description? directive?
 541
 542  status         ::= 'not '? 'ok '
 543
 544  description    ::= (character - (digit | '#')) {character - '#'}
 545
 546  directive      ::= todo_directive | skip_directive
 547
 548  todo_directive ::= hash_mark 'TODO' ' ' {character}
 549
 550  skip_directive ::= hash_mark 'SKIP' ' ' {character}
 551
 552  comment        ::= hash_mark {character}
 553
 554  hash_mark      ::= '#' {' '}
 555
 556  bailout        ::= 'Bail out!' {character}
 557
 558  unknown        ::= { (character - "\n") }
 559
 560  (* POSIX character classes and other terminals *)
 561
 562  digit              ::= [:digit:]
 563  character          ::= ([:print:] - "\n")
 564  positiveInteger    ::= ( digit - '0' ) {digit}
 565  nonNegativeInteger ::= digit {digit}
 566
 567 =head1 SUBCLASSING
 568
 569 Please see L<TAP::Parser/SUBCLASSING> for a subclassing overview.
 570
 571 If you I<really> want to subclass L<TAP::Parser>'s grammar the best thing to
 572 do is read through the code.  There's no easy way of summarizing it here.
 573
 574 =head1 SEE ALSO
 575
 576 L<TAP::Object>,
 577 L<TAP::Parser>,
 578 L<TAP::Parser::Iterator>,
 579 L<TAP::Parser::Result>,
 580
 581 =cut