local-lib5/lib/perl5/PPI/Tokenizer.pm

   1 package PPI::Tokenizer;
   2
   3 =pod
   4
   5 =head1 NAME
   6
   7 PPI::Tokenizer - The Perl Document Tokenizer
   8
   9 =head1 SYNOPSIS
  10
  11   # Create a tokenizer for a file, array or string
  12   $Tokenizer = PPI::Tokenizer->new( 'filename.pl' );
  13   $Tokenizer = PPI::Tokenizer->new( \@lines       );
  14   $Tokenizer = PPI::Tokenizer->new( \$source      );
  15
  16   # Return all the tokens for the document
  17   my $tokens = $Tokenizer->all_tokens;
  18
  19   # Or we can use it as an iterator
  20   while ( my $Token = $Tokenizer->get_token ) {
  21         print "Found token '$Token'\n";
  22   }
  23
  24   # If we REALLY need to manually nudge the cursor, you
  25   # can do that to (The lexer needs this ability to do rollbacks)
  26   $is_incremented = $Tokenizer->increment_cursor;
  27   $is_decremented = $Tokenizer->decrement_cursor;
  28
  29 =head1 DESCRIPTION
  30
  31 PPI::Tokenizer is the class that provides Tokenizer objects for use in
  32 breaking strings of Perl source code into Tokens.
  33
  34 By the time you are reading this, you probably need to know a little
  35 about the difference between how perl parses Perl "code" and how PPI
  36 parsers Perl "documents".
  37
  38 "perl" itself (the interpreter) uses a heavily modified lex specification
  39 to specify its parsing logic, maintains several types of state as it
  40 goes, and incrementally tokenizes, lexes AND EXECUTES at the same time.
  41
  42 In fact, it is provably impossible to use perl's parsing method without
  43 simultaneously executing code. A formal mathematical proof has been
  44 published demonstrating the method.
  45
  46 This is where the truism "Only perl can parse Perl" comes from.
  47
  48 PPI uses a completely different approach by abandoning the (impossible)
  49 ability to parse Perl the same way that the interpreter does, and instead
  50 parsing the source as a document, using a document structure independantly
  51 derived from the Perl documentation and approximating the perl interpreter
  52 interpretation as closely as possible.
  53
  54 It was touch and go for a long time whether we could get it close enough,
  55 but in the end it turned out that it could be done.
  56
  57 In this approach, the tokenizer C<PPI::Tokenizer> is implemented separately
  58 from the lexer L<PPI::Lexer>.
  59
  60 The job of C<PPI::Tokenizer> is to take pure source as a string and break it
  61 up into a stream/set of tokens, and contains most of the "black magic" used
  62 in PPI. By comparison, the lexer implements a relatively straight forward
  63 tree structure, and has an implementation that is uncomplicated (compared
  64 to the insanity in the tokenizer at least).
  65
  66 The Tokenizer uses an immense amount of heuristics, guessing and cruft,
  67 supported by a very B<VERY> flexible internal API, but fortunately it was
  68 possible to largely encapsulate the black magic, so there is not a lot that
  69 gets exposed to people using the C<PPI::Tokenizer> itself.
  70
  71 =head1 METHODS
  72
  73 Despite the incredible complexity, the Tokenizer itself only exposes a
  74 relatively small number of methods, with most of the complexity implemented
  75 in private methods.
  76
  77 =cut
  78
  79 # Make sure everything we need is loaded so
  80 # we don't have to go and load all of PPI.
  81 use strict;
  82 use Params::Util    qw{_INSTANCE _SCALAR0 _ARRAY0};
  83 use List::MoreUtils ();
  84 use PPI::Util       ();
  85 use PPI::Element    ();
  86 use PPI::Token      ();
  87 use PPI::Exception  ();
  88 use PPI::Exception::ParserRejection ();
  89
  90 use vars qw{$VERSION};
  91 BEGIN {
  92         $VERSION = '1.206';
  93 }
  94
  95
  96
  97
  98
  99 #####################################################################
 100 # Creation and Initialization
 101
 102 =pod
 103
 104 =head2 new $file | \@lines | \$source
 105
 106 The main C<new> constructor creates a new Tokenizer object. These
 107 objects have no configuration parameters, and can only be used once,
 108 to tokenize a single perl source file.
 109
 110 It takes as argument either a normal scalar containing source code,
 111 a reference to a scalar containing source code, or a reference to an
 112 ARRAY containing newline-terminated lines of source code.
 113
 114 Returns a new C<PPI::Tokenizer> object on success, or throws a
 115 L<PPI::Exception> exception on error.
 116
 117 =cut
 118
 119 sub new {
 120         my $class = ref($_[0]) || $_[0];
 121
 122         # Create the empty tokenizer struct
 123         my $self = bless {
 124                 # Source code
 125                 source       => undef,
 126                 source_bytes => undef,
 127
 128                 # Line buffer
 129                 line         => undef,
 130                 line_length  => undef,
 131                 line_cursor  => undef,
 132                 line_count   => 0,
 133
 134                 # Parse state
 135                 token        => undef,
 136                 class        => 'PPI::Token::BOM',
 137                 zone         => 'PPI::Token::Whitespace',
 138
 139                 # Output token buffer
 140                 tokens       => [],
 141                 token_cursor => 0,
 142                 token_eof    => 0,
 143
 144                 # Perl 6 blocks
 145                 perl6        => [],
 146         }, $class;
 147
 148         if ( ! defined $_[1] ) {
 149                 # We weren't given anything
 150                 PPI::Exception->throw("No source provided to Tokenizer");
 151
 152         } elsif ( ! ref $_[1] ) {
 153                 my $source = PPI::Util::_slurp($_[1]);
 154                 if ( ref $source ) {
 155                         # Content returned by reference
 156                         $self->{source} = $$source;
 157                 } else {
 158                         # Errors returned as a string
 159                         return( $source );
 160                 }
 161
 162         } elsif ( _SCALAR0($_[1]) ) {
 163                 $self->{source} = ${$_[1]};
 164
 165         } elsif ( _ARRAY0($_[1]) ) {
 166                 $self->{source} = join '', map { "\n" } @{$_[1]};
 167
 168         } else {
 169                 # We don't support whatever this is
 170                 PPI::Exception->throw(ref($_[1]) . " is not supported as a source provider");
 171         }
 172
 173         # We can't handle a null string
 174         $self->{source_bytes} = length $self->{source};
 175         if ( $self->{source_bytes} > 1048576 ) {
 176                 # Dammit! It's ALWAYS the "Perl" modules larger than a
 177                 # meg that seems to blow up the Tokenizer/Lexer.
 178                 # Nobody actually writes real programs larger than a meg
 179                 # Perl::Tidy (the largest) is only 800k.
 180                 # It is always these idiots with massive Data::Dumper
 181                 # structs or huge RecDescent parser.
 182                 PPI::Exception::ParserRejection->throw("File is too large");
 183
 184         } elsif ( $self->{source_bytes} ) {
 185                 # Split on local newlines
 186                 $self->{source} =~ s/(?:\015{1,2}\012|\015|\012)/\n/g;
 187                 $self->{source} = [ split /(?<=\n)/, $self->{source} ];
 188
 189         } else {
 190                 $self->{source} = [ ];
 191         }
 192
 193         ### EVIL
 194         # I'm explaining this earlier than I should so you can understand
 195         # why I'm about to do something that looks very strange. There's
 196         # a problem with the Tokenizer, in that tokens tend to change
 197         # classes as each letter is added, but they don't get allocated
 198         # their definite final class until the "end" of the token, the
 199         # detection of which occurs in about a hundred different places,
 200         # all through various crufty code (that triples the speed).
 201         #
 202         # However, in general, this does not apply to tokens in which a
 203         # whitespace character is valid, such as comments, whitespace and
 204         # big strings.
 205         #
 206         # So what we do is add a space to the end of the source. This
 207         # triggers normal "end of token" functionality for all cases. Then,
 208         # once the tokenizer hits end of file, it examines the last token to
 209         # manually either remove the ' ' token, or chop it off the end of
 210         # a longer one in which the space would be valid.
 211         if ( List::MoreUtils::any { /^__(?:DATA|END)__\s*$/ } @{$self->{source}} ) {
 212                 $self->{source_eof_chop} = '';
 213         } elsif ( ! defined $self->{source}->[0] ) {
 214                 $self->{source_eof_chop} = '';
 215         } elsif ( $self->{source}->[-1] =~ /\s$/ ) {
 216                 $self->{source_eof_chop} = '';
 217         } else {
 218                 $self->{source_eof_chop} = 1;
 219                 $self->{source}->[-1] .= ' ';
 220         }
 221
 222         $self;
 223 }
 224
 225
 226
 227
 228
 229 #####################################################################
 230 # Main Public Methods
 231
 232 =pod
 233
 234 =head2 get_token
 235
 236 When using the PPI::Tokenizer object as an iterator, the C<get_token>
 237 method is the primary method that is used. It increments the cursor
 238 and returns the next Token in the output array.
 239
 240 The actual parsing of the file is done only as-needed, and a line at
 241 a time. When C<get_token> hits the end of the token array, it will
 242 cause the parser to pull in the next line and parse it, continuing
 243 as needed until there are more tokens on the output array that
 244 get_token can then return.
 245
 246 This means that a number of Tokenizer objects can be created, and
 247 won't consume significant CPU until you actually begin to pull tokens
 248 from it.
 249
 250 Return a L<PPI::Token> object on success, C<0> if the Tokenizer had
 251 reached the end of the file, or C<undef> on error.
 252
 253 =cut
 254
 255 sub get_token {
 256         my $self = shift;
 257
 258         # Shortcut for EOF
 259         if ( $self->{token_eof}
 260          and $self->{token_cursor} > scalar @{$self->{tokens}}
 261         ) {
 262                 return 0;
 263         }
 264
 265         # Return the next token if we can
 266         if ( my $token = $self->{tokens}->[ $self->{token_cursor} ] ) {
 267                 $self->{token_cursor}++;
 268                 return $token;
 269         }
 270
 271         my $line_rv;
 272
 273         # Catch exceptions and return undef, so that we
 274         # can start to convert code to exception-based code.
 275         my $rv = eval {
 276                 # No token, we need to get some more
 277                 while ( $line_rv = $self->_process_next_line ) {
 278                         # If there is something in the buffer, return it
 279                         # The defined() prevents a ton of calls to PPI::Util::TRUE
 280                         if ( defined( my $token = $self->{tokens}->[ $self->{token_cursor} ] ) ) {
 281                                 $self->{token_cursor}++;
 282                                 return $token;
 283                         }
 284                 }
 285                 return undef;
 286         };
 287         if ( $@ ) {
 288                 if ( _INSTANCE($@, 'PPI::Exception') ) {
 289                         $@->throw;
 290                 } else {
 291                         my $errstr = $@;
 292                         $errstr =~ s/^(.*) at line .+$/$1/;
 293                         PPI::Exception->throw( $errstr );
 294                 }
 295         } elsif ( $rv ) {
 296                 return $rv;
 297         }
 298
 299         if ( defined $line_rv ) {
 300                 # End of file, but we can still return things from the buffer
 301                 if ( my $token = $self->{tokens}->[ $self->{token_cursor} ] ) {
 302                         $self->{token_cursor}++;
 303                         return $token;
 304                 }
 305
 306                 # Set our token end of file flag
 307                 $self->{token_eof} = 1;
 308                 return 0;
 309         }
 310
 311         # Error, pass it up to our caller
 312         undef;
 313 }
 314
 315 =pod
 316
 317 =head2 all_tokens
 318
 319 When not being used as an iterator, the C<all_tokens> method tells
 320 the Tokenizer to parse the entire file and return all of the tokens
 321 in a single ARRAY reference.
 322
 323 It should be noted that C<all_tokens> does B<NOT> interfere with the
 324 use of the Tokenizer object as an iterator (does not modify the token
 325 cursor) and use of the two different mechanisms can be mixed safely.
 326
 327 Returns a reference to an ARRAY of L<PPI::Token> objects on success
 328 or throws an exception on error.
 329
 330 =cut
 331
 332 sub all_tokens {
 333         my $self = shift;
 334
 335         # Catch exceptions and return undef, so that we
 336         # can start to convert code to exception-based code.
 337         eval {
 338                 # Process lines until we get EOF
 339                 unless ( $self->{token_eof} ) {
 340                         my $rv;
 341                         while ( $rv = $self->_process_next_line ) {}
 342                         unless ( defined $rv ) {
 343                                 PPI::Exception->throw("Error while processing source");
 344                         }
 345
 346                         # Clean up the end of the tokenizer
 347                         $self->_clean_eof;
 348                 }
 349         };
 350         if ( $@ ) {
 351                 my $errstr = $@;
 352                 $errstr =~ s/^(.*) at line .+$/$1/;
 353                 PPI::Exception->throw( $errstr );
 354         }
 355
 356         # End of file, return a copy of the token array.
 357         return [ @{$self->{tokens}} ];
 358 }
 359
 360 =pod
 361
 362 =head2 increment_cursor
 363
 364 Although exposed as a public method, C<increment_method> is implemented
 365 for expert use only, when writing lexers or other components that work
 366 directly on token streams.
 367
 368 It manually increments the token cursor forward through the file, in effect
 369 "skipping" the next token.
 370
 371 Return true if the cursor is incremented, C<0> if already at the end of
 372 the file, or C<undef> on error.
 373
 374 =cut
 375
 376 sub increment_cursor {
 377         # Do this via the get_token method, which makes sure there
 378         # is actually a token there to move to.
 379         $_[0]->get_token and 1;
 380 }
 381
 382 =pod
 383
 384 =head2 decrement_cursor
 385
 386 Although exposed as a public method, C<decrement_method> is implemented
 387 for expert use only, when writing lexers or other components that work
 388 directly on token streams.
 389
 390 It manually decrements the token cursor backwards through the file, in
 391 effect "rolling back" the token stream. And indeed that is what it is
 392 primarily intended for, when the component that is consuming the token
 393 stream needs to implement some sort of "roll back" feature in its use
 394 of the token stream.
 395
 396 Return true if the cursor is decremented, C<0> if already at the
 397 beginning of the file, or C<undef> on error.
 398
 399 =cut
 400
 401 sub decrement_cursor {
 402         my $self = shift;
 403
 404         # Check for the beginning of the file
 405         return 0 unless $self->{token_cursor};
 406
 407         # Decrement the token cursor
 408         $self->{token_eof} = 0;
 409         --$self->{token_cursor};
 410 }
 411
 412
 413
 414
 415
 416 #####################################################################
 417 # Working With Source
 418
 419 # Fetches the next line from the input line buffer
 420 # Returns undef at EOF.
 421 sub _get_line {
 422         my $self = shift;
 423         return undef unless $self->{source}; # EOF hit previously
 424
 425         # Pull off the next line
 426         my $line = shift @{$self->{source}};
 427
 428         # Flag EOF if we hit it
 429         $self->{source} = undef unless defined $line;
 430
 431         # Return the line (or EOF flag)
 432         return $line; # string or undef
 433 }
 434
 435 # Fetches the next line, ready to process
 436 # Returns 1 on success
 437 # Returns 0 on EOF
 438 sub _fill_line {
 439         my $self   = shift;
 440         my $inscan = shift;
 441
 442         # Get the next line
 443         my $line = $self->_get_line;
 444         unless ( defined $line ) {
 445                 # End of file
 446                 unless ( $inscan ) {
 447                         delete $self->{line};
 448                         delete $self->{line_cursor};
 449                         delete $self->{line_length};
 450                         return 0;
 451                 }
 452
 453                 # In the scan version, just set the cursor to the end
 454                 # of the line, and the rest should just cascade out.
 455                 $self->{line_cursor} = $self->{line_length};
 456                 return 0;
 457         }
 458
 459         # Populate the appropriate variables
 460         $self->{line}        = $line;
 461         $self->{line_cursor} = -1;
 462         $self->{line_length} = length $line;
 463         $self->{line_count}++;
 464
 465         1;
 466 }
 467
 468 # Get the current character
 469 sub _char {
 470         my $self = shift;
 471         substr( $self->{line}, $self->{line_cursor}, 1 );
 472 }
 473
 474
 475
 476
 477
 478 ####################################################################
 479 # Per line processing methods
 480
 481 # Processes the next line
 482 # Returns 1 on success completion
 483 # Returns 0 if EOF
 484 # Returns undef on error
 485 sub _process_next_line {
 486         my $self = shift;
 487
 488         # Fill the line buffer
 489         my $rv;
 490         unless ( $rv = $self->_fill_line ) {
 491                 return undef unless defined $rv;
 492
 493                 # End of file, finalize last token
 494                 $self->_finalize_token;
 495                 return 0;
 496         }
 497
 498         # Run the __TOKENIZER__on_line_start
 499         $rv = $self->{class}->__TOKENIZER__on_line_start( $self );
 500         unless ( $rv ) {
 501                 # If there are no more source lines, then clean up
 502                 if ( ref $self->{source} eq 'ARRAY' and ! @{$self->{source}} ) {
 503                         $self->_clean_eof;
 504                 }
 505
 506                 # Defined but false means next line
 507                 return 1 if defined $rv;
 508                 PPI::Exception->throw("Error at line $self->{line_count}");
 509         }
 510
 511         # If we can't deal with the entire line, process char by char
 512         while ( $rv = $self->_process_next_char ) {}
 513         unless ( defined $rv ) {
 514                 PPI::Exception->throw("Error at line $self->{line_count}, character $self->{line_cursor}");
 515         }
 516
 517         # Trigger any action that needs to happen at the end of a line
 518         $self->{class}->__TOKENIZER__on_line_end( $self );
 519
 520         # If there are no more source lines, then clean up
 521         unless ( ref($self->{source}) eq 'ARRAY' and @{$self->{source}} ) {
 522                 return $self->_clean_eof;
 523         }
 524
 525         return 1;
 526 }
 527
 528
 529
 530
 531
 532 #####################################################################
 533 # Per-character processing methods
 534
 535 # Process on a per-character basis.
 536 # Note that due the the high number of times this gets
 537 # called, it has been fairly heavily in-lined, so the code
 538 # might look a bit ugly and duplicated.
 539 sub _process_next_char {
 540         my $self = shift;
 541
 542         ### FIXME - This checks for a screwed up condition that triggers
 543         ###         several warnings, amoungst other things.
 544         if ( ! defined $self->{line_cursor} or ! defined $self->{line_length} ) {
 545                 # $DB::single = 1;
 546                 return undef;
 547         }
 548
 549         # Increment the counter and check for end of line
 550         return 0 if ++$self->{line_cursor} >= $self->{line_length};
 551
 552         # Pass control to the token class
 553         my $result;
 554         unless ( $result = $self->{class}->__TOKENIZER__on_char( $self ) ) {
 555                 # undef is error. 0 is "Did stuff ourself, you don't have to do anything"
 556                 return defined $result ? 1 : undef;
 557         }
 558
 559         # We will need the value of the current character
 560         my $char = substr( $self->{line}, $self->{line_cursor}, 1 );
 561         if ( $result eq '1' ) {
 562                 # If __TOKENIZER__on_char returns 1, it is signaling that it thinks that
 563                 # the character is part of it.
 564
 565                 # Add the character
 566                 if ( defined $self->{token} ) {
 567                         $self->{token}->{content} .= $char;
 568                 } else {
 569                         defined($self->{token} = $self->{class}->new($char)) or return undef;
 570                 }
 571
 572                 return 1;
 573         }
 574
 575         # We have been provided with the name of a class
 576         if ( $self->{class} ne "PPI::Token::$result" ) {
 577                 # New class
 578                 $self->_new_token( $result, $char );
 579         } elsif ( defined $self->{token} ) {
 580                 # Same class as current
 581                 $self->{token}->{content} .= $char;
 582         } else {
 583                 # Same class, but no current
 584                 defined($self->{token} = $self->{class}->new($char)) or return undef;
 585         }
 586
 587         1;
 588 }
 589
 590
 591
 592
 593
 594 #####################################################################
 595 # Altering Tokens in Tokenizer
 596
 597 # Finish the end of a token.
 598 # Returns the resulting parse class as a convenience.
 599 sub _finalize_token {
 600         my $self = shift;
 601         return $self->{class} unless defined $self->{token};
 602
 603         # Add the token to the token buffer
 604         push @{ $self->{tokens} }, $self->{token};
 605         $self->{token} = undef;
 606
 607         # Return the parse class to that of the zone we are in
 608         $self->{class} = $self->{zone};
 609 }
 610
 611 # Creates a new token and sets it in the tokenizer
 612 # The defined() in here prevent a ton of calls to PPI::Util::TRUE
 613 sub _new_token {
 614         my $self = shift;
 615         # throw PPI::Exception() unless @_;
 616         my $class = substr( $_[0], 0, 12 ) eq 'PPI::Token::'
 617                 ? shift : 'PPI::Token::' . shift;
 618
 619         # Finalize any existing token
 620         $self->_finalize_token if defined $self->{token};
 621
 622         # Create the new token and update the parse class
 623         defined($self->{token} = $class->new($_[0])) or PPI::Exception->throw;
 624         $self->{class} = $class;
 625
 626         1;
 627 }
 628
 629 # At the end of the file, we need to clean up the results of the erroneous
 630 # space that we inserted at the beginning of the process.
 631 sub _clean_eof {
 632         my $self = shift;
 633
 634         # Finish any partially completed token
 635         $self->_finalize_token if $self->{token};
 636
 637         # Find the last token, and if it has no content, kill it.
 638         # There appears to be some evidence that such "null tokens" are
 639         # somehow getting created accidentally.
 640         my $last_token = $self->{tokens}->[ -1 ];
 641         unless ( length $last_token->{content} ) {
 642                 pop @{$self->{tokens}};
 643         }
 644
 645         # Now, if the last character of the last token is a space we added,
 646         # chop it off, deleting the token if there's nothing else left.
 647         if ( $self->{source_eof_chop} ) {
 648                 $last_token = $self->{tokens}->[ -1 ];
 649                 $last_token->{content} =~ s/ $//;
 650                 unless ( length $last_token->{content} ) {
 651                         # Popping token
 652                         pop @{$self->{tokens}};
 653                 }
 654
 655                 # The hack involving adding an extra space is now reversed, and
 656                 # now nobody will ever know. The perfect crime!
 657                 $self->{source_eof_chop} = '';
 658         }
 659
 660         1;
 661 }
 662
 663
 664
 665
 666
 667 #####################################################################
 668 # Utility Methods
 669
 670 # Context
 671 sub _last_token {
 672         $_[0]->{tokens}->[-1];
 673 }
 674
 675 sub _last_significant_token {
 676         my $self   = shift;
 677         my $cursor = $#{ $self->{tokens} };
 678         while ( $cursor >= 0 ) {
 679                 my $token = $self->{tokens}->[$cursor--];
 680                 return $token if $token->significant;
 681         }
 682
 683         # Nothing...
 684         PPI::Token::Whitespace->null;
 685 }
 686
 687 # Get an array ref of previous significant tokens.
 688 # Like _last_significant_token except it gets more than just one token
 689 # Returns array ref on success.
 690 # Returns 0 on not enough tokens
 691 sub _previous_significant_tokens {
 692         my $self   = shift;
 693         my $count  = shift || 1;
 694         my $cursor = $#{ $self->{tokens} };
 695
 696         my ($token, @tokens);
 697         while ( $cursor >= 0 ) {
 698                 $token = $self->{tokens}->[$cursor--];
 699                 if ( $token->significant ) {
 700                         push @tokens, $token;
 701                         return \@tokens if scalar @tokens >= $count;
 702                 }
 703         }
 704
 705         # Pad with empties
 706         foreach ( 1 .. ($count - scalar @tokens) ) {
 707                 push @tokens, PPI::Token::Whitespace->null;
 708         }
 709
 710         \@tokens;
 711 }
 712
 713 my %OBVIOUS_CLASS = (
 714         'PPI::Token::Symbol'              => 'operator',
 715         'PPI::Token::Magic'               => 'operator',
 716         'PPI::Token::Number'              => 'operator',
 717         'PPI::Token::ArrayIndex'          => 'operator',
 718         'PPI::Token::Quote::Double'       => 'operator',
 719         'PPI::Token::Quote::Interpolate'  => 'operator',
 720         'PPI::Token::Quote::Literal'      => 'operator',
 721         'PPI::Token::Quote::Single'       => 'operator',
 722         'PPI::Token::QuoteLike::Backtick' => 'operator',
 723         'PPI::Token::QuoteLike::Command'  => 'operator',
 724         'PPI::Token::QuoteLike::Readline' => 'operator',
 725         'PPI::Token::QuoteLike::Regexp'   => 'operator',
 726         'PPI::Token::QuoteLike::Words'    => 'operator',
 727 );
 728
 729 my %OBVIOUS_CONTENT = (
 730         '(' => 'operand',
 731         '{' => 'operand',
 732         '[' => 'operand',
 733         ';' => 'operand',
 734         '}' => 'operator',
 735 );
 736
 737 # Try to determine operator/operand context, is possible.
 738 # Returns "operator", "operand", or "" if unknown.
 739 sub _opcontext {
 740         my $self   = shift;
 741         my $tokens = $self->_previous_significant_tokens(1);
 742         my $p0     = $tokens->[0];
 743         my $c0     = ref $p0;
 744
 745         # Map the obvious cases
 746         return $OBVIOUS_CLASS{$c0}   if defined $OBVIOUS_CLASS{$c0};
 747         return $OBVIOUS_CONTENT{$p0} if defined $OBVIOUS_CONTENT{$p0};
 748
 749         # Most of the time after an operator, we are an operand
 750         return 'operand' if $p0->isa('PPI::Token::Operator');
 751
 752         # If there's NOTHING, it's operand
 753         return 'operand' if $p0->content eq '';
 754
 755         # Otherwise, we don't know
 756         return ''
 757 }
 758
 759 1;
 760
 761 =pod
 762
 763 =head1 NOTES
 764
 765 =head2 How the Tokenizer Works
 766
 767 Understanding the Tokenizer is not for the feint-hearted. It is by far
 768 the most complex and twisty piece of perl I've ever written that is actually
 769 still built properly and isn't a terrible spaghetti-like mess. In fact, you
 770 probably want to skip this section.
 771
 772 But if you really want to understand, well then here goes.
 773
 774 =head2 Source Input and Clean Up
 775
 776 The Tokenizer starts by taking source in a variety of forms, sucking it
 777 all in and merging into one big string, and doing our own internal line
 778 split, using a "universal line separator" which allows the Tokenizer to
 779 take source for any platform (and even supports a few known types of
 780 broken newlines caused by mixed mac/pc/*nix editor screw ups).
 781
 782 The resulting array of lines is used to feed the tokenizer, and is also
 783 accessed directly by the heredoc-logic to do the line-oriented part of
 784 here-doc support.
 785
 786 =head2 Doing Things the Old Fashioned Way
 787
 788 Due to the complexity of perl, and after 2 previously aborted parser
 789 attempts, in the end the tokenizer was fashioned around a line-buffered
 790 character-by-character method.
 791
 792 That is, the Tokenizer pulls and holds a line at a time into a line buffer,
 793 and then iterates a cursor along it. At each cursor position, a method is
 794 called in whatever token class we are currently in, which will examine the
 795 character at the current position, and handle it.
 796
 797 As the handler methods in the various token classes are called, they
 798 build up a output token array for the source code.
 799
 800 Various parts of the Tokenizer use look-ahead, arbitrary-distance
 801 look-behind (although currently the maximum is three significant tokens),
 802 or both, and various other heuristic guesses.
 803
 804 I've been told it is officially termed a I<"backtracking parser
 805 with infinite lookaheads">.
 806
 807 =head2 State Variables
 808
 809 Aside from the current line and the character cursor, the Tokenizer
 810 maintains a number of different state variables.
 811
 812 =over
 813
 814 =item Current Class
 815
 816 The Tokenizer maintains the current token class at all times. Much of the
 817 time is just going to be the "Whitespace" class, which is what the base of
 818 a document is. As the tokenizer executes the various character handlers,
 819 the class changes a lot as it moves a long. In fact, in some instances,
 820 the character handler may not handle the character directly itself, but
 821 rather change the "current class" and then hand off to the character
 822 handler for the new class.
 823
 824 Because of this, and some other things I'll deal with later, the number of
 825 times the character handlers are called does not in fact have a direct
 826 relationship to the number of actual characters in the document.
 827
 828 =item Current Zone
 829
 830 Rather than create a class stack to allow for infinitely nested layers of
 831 classes, the Tokenizer recognises just a single layer.
 832
 833 To put it a different way, in various parts of the file, the Tokenizer will
 834 recognise different "base" or "substrate" classes. When a Token such as a
 835 comment or a number is finalised by the tokenizer, it "falls back" to the
 836 base state.
 837
 838 This allows proper tokenization of special areas such as __DATA__
 839 and __END__ blocks, which also contain things like comments and POD,
 840 without allowing the creation of any significant Tokens inside these areas.
 841
 842 For the main part of a document we use L<PPI::Token::Whitespace> for this,
 843 with the idea being that code is "floating in a sea of whitespace".
 844
 845 =item Current Token
 846
 847 The final main state variable is the "current token". This is the Token
 848 that is currently being built by the Tokenizer. For certain types, it
 849 can be manipulated and morphed and change class quite a bit while being
 850 assembled, as the Tokenizer's understanding of the token content changes.
 851
 852 When the Tokenizer is confident that it has seen the end of the Token, it
 853 will be "finalized", which adds it to the output token array and resets
 854 the current class to that of the zone that we are currently in.
 855
 856 I should also note at this point that the "current token" variable is
 857 optional. The Tokenizer is capable of knowing what class it is currently
 858 set to, without actually having accumulated any characters in the Token.
 859
 860 =back
 861
 862 =head2 Making It Faster
 863
 864 As I'm sure you can imagine, calling several different methods for each
 865 character and running regexes and other complex heuristics made the first
 866 fully working version of the tokenizer extremely slow.
 867
 868 During testing, I created a metric to measure parsing speed called
 869 LPGC, or "lines per gigacycle" . A gigacycle is simple a billion CPU
 870 cycles on a typical single-core CPU, and so a Tokenizer running at
 871 "1000 lines per gigacycle" should generate around 1200 lines of tokenized
 872 code when running on a 1200 MHz processor.
 873
 874 The first working version of the tokenizer ran at only 350 LPGC, so to
 875 tokenize a typical large module such as L<ExtUtils::MakeMaker> took
 876 10-15 seconds. This sluggishness made it unpractical for many uses.
 877
 878 So in the current parser, there are multiple layers of optimisation
 879 very carefully built in to the basic. This has brought the tokenizer
 880 up to a more reasonable 1000 LPGC, at the expense of making the code
 881 quite a bit twistier.
 882
 883 =head2 Making It Faster - Whole Line Classification
 884
 885 The first step in the optimisation process was to add a hew handler to
 886 enable several of the more basic classes (whitespace, comments) to be
 887 able to be parsed a line at a time. At the start of each line, a
 888 special optional handler (only supported by a few classes) is called to
 889 check and see if the entire line can be parsed in one go.
 890
 891 This is used mainly to handle things like POD, comments, empty lines,
 892 and a few other minor special cases.
 893
 894 =head2 Making It Faster - Inlining
 895
 896 The second stage of the optimisation involved inlining a small
 897 number of critical methods that were repeated an extremely high number
 898 of times. Profiling suggested that there were about 1,000,000 individual
 899 method calls per gigacycle, and by cutting these by two thirds a significant
 900 speed improvement was gained, in the order of about 50%.
 901
 902 You may notice that many methods in the C<PPI::Tokenizer> code look
 903 very nested and long hand. This is primarily due to this inlining.
 904
 905 At around this time, some statistics code that existed in the early
 906 versions of the parser was also removed, as it was determined that
 907 it was consuming around 15% of the CPU for the entire parser, while
 908 making the core more complicated.
 909
 910 A judgment call was made that with the difficulties likely to be
 911 encountered with future planned enhancements, and given the relatively
 912 high cost involved, the statistics features would be removed from the
 913 Tokenizer.
 914
 915 =head2 Making It Faster - Quote Engine
 916
 917 Once inlining had reached diminishing returns, it became obvious from
 918 the profiling results that a huge amount of time was being spent
 919 stepping a char at a time though long, simple and "syntactically boring"
 920 code such as comments and strings.
 921
 922 The existing regex engine was expanded to also encompass quotes and
 923 other quote-like things, and a special abstract base class was added
 924 that provided a number of specialised parsing methods that would "scan
 925 ahead", looking out ahead to find the end of a string, and updating
 926 the cursor to leave it in a valid position for the next call.
 927
 928 This is also the point at which the number of character handler calls began
 929 to greatly differ from the number of characters. But it has been done
 930 in a way that allows the parser to retain the power of the original
 931 version at the critical points, while skipping through the "boring bits"
 932 as needed for additional speed.
 933
 934 The addition of this feature allowed the tokenizer to exceed 1000 LPGC
 935 for the first time.
 936
 937 =head2 Making It Faster - The "Complete" Mechanism
 938
 939 As it became evident that great speed increases were available by using
 940 this "skipping ahead" mechanism, a new handler method was added that
 941 explicitly handles the parsing of an entire token, where the structure
 942 of the token is relatively simple. Tokens such as symbols fit this case,
 943 as once we are passed the initial sigil and word char, we know that we
 944 can skip ahead and "complete" the rest of the token much more easily.
 945
 946 A number of these have been added for most or possibly all of the common
 947 cases, with most of these "complete" handlers implemented using regular
 948 expressions.
 949
 950 In fact, so many have been added that at this point, you could arguably
 951 reclassify the tokenizer as a "hybrid regex, char-by=char heuristic
 952 tokenizer". More tokens are now consumed in "complete" methods in a
 953 typical program than are handled by the normal char-by-char methods.
 954
 955 Many of the these complete-handlers were implemented during the writing
 956 of the Lexer, and this has allowed the full parser to maintain around
 957 1000 LPGC despite the increasing weight of the Lexer.
 958
 959 =head2 Making It Faster - Porting To C (In Progress)
 960
 961 While it would be extraordinarily difficult to port all of the Tokenizer
 962 to C, work has started on a L<PPI::XS> "accelerator" package which acts as
 963 a separate and automatically-detected add-on to the main PPI package.
 964
 965 L<PPI::XS> implements faster versions of a variety of functions scattered
 966 over the entire PPI codebase, from the Tokenizer Core, Quote Engine, and
 967 various other places, and implements them identically in XS/C.
 968
 969 In particular, the skip-ahead methods from the Quote Engine would appear
 970 to be extremely amenable to being done in C, and a number of other
 971 functions could be cherry-picked one at a time and implemented in C.
 972
 973 Each method is heavily tested to ensure that the functionality is
 974 identical, and a versioning mechanism is included to ensure that if a
 975 function gets out of sync, L<PPI::XS> will degrade gracefully and just
 976 not replace that single method.
 977
 978 =head1 TO DO
 979
 980 - Add an option to reset or seek the token stream...
 981
 982 - Implement more Tokenizer functions in L<PPI::XS>
 983
 984 =head1 SUPPORT
 985
 986 See the L<support section|PPI/SUPPORT> in the main module.
 987
 988 =head1 AUTHOR
 989
 990 Adam Kennedy E<lt>adamk@cpan.orgE<gt>
 991
 992 =head1 COPYRIGHT
 993
 994 Copyright 2001 - 2009 Adam Kennedy.
 995
 996 This program is free software; you can redistribute
 997 it and/or modify it under the same terms as Perl itself.
 998
 999 The full text of the license can be found in the
1000 LICENSE file included with this module.
1001
1002 =cut