Commit | Line | Data |
3fea05b9 |
1 | .\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.10) |
2 | .\" |
3 | .\" Standard preamble: |
4 | .\" ======================================================================== |
5 | .de Sp \" Vertical space (when we can't use .PP) |
6 | .if t .sp .5v |
7 | .if n .sp |
8 | .. |
9 | .de Vb \" Begin verbatim text |
10 | .ft CW |
11 | .nf |
12 | .ne \\$1 |
13 | .. |
14 | .de Ve \" End verbatim text |
15 | .ft R |
16 | .fi |
17 | .. |
18 | .\" Set up some character translations and predefined strings. \*(-- will |
19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left |
20 | .\" double quote, and \*(R" will give a right double quote. \*(C+ will |
21 | .\" give a nicer C++. Capital omega is used to do unbreakable dashes and |
22 | .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, |
23 | .\" nothing in troff, for use with C<>. |
24 | .tr \(*W- |
25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' |
26 | .ie n \{\ |
27 | . ds -- \(*W- |
28 | . ds PI pi |
29 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch |
30 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch |
31 | . ds L" "" |
32 | . ds R" "" |
33 | . ds C` "" |
34 | . ds C' "" |
35 | 'br\} |
36 | .el\{\ |
37 | . ds -- \|\(em\| |
38 | . ds PI \(*p |
39 | . ds L" `` |
40 | . ds R" '' |
41 | 'br\} |
42 | .\" |
43 | .\" Escape single quotes in literal strings from groff's Unicode transform. |
44 | .ie \n(.g .ds Aq \(aq |
45 | .el .ds Aq ' |
46 | .\" |
47 | .\" If the F register is turned on, we'll generate index entries on stderr for |
48 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index |
49 | .\" entries marked with X<> in POD. Of course, you'll have to process the |
50 | .\" output yourself in some meaningful fashion. |
51 | .ie \nF \{\ |
52 | . de IX |
53 | . tm Index:\\$1\t\\n%\t"\\$2" |
54 | .. |
55 | . nr % 0 |
56 | . rr F |
57 | .\} |
58 | .el \{\ |
59 | . de IX |
60 | .. |
61 | .\} |
62 | .\" |
63 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). |
64 | .\" Fear. Run. Save yourself. No user-serviceable parts. |
65 | . \" fudge factors for nroff and troff |
66 | .if n \{\ |
67 | . ds #H 0 |
68 | . ds #V .8m |
69 | . ds #F .3m |
70 | . ds #[ \f1 |
71 | . ds #] \fP |
72 | .\} |
73 | .if t \{\ |
74 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) |
75 | . ds #V .6m |
76 | . ds #F 0 |
77 | . ds #[ \& |
78 | . ds #] \& |
79 | .\} |
80 | . \" simple accents for nroff and troff |
81 | .if n \{\ |
82 | . ds ' \& |
83 | . ds ` \& |
84 | . ds ^ \& |
85 | . ds , \& |
86 | . ds ~ ~ |
87 | . ds / |
88 | .\} |
89 | .if t \{\ |
90 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" |
91 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' |
92 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' |
93 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' |
94 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' |
95 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' |
96 | .\} |
97 | . \" troff and (daisy-wheel) nroff accents |
98 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' |
99 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' |
100 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] |
101 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' |
102 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' |
103 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] |
104 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] |
105 | .ds ae a\h'-(\w'a'u*4/10)'e |
106 | .ds Ae A\h'-(\w'A'u*4/10)'E |
107 | . \" corrections for vroff |
108 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' |
109 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' |
110 | . \" for low resolution devices (crt and lpr) |
111 | .if \n(.H>23 .if \n(.V>19 \ |
112 | \{\ |
113 | . ds : e |
114 | . ds 8 ss |
115 | . ds o a |
116 | . ds d- d\h'-1'\(ga |
117 | . ds D- D\h'-1'\(hy |
118 | . ds th \o'bp' |
119 | . ds Th \o'LP' |
120 | . ds ae ae |
121 | . ds Ae AE |
122 | .\} |
123 | .rm #[ #] #H #V #F C |
124 | .\" ======================================================================== |
125 | .\" |
126 | .IX Title "PPI::Tokenizer 3" |
127 | .TH PPI::Tokenizer 3 "2009-08-08" "perl v5.8.7" "User Contributed Perl Documentation" |
128 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes |
129 | .\" way too many mistakes in technical documents. |
130 | .if n .ad l |
131 | .nh |
132 | .SH "NAME" |
133 | PPI::Tokenizer \- The Perl Document Tokenizer |
134 | .SH "SYNOPSIS" |
135 | .IX Header "SYNOPSIS" |
136 | .Vb 4 |
137 | \& # Create a tokenizer for a file, array or string |
138 | \& $Tokenizer = PPI::Tokenizer\->new( \*(Aqfilename.pl\*(Aq ); |
139 | \& $Tokenizer = PPI::Tokenizer\->new( \e@lines ); |
140 | \& $Tokenizer = PPI::Tokenizer\->new( \e$source ); |
141 | \& |
142 | \& # Return all the tokens for the document |
143 | \& my $tokens = $Tokenizer\->all_tokens; |
144 | \& |
145 | \& # Or we can use it as an iterator |
146 | \& while ( my $Token = $Tokenizer\->get_token ) { |
147 | \& print "Found token \*(Aq$Token\*(Aq\en"; |
148 | \& } |
149 | \& |
150 | \& # If we REALLY need to manually nudge the cursor, you |
151 | \& # can do that too (The lexer needs this ability to do rollbacks)
152 | \& $is_incremented = $Tokenizer\->increment_cursor; |
153 | \& $is_decremented = $Tokenizer\->decrement_cursor; |
154 | .Ve |
155 | .SH "DESCRIPTION" |
156 | .IX Header "DESCRIPTION" |
157 | PPI::Tokenizer is the class that provides Tokenizer objects for use in |
158 | breaking strings of Perl source code into Tokens. |
159 | .PP |
160 | By the time you are reading this, you probably need to know a little |
161 | about the difference between how perl parses Perl \*(L"code\*(R" and how \s-1PPI\s0 |
162 | parses Perl \*(L"documents\*(R".
163 | .PP |
164 | \&\*(L"perl\*(R" itself (the interpreter) uses a heavily modified lex specification |
165 | to specify its parsing logic, maintains several types of state as it |
166 | goes, and incrementally tokenizes, lexes \s-1AND\s0 \s-1EXECUTES\s0 at the same time. |
167 | .PP |
168 | In fact, it is provably impossible to use perl's parsing method without |
169 | simultaneously executing code. A formal mathematical proof has been |
170 | published demonstrating the method. |
171 | .PP |
172 | This is where the truism \*(L"Only perl can parse Perl\*(R" comes from. |
173 | .PP |
174 | \&\s-1PPI\s0 uses a completely different approach by abandoning the (impossible) |
175 | ability to parse Perl the same way that the interpreter does, and instead |
176 | parsing the source as a document, using a document structure independently
177 | derived from the Perl documentation and approximating the perl interpreter |
178 | interpretation as closely as possible. |
179 | .PP |
180 | It was touch and go for a long time whether we could get it close enough, |
181 | but in the end it turned out that it could be done. |
182 | .PP |
183 | In this approach, the tokenizer \f(CW\*(C`PPI::Tokenizer\*(C'\fR is implemented separately |
184 | from the lexer PPI::Lexer. |
185 | .PP |
186 | The job of \f(CW\*(C`PPI::Tokenizer\*(C'\fR is to take pure source as a string and break it |
187 | up into a stream/set of tokens, and contains most of the \*(L"black magic\*(R" used |
188 | in \s-1PPI\s0. By comparison, the lexer implements a relatively straightforward
189 | tree structure, and has an implementation that is uncomplicated (compared |
190 | to the insanity in the tokenizer at least). |
191 | .PP |
192 | The Tokenizer uses an immense amount of heuristics, guessing and cruft, |
193 | supported by a very \fB\s-1VERY\s0\fR flexible internal \s-1API\s0, but fortunately it was |
194 | possible to largely encapsulate the black magic, so there is not a lot that |
195 | gets exposed to people using the \f(CW\*(C`PPI::Tokenizer\*(C'\fR itself. |
196 | .SH "METHODS" |
197 | .IX Header "METHODS" |
198 | Despite the incredible complexity, the Tokenizer itself only exposes a |
199 | relatively small number of methods, with most of the complexity implemented |
200 | in private methods. |
201 | .ie n .SS "new $file | \e@lines | \e$source" |
202 | .el .SS "new \f(CW$file\fP | \e@lines | \e$source" |
203 | .IX Subsection "new $file | @lines | $source" |
204 | The main \f(CW\*(C`new\*(C'\fR constructor creates a new Tokenizer object. These |
205 | objects have no configuration parameters, and can only be used once, |
206 | to tokenize a single perl source file. |
207 | .PP |
208 | It takes as argument either a normal scalar containing source code, |
209 | a reference to a scalar containing source code, or a reference to an |
210 | \&\s-1ARRAY\s0 containing newline-terminated lines of source code. |
211 | .PP |
212 | Returns a new \f(CW\*(C`PPI::Tokenizer\*(C'\fR object on success, or throws a |
213 | PPI::Exception exception on error. |
214 | .SS "get_token" |
215 | .IX Subsection "get_token" |
216 | When using the PPI::Tokenizer object as an iterator, the \f(CW\*(C`get_token\*(C'\fR |
217 | method is the primary method that is used. It increments the cursor |
218 | and returns the next Token in the output array. |
219 | .PP |
220 | The actual parsing of the file is done only as-needed, and a line at |
221 | a time. When \f(CW\*(C`get_token\*(C'\fR hits the end of the token array, it will |
222 | cause the parser to pull in the next line and parse it, continuing |
223 | as needed until there are more tokens on the output array that |
224 | get_token can then return. |
225 | .PP |
226 | This means that a number of Tokenizer objects can be created, and |
227 | won't consume significant \s-1CPU\s0 until you actually begin to pull tokens |
228 | from it. |
229 | .PP |
230 | Return a PPI::Token object on success, \f(CW0\fR if the Tokenizer had |
231 | reached the end of the file, or \f(CW\*(C`undef\*(C'\fR on error. |
232 | .SS "all_tokens" |
233 | .IX Subsection "all_tokens" |
234 | When not being used as an iterator, the \f(CW\*(C`all_tokens\*(C'\fR method tells |
235 | the Tokenizer to parse the entire file and return all of the tokens |
236 | in a single \s-1ARRAY\s0 reference. |
237 | .PP |
238 | It should be noted that \f(CW\*(C`all_tokens\*(C'\fR does \fB\s-1NOT\s0\fR interfere with the |
239 | use of the Tokenizer object as an iterator (does not modify the token |
240 | cursor) and use of the two different mechanisms can be mixed safely. |
241 | .PP |
242 | Returns a reference to an \s-1ARRAY\s0 of PPI::Token objects on success |
243 | or throws an exception on error. |
244 | .SS "increment_cursor" |
245 | .IX Subsection "increment_cursor" |
246 | Although exposed as a public method, \f(CW\*(C`increment_cursor\*(C'\fR is implemented
247 | for expert use only, when writing lexers or other components that work |
248 | directly on token streams. |
249 | .PP |
250 | It manually increments the token cursor forward through the file, in effect |
251 | \&\*(L"skipping\*(R" the next token. |
252 | .PP |
253 | Return true if the cursor is incremented, \f(CW0\fR if already at the end of |
254 | the file, or \f(CW\*(C`undef\*(C'\fR on error. |
255 | .SS "decrement_cursor" |
256 | .IX Subsection "decrement_cursor" |
257 | Although exposed as a public method, \f(CW\*(C`decrement_cursor\*(C'\fR is implemented
258 | for expert use only, when writing lexers or other components that work |
259 | directly on token streams. |
260 | .PP |
261 | It manually decrements the token cursor backwards through the file, in |
262 | effect \*(L"rolling back\*(R" the token stream. And indeed that is what it is |
263 | primarily intended for, when the component that is consuming the token |
264 | stream needs to implement some sort of \*(L"roll back\*(R" feature in its use |
265 | of the token stream. |
266 | .PP |
267 | Return true if the cursor is decremented, \f(CW0\fR if already at the |
268 | beginning of the file, or \f(CW\*(C`undef\*(C'\fR on error. |
269 | .SH "NOTES" |
270 | .IX Header "NOTES" |
271 | .SS "How the Tokenizer Works" |
272 | .IX Subsection "How the Tokenizer Works" |
273 | Understanding the Tokenizer is not for the faint-hearted. It is by far
274 | the most complex and twisty piece of perl I've ever written that is actually |
275 | still built properly and isn't a terrible spaghetti-like mess. In fact, you |
276 | probably want to skip this section. |
277 | .PP |
278 | But if you really want to understand, well then here goes. |
279 | .SS "Source Input and Clean Up" |
280 | .IX Subsection "Source Input and Clean Up" |
281 | The Tokenizer starts by taking source in a variety of forms, sucking it |
282 | all in and merging into one big string, and doing our own internal line |
283 | split, using a \*(L"universal line separator\*(R" which allows the Tokenizer to |
284 | take source for any platform (and even supports a few known types of |
285 | broken newlines caused by mixed mac/pc/*nix editor screw ups). |
286 | .PP |
287 | The resulting array of lines is used to feed the tokenizer, and is also |
288 | accessed directly by the heredoc-logic to do the line-oriented part of |
289 | here-doc support. |
290 | .SS "Doing Things the Old Fashioned Way" |
291 | .IX Subsection "Doing Things the Old Fashioned Way" |
292 | Due to the complexity of perl, and after 2 previously aborted parser |
293 | attempts, in the end the tokenizer was fashioned around a line-buffered |
294 | character-by-character method. |
295 | .PP |
296 | That is, the Tokenizer pulls and holds a line at a time into a line buffer, |
297 | and then iterates a cursor along it. At each cursor position, a method is |
298 | called in whatever token class we are currently in, which will examine the |
299 | character at the current position, and handle it. |
300 | .PP |
301 | As the handler methods in the various token classes are called, they |
302 | build up an output token array for the source code.
303 | .PP |
304 | Various parts of the Tokenizer use look-ahead, arbitrary-distance |
305 | look-behind (although currently the maximum is three significant tokens), |
306 | or both, and various other heuristic guesses. |
307 | .PP |
308 | I've been told it is officially termed a \fI\*(L"backtracking parser |
309 | with infinite lookaheads\*(R"\fR. |
310 | .SS "State Variables" |
311 | .IX Subsection "State Variables" |
312 | Aside from the current line and the character cursor, the Tokenizer |
313 | maintains a number of different state variables. |
314 | .IP "Current Class" 4 |
315 | .IX Item "Current Class" |
316 | The Tokenizer maintains the current token class at all times. Much of the |
317 | time is just going to be the \*(L"Whitespace\*(R" class, which is what the base of |
318 | a document is. As the tokenizer executes the various character handlers, |
319 | the class changes a lot as it moves along. In fact, in some instances,
320 | the character handler may not handle the character directly itself, but |
321 | rather change the \*(L"current class\*(R" and then hand off to the character |
322 | handler for the new class. |
323 | .Sp |
324 | Because of this, and some other things I'll deal with later, the number of |
325 | times the character handlers are called does not in fact have a direct |
326 | relationship to the number of actual characters in the document. |
327 | .IP "Current Zone" 4 |
328 | .IX Item "Current Zone" |
329 | Rather than create a class stack to allow for infinitely nested layers of |
330 | classes, the Tokenizer recognises just a single layer. |
331 | .Sp |
332 | To put it a different way, in various parts of the file, the Tokenizer will |
333 | recognise different \*(L"base\*(R" or \*(L"substrate\*(R" classes. When a Token such as a |
334 | comment or a number is finalised by the tokenizer, it \*(L"falls back\*(R" to the |
335 | base state. |
336 | .Sp |
337 | This allows proper tokenization of special areas such as _\|_DATA_\|_ |
338 | and _\|_END_\|_ blocks, which also contain things like comments and \s-1POD\s0, |
339 | without allowing the creation of any significant Tokens inside these areas. |
340 | .Sp |
341 | For the main part of a document we use PPI::Token::Whitespace for this, |
342 | with the idea being that code is \*(L"floating in a sea of whitespace\*(R". |
343 | .IP "Current Token" 4 |
344 | .IX Item "Current Token" |
345 | The final main state variable is the \*(L"current token\*(R". This is the Token |
346 | that is currently being built by the Tokenizer. For certain types, it |
347 | can be manipulated and morphed and change class quite a bit while being |
348 | assembled, as the Tokenizer's understanding of the token content changes. |
349 | .Sp |
350 | When the Tokenizer is confident that it has seen the end of the Token, it |
351 | will be \*(L"finalized\*(R", which adds it to the output token array and resets |
352 | the current class to that of the zone that we are currently in. |
353 | .Sp |
354 | I should also note at this point that the \*(L"current token\*(R" variable is |
355 | optional. The Tokenizer is capable of knowing what class it is currently |
356 | set to, without actually having accumulated any characters in the Token. |
357 | .SS "Making It Faster" |
358 | .IX Subsection "Making It Faster" |
359 | As I'm sure you can imagine, calling several different methods for each |
360 | character and running regexes and other complex heuristics made the first |
361 | fully working version of the tokenizer extremely slow. |
362 | .PP |
363 | During testing, I created a metric to measure parsing speed called |
364 | \&\s-1LPGC\s0, or \*(L"lines per gigacycle\*(R". A gigacycle is simply a billion \s-1CPU\s0
365 | cycles on a typical single-core \s-1CPU\s0, and so a Tokenizer running at |
366 | \&\*(L"1000 lines per gigacycle\*(R" should generate around 1200 lines of tokenized |
367 | code per second when running on a 1200 MHz processor.
368 | .PP |
369 | The first working version of the tokenizer ran at only 350 \s-1LPGC\s0, so to |
370 | tokenize a typical large module such as ExtUtils::MakeMaker took |
371 | 10\-15 seconds. This sluggishness made it impractical for many uses.
372 | .PP |
373 | So in the current parser, there are multiple layers of optimisation |
374 | very carefully built into the basic code. This has brought the tokenizer
375 | up to a more reasonable 1000 \s-1LPGC\s0, at the expense of making the code |
376 | quite a bit twistier. |
377 | .SS "Making It Faster \- Whole Line Classification" |
378 | .IX Subsection "Making It Faster - Whole Line Classification" |
379 | The first step in the optimisation process was to add a new handler to
380 | enable several of the more basic classes (whitespace, comments) to be |
381 | able to be parsed a line at a time. At the start of each line, a |
382 | special optional handler (only supported by a few classes) is called to |
383 | check and see if the entire line can be parsed in one go. |
384 | .PP |
385 | This is used mainly to handle things like \s-1POD\s0, comments, empty lines, |
386 | and a few other minor special cases. |
387 | .SS "Making It Faster \- Inlining" |
388 | .IX Subsection "Making It Faster - Inlining" |
389 | The second stage of the optimisation involved inlining a small |
390 | number of critical methods that were repeated an extremely high number |
391 | of times. Profiling suggested that there were about 1,000,000 individual |
392 | method calls per gigacycle, and by cutting these by two thirds a significant |
393 | speed improvement was gained, in the order of about 50%. |
394 | .PP |
395 | You may notice that many methods in the \f(CW\*(C`PPI::Tokenizer\*(C'\fR code look |
396 | very nested and long hand. This is primarily due to this inlining. |
397 | .PP |
398 | At around this time, some statistics code that existed in the early |
399 | versions of the parser was also removed, as it was determined that |
400 | it was consuming around 15% of the \s-1CPU\s0 for the entire parser, while |
401 | making the core more complicated. |
402 | .PP |
403 | A judgment call was made that with the difficulties likely to be |
404 | encountered with future planned enhancements, and given the relatively |
405 | high cost involved, the statistics features would be removed from the |
406 | Tokenizer. |
407 | .SS "Making It Faster \- Quote Engine" |
408 | .IX Subsection "Making It Faster - Quote Engine" |
409 | Once inlining had reached diminishing returns, it became obvious from |
410 | the profiling results that a huge amount of time was being spent |
411 | stepping a char at a time though long, simple and \*(L"syntactically boring\*(R" |
412 | code such as comments and strings. |
413 | .PP |
414 | The existing regex engine was expanded to also encompass quotes and |
415 | other quote-like things, and a special abstract base class was added |
416 | that provided a number of specialised parsing methods that would \*(L"scan |
417 | ahead\*(R", looking out ahead to find the end of a string, and updating |
418 | the cursor to leave it in a valid position for the next call. |
419 | .PP |
420 | This is also the point at which the number of character handler calls began |
421 | to greatly differ from the number of characters. But it has been done |
422 | in a way that allows the parser to retain the power of the original |
423 | version at the critical points, while skipping through the \*(L"boring bits\*(R" |
424 | as needed for additional speed. |
425 | .PP |
426 | The addition of this feature allowed the tokenizer to exceed 1000 \s-1LPGC\s0 |
427 | for the first time. |
428 | .ie n .SS "Making It Faster \- The ""Complete"" Mechanism" |
429 | .el .SS "Making It Faster \- The ``Complete'' Mechanism" |
430 | .IX Subsection "Making It Faster - The Complete Mechanism" |
431 | As it became evident that great speed increases were available by using |
432 | this \*(L"skipping ahead\*(R" mechanism, a new handler method was added that |
433 | explicitly handles the parsing of an entire token, where the structure |
434 | of the token is relatively simple. Tokens such as symbols fit this case, |
435 | as once we are past the initial sigil and word char, we know that we
436 | can skip ahead and \*(L"complete\*(R" the rest of the token much more easily. |
437 | .PP |
438 | A number of these have been added for most or possibly all of the common |
439 | cases, with most of these \*(L"complete\*(R" handlers implemented using regular |
440 | expressions. |
441 | .PP |
442 | In fact, so many have been added that at this point, you could arguably |
443 | reclassify the tokenizer as a \*(L"hybrid regex, char\-by\-char heuristic
444 | tokenizer\*(R". More tokens are now consumed in \*(L"complete\*(R" methods in a |
445 | typical program than are handled by the normal char-by-char methods. |
446 | .PP |
447 | Many of these complete-handlers were implemented during the writing
448 | of the Lexer, and this has allowed the full parser to maintain around |
449 | 1000 \s-1LPGC\s0 despite the increasing weight of the Lexer. |
450 | .SS "Making It Faster \- Porting To C (In Progress)" |
451 | .IX Subsection "Making It Faster - Porting To C (In Progress)" |
452 | While it would be extraordinarily difficult to port all of the Tokenizer |
453 | to C, work has started on a \s-1PPI::XS\s0 \*(L"accelerator\*(R" package which acts as |
454 | a separate and automatically-detected add-on to the main \s-1PPI\s0 package. |
455 | .PP |
456 | \&\s-1PPI::XS\s0 implements faster versions of a variety of functions scattered |
457 | over the entire \s-1PPI\s0 codebase, from the Tokenizer Core, Quote Engine, and |
458 | various other places, and implements them identically in \s-1XS/C\s0. |
459 | .PP |
460 | In particular, the skip-ahead methods from the Quote Engine would appear |
461 | to be extremely amenable to being done in C, and a number of other |
462 | functions could be cherry-picked one at a time and implemented in C. |
463 | .PP |
464 | Each method is heavily tested to ensure that the functionality is |
465 | identical, and a versioning mechanism is included to ensure that if a |
466 | function gets out of sync, \s-1PPI::XS\s0 will degrade gracefully and just |
467 | not replace that single method. |
468 | .SH "TO DO" |
469 | .IX Header "TO DO" |
470 | \&\- Add an option to reset or seek the token stream... |
471 | .PP |
472 | \&\- Implement more Tokenizer functions in \s-1PPI::XS\s0 |
473 | .SH "SUPPORT" |
474 | .IX Header "SUPPORT" |
475 | See the support section in the main module. |
476 | .SH "AUTHOR" |
477 | .IX Header "AUTHOR" |
478 | Adam Kennedy <adamk@cpan.org> |
479 | .SH "COPYRIGHT" |
480 | .IX Header "COPYRIGHT" |
481 | Copyright 2001 \- 2009 Adam Kennedy. |
482 | .PP |
483 | This program is free software; you can redistribute |
484 | it and/or modify it under the same terms as Perl itself. |
485 | .PP |
486 | The full text of the license can be found in the |
487 | \&\s-1LICENSE\s0 file included with this module. |