From: Rafael Garcia-Suarez Date: Thu, 8 Jun 2006 14:11:29 +0000 (+0000) Subject: Add the perlreguts manpage, by Yves Orton X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b23a565decf7acb33d46fc5bb7bed5ad79774efe;p=p5sagit%2Fp5-mst-13.2.git Add the perlreguts manpage, by Yves Orton p4raw-id: //depot/perl@28372 --- diff --git a/MANIFEST b/MANIFEST index 96faac6..92ac32d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1007,9 +1007,9 @@ ext/re/re_comp.h re extension wrapper for regcomp.h ext/re/re.pm re extension Perl module ext/re/re_top.h re extension symbol hiding header ext/re/re.xs re extension external subroutines -ext/re/t/re.t see if re pragma works ext/re/t/regop.pl generate debug output for various patterns ext/re/t/regop.t test RE optimizations by scraping debug output +ext/re/t/re.t see if re pragma works ext/Safe/t/safe1.t See if Safe works ext/Safe/t/safe2.t See if Safe works ext/Safe/t/safe3.t See if Safe works @@ -1220,8 +1220,8 @@ ext/XS/APItest/MANIFEST XS::APItest extension ext/XS/APItest/README XS::APItest extension ext/XS/APItest/t/call.t XS::APItest extension ext/XS/APItest/t/exception.t XS::APItest extension -ext/XS/APItest/t/my_cxt.t XS::APItest: test MY_CXT interface ext/XS/APItest/t/hash.t XS::APItest: tests for hash related APIs +ext/XS/APItest/t/my_cxt.t XS::APItest: test MY_CXT interface ext/XS/APItest/t/op.t XS::APItest: tests for OP related APIs ext/XS/APItest/t/printf.t XS::APItest extension ext/XS/APItest/t/push.t XS::APItest extension @@ -2947,6 +2947,7 @@ pod/perlport.pod Perl portability guide pod/perlpragma.pod Perl modules: writing a user pragma pod/perlref.pod Perl references, the rest of the story pod/perlreftut.pod Perl references short introduction +pod/perlreguts.pod Perl regular expression engine internals pod/perlre.pod Perl regular expressions, the rest of the story pod/perlrequick.pod Perl regular expressions quick start pod/perlreref.pod Perl regular expressions quick reference diff --git a/pod.lst b/pod.lst index 57014e0..8b6bd01 100644 --- a/pod.lst +++ b/pod.lst @@ -107,6 +107,7 @@ h Internals and C Language Interface perlclib Internal replacements for standard C library functions perlguts Perl internal functions for those doing extensions perlcall Perl calling conventions from C + perlreguts Perl regular expression engine internals perlapi Perl API listing (autogenerated) perlintern Perl internal functions (autogenerated) diff --git a/pod/perl.pod b/pod/perl.pod index e00a758..114c18e 100644 --- a/pod/perl.pod +++ b/pod/perl.pod @@ -124,6 +124,7 @@ For ease of access, the Perl manual has been split up into several sections. perlclib Internal replacements for standard C library functions perlguts Perl internal functions for those doing extensions perlcall Perl calling conventions from C + perlreguts Perl regular expression engine internals perlapi Perl API listing (autogenerated) perlintern Perl internal functions (autogenerated) diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod new file mode 100644 index 0000000..ce9f3c0 --- /dev/null +++ b/pod/perlreguts.pod @@ -0,0 +1,722 @@ +=head1 NAME + +perlreguts - Description of the Perl regular expression engine. + +=head1 DESCRIPTION + +This document is an attempt to shine some light on the guts of the regex +engine and how it works. The regex engine represents a signifigant chunk +of the perl codebase, but is relatively poorly understood. This document +is a meagre attempt at addressing this situation. It is derived from the +authors experience, comments in the source code, other papers on the +regex engine, feedback in p5p, and no doubt other places as well. + +B It should be clearly understood that this document +represents the state of the regex engine as the author understands it at +the time of writing. It is B an API definition, it is purely an +internals guide for those who want to hack the regex engine, or +understand how the regex engine works. Readers of this document are +expected to understand perls regex syntax and its usage in detail, if +you are a beginner you are in the wrong the place. + +=head1 OVERVIEW + +=head2 A quick note on terms + +There is some debate as to whether to say 'regexp' or 'regex'. In this +document we will use the term "regex" unless there is a special reason +not to, and then we will explain why. + +When speaking about regexes we need to distinguish between their source +code form and their internal form. In this document we will use the term +"pattern" when we speak of their textual, source code form, the term +"program" when we speak of their internal representation. These +correspond to the terms C and C that Mark Jason +Dominus employs in his paper on "Rx"[1]. + +=head2 What is a regular expression engine? + +A regular expression engine is a program whose job is to efficiently +find a section of a string that matches a set criteria of criteria. The +criteria is expressed in text using a formal language. See perlre for a +full definition of the language. + +So the job in less grandiose terms is to some turn a pattern into +something the computer can efficiently use to find the matching point in +the string. + +To do this we need to produce a program by parsing the text. We then +need to execute the program to find the point in the string that +matches. And we need to do the whole thing efficiently. + +=head2 Structure of a Regexp Program + +=head3 High Level + +Although it is a bit confusing and some object to the terminology it +is worth taking a look at a comment that has +been in regexp.h for years: + +I + +The term "railroad normal form" is a bit esoteric, with "syntax +diagram/charts", or "railroad diagram/charts" being more common terms. +Nevertheless it provides a useful mental image of a regex program: Each +node can be thought of as a unit of track, with a single entry and in +most cases a single exit point (there are pieces of track that fork, but +statistically not many), and the total forms a system of track with a +single entry and single exit point. The matching process can be thought +of as a car that moves on the track, with the particular route through +the system being determined by the character read at each possible +connector point. A car can roll off the track at any point but it may +not procede unless it matches the track... + +Thus the pattern C can be thought of as the +following chart: + + [start] + | + + | + +---+---+ + | | | + <\w+> | <\s+> + | <\d+> | + | | | + +---+---+ + | + + | + [end] + +The truth of the matter is that perls regular expressions these days are +way beyond such models, but they can help when trying to get your +bearings, and they do match pretty closely with the current +implementation. + +To be more precise we will say that a regex program is an encoding +of a graph. Each node in the graph corresponds to part of +the original regex pattern, such as a literal string or a branch, +and has a pointer to the nodes representing the next component +to be matched. Since "node" and opcode are overloaded terms in the +perl source, we will call the nodes in a regex program 'regops'. + +The program is represented by an array of regnode structures, one or +more of which together represent a single regop of the program. Struct +regnode is the smallest struct needed and has a field structure which is +shared with all the other larger structures. + +"Next" pointers of all regops except BRANCH implement concatenation; a +"next" pointer with a BRANCH on both ends of it is connecting two +alternatives. [Here we have one of the subtle syntax dependencies: an +individual BRANCH (as opposed to a collection of them) is never +concatenated with anything because of operator precedence. + +The operand of some types of regop is a literal string; for others, +it is a regop leading into a sub-program. In particular, the operand +of a BRANCH node is the first regop of the branch. + +B: As the railroad metaphor suggests this is B a tree +structure: the tail of the branch connects to the thing following the +set of BRANCHes. It is a like a single line of railway track that +splits as it goes into a station or railway yard and rejoins as it comes +out the other side. + +=head3 Regops + +The base structure of a regop is defined in regexp.h as follows: + + struct regnode { + U8 flags; /* Various purposes, sometimes overriden */ + U8 type; /* Opcode value as specified by regnodes.h */ + U16 next_off; /* Offset in size regnode */ + }; + +Other larger regnode-like structures are defined in regcomp.h. They +are almost like subclasses in that they have the same fields as +regnode, with possibly additional fields following in +the structure, and in some cases the specific meaning (and name) +of some of base fields are overriden. The following is a more +complete description. + +=over 4 + +=item regnode_1 + +=item regnode_2 + +regnode_1 structures have the same header, followed by a single +four-byte argument; regnode_2 structures contain two two-byte +arguments instead: + + regnode_1 U32 arg1; + regnode_2 U16 arg1; U16 arg2; + +=item regnode_string + +regnode_string structures, used for literal strings, follow the header +with a one-byte length and then the string data. Strings are padded on +the end with zero bytes so that the total length of the node is a +multiple of four bytes: + + regnode_string char string[1]; + U8 str_len; (overides flags) + +=item regnode_charclass + +character classes are represented by regnode_charclass structures, +which have a four-byte argument and then a 32-byte (256-bit) bitmap +indicating which characters are included in the class. + + regnode_charclass U32 arg1; + char bitmap[ANYOF_BITMAP_SIZE]; + +=item regnode_charclass_class + +There is also a larger form of a char class structure used to represent +POSIX char classes called regnode_charclass_class which contains the +same fields plus an additional 4-byte (32-bit) bitmap indicating which +POSIX char class have been included. + + regnode_charclass_class U32 arg1; + char bitmap[ANYOF_BITMAP_SIZE]; + char classflags[ANYOF_CLASSBITMAP_SIZE]; + +=back + +regnodes.h defines an array called regarglen[] which gives the size +of each opcode in units of size regnode (4-byte). A macro is used +to calculate the size of an EXACT node based on its C field. + +The opcodes are defined in regnodes.h which is generated from +regcomp.sym by regcomp.pl. Currently the maximum possible number +of distinct opcodes is restricted to 256, with about 1/4 already +used. + +There's a set of macros provided to make accessing the fields +easier and more consistent. These include C which is used to tell +the type of a regnode-like structure, NEXT_OFF() which is the offset to +the next node (more on this later), ARG(), ARG1(), ARG2(), ARG_SET(), +and equivelents for reading and setting the arguments, STR_LEN(), +STRING(), and OPERAND() for manipulating strings and regop bearing +types. + +=head3 What opcode is next? + +There are three distinct concepts of "next" in the regex engine, and +it is important to keep them clear. + +=over 4 + +=item * + +There is the "next regnode" from a given regnode, a value which is +rarely useful except that sometimes it matches up in terms of value +with one of the others, and that sometimes the code assumes this to +always be so. + +=item * + +There is the "next opcode" from a given opcode/regnode. This is the +opcode physically located after the the current one, as determined by +the size of the current opcode. This is often useful, such as when +dumping the structure we use this order to traverse. Sometimes the code +assumes that the "next regnode" is the same as the "next opcode", or in +other words assumes that the sizeof a given opcode type is always going +to be 1 regnode large. + +=item * + +There is the "regnext" from a given opcode. This is the opcode which +is reached by jumping forward by the value of NEXT_OFF(), +or in a few cases for longer jumps by the arg1 field of the regnode_1 +structure. The subroutine regnext() handles this transparently. +This is the logical successor of the node, which in some cases, like +that of the BRANCH opcode, has special meaning. + +=back + +=head1 PROCESS OVERVIEW + +Broadly speaking performing a match of a string against a pattern +involves the following steps + + A. Compilation + 1. Parsing for size + 2. Parsing for construction + 3. Peep-hole Optimisation and Analysis + B. Execution + 4. Start position and no-match optimisations + 5. Program execution + +Where these steps occur in the actual execution of a perl program is +determined by whether the pattern involves interpolating any string +variables. If it does then compilation happens at run time. If it +doesn't then it happens at compile time. (The C modifier changes this, +as does C to a certain extent.) The engine doesn't really care that +much. + +=head2 Compilation + +This code exists primarily in regcomp.c, along with the header files +regcomp.h, regexp.h, regnodes.h. + +Compilation starts with C, which is mostly an initialization +wrapper which farms out two other routines for the heavy lifting. The +first being C which is the start point for parsing, and +C which is responsible for optimisation. + +Initialization in C mostly involves the creation and data +filling of a special structure RExC_state_t, (defined in regcomp.c). +Almost all internally used routines in regcomp.h take a pointer to one +of these structures as their first argument, with the name *pRExC_state. +This structure is used to store the compilation state and contains many +fields. Likewise their are many macros defined which operate on this +variable. Anything that looks like RExC_xxxx is a macro that operates on +this pointer/structure. + +=head3 Parsing for size + +In this pass the input pattern is parsed in order to calculate how much +space is needed for each opcode we would need to emit. The size is also +used to determine whether long jumps will be required in the program. + +This stage is controlled by the macro SIZE_ONLY being set. + +The parse procedes pretty much exactly as it does during the +construction phase except that most routines are shortcircuited to +change the size field RExC_size and not actually do anything. + +=head3 Parsing for construcution + +Once the size of the program has been determine the pattern is parsed +again, but this time for real. Now SIZE_ONLY will be false, and the +actual construction can occur. + +C is the start of the parse process. It is responsible for +parsing an arbitrary chunk of pattern up to either the end of the +string, or the first closing parenthesis it encounters in the pattern. +This means it can be used to parse the toplevel regex, or any section +inside of a grouping parenthesis. It also handles the "special parens" +that perls regexes have. For instance when parsing C C +will at one point be called to parse from the '?' symbol up to and +including the ')'. + +Additionally C is responsible for parsing the one or more +branches from the pattern, and for "finishing them off" by correctly +setting their next pointers. In order to do the parsing it repeatedly +calls out to C which is responsible for handling up to the +first C<|> symbol it sees. + +C in turn calls C which is responsible for +handling "things" followed by a quantifier. In order to parse the +"things" C is called. This is the lowest level routine which +is responsible for parsing out constant strings, char classes, and the +various special symbols like C<$>. If C encounters a '(' +character it in turn calls C. + +The routine C is called by both C, C +in order to "set the tail pointer" correctly. When executing and +we get to the end of a branch we need to go to node following the +grouping parens. When parsing however we don't know where the end will +be until we get there, so when we do we must go back and update the +offsets as appropriate. C is used to make this easier. + +A subtlety of the parse process means that a regex like C is +originally parsed into an alternation with a single branch. It is only +afterwards that the optimizer converts single branch alternations into the +simpler form. + +=head3 Parse Call Graph and a Grammar + +The call graph looks like this: + + reg() # parse a top level regex, or inside of parens + regbranch() # parse a single branch of an alternation + regpiece() # parse a pattern followed by a quantifier + regatom() # parse a simple pattern + regclass() # used to handle a class + reg() # used to handle a parenthesized subpattern + .... + ... + regtail() # finish off the branch + ... + regtail() # finish off the branch sequence. Tie each + # branches tail to the tail of the sequence + # (NEW) In Debug mode this is + # regtail_study(). + +A grammar form might be something like this: + + atom : constant | class + quant : '*' | '+' | '?' | '{min,max}' + _branch: piece + | piece _branch + | nothing + branch: _branch + | _branch '|' branch + group : '(' branch ')' + _piece: atom | group + piece : _piece + | _piece quant + +=head3 Debug Output + +In bleadperl you can C<< use re Debug => 'PARSE'; >> to see some trace +information about the parse process. We will start with some simple +patterns and build up to more complex patterns. + +So when we parse C we see something like the following table. The +left shows whats being parsed, the number indicates where the next regop +would go. The stuff on the right is the trace output of the graph. The +names are chosen to be short to make it less dense on the screen. 'tsdy' +is a special form of C which does some extra analysis. + + >foo< 1 reg + brnc + piec + atom + >< 4 tsdy~ EXACT (EXACT) (1) + ~ attach to END (3) offset to 2 + +The resulting program then looks like: + + 1: EXACT (3) + 3: END(0) + +As you can see, even though we parsed out a branch and a piece, it was ultimately +only an atom. The final program shows us how things work. We have an EXACT regop, +followed by an END regop. The number in parens indicates where the 'regnext' of +the node goes. The 'regnext' of an END regop is unused, as END regops mean +we have successfully matched. The number on the left indicates the position of +the regop in the regnode array. + +Now lets try a harder pattern. We will add a quantifier so we have the pattern +C. We will see that C calls C regpiece twice. + + >foo+< 1 reg + brnc + piec + atom + >o+< 3 piec + atom + >< 6 tail~ EXACT (1) + 7 tsdy~ EXACT (EXACT) (1) + ~ PLUS (END) (3) + ~ attach to END (6) offset to 3 + +And we end up with the program: + + 1: EXACT (3) + 3: PLUS(6) + 4: EXACT (0) + 6: END(0) + +Now we have a special case. The EXACT regop has a regnext of 0. This is +because if it matches it should try to match itself again. The PLUS regop +handles the actual failure of the EXACT regop and acts appropriately (going +to regnode 6 if the EXACT matched at least once, or failing if it didn't.) + +Now for something much more complex: C + + >x(?:foo*|b... 1 reg + brnc + piec + atom + >(?:foo*|b[... 3 piec + atom + >?:foo*|b[a... reg + >foo*|b[a][... brnc + piec + atom + >o*|b[a][rR... 5 piec + atom + >|b[a][rR])... 8 tail~ EXACT (3) + >b[a][rR])(... 9 brnc + 10 piec + atom + >[a][rR])(f... 12 piec + atom + >a][rR])(fo... clas + >[rR])(foo|... 14 tail~ EXACT (10) + piec + atom + >rR])(foo|b... clas + >)(foo|bar)... 25 tail~ EXACT (12) + tail~ BRANCH (3) + 26 tsdy~ BRANCH (END) (9) + ~ attach to TAIL (25) offset to 16 + tsdy~ EXACT (EXACT) (4) + ~ STAR (END) (6) + ~ attach to TAIL (25) offset to 19 + tsdy~ EXACT (EXACT) (10) + ~ EXACT (EXACT) (12) + ~ ANYOF[Rr] (END) (14) + ~ attach to TAIL (25) offset to 11 + >(foo|bar)$< tail~ EXACT (1) + piec + atom + >foo|bar)$< reg + 28 brnc + piec + atom + >|bar)$< 31 tail~ OPEN1 (26) + >bar)$< brnc + 32 piec + atom + >)$< 34 tail~ BRANCH (28) + 36 tsdy~ BRANCH (END) (31) + ~ attach to CLOSE1 (34) offset to 3 + tsdy~ EXACT (EXACT) (29) + ~ attach to CLOSE1 (34) offset to 5 + tsdy~ EXACT (EXACT) (32) + ~ attach to CLOSE1 (34) offset to 2 + >$< tail~ BRANCH (3) + ~ BRANCH (9) + ~ TAIL (25) + piec + atom + >< 37 tail~ OPEN1 (26) + ~ BRANCH (28) + ~ BRANCH (31) + ~ CLOSE1 (34) + 38 tsdy~ EXACT (EXACT) (1) + ~ BRANCH (END) (3) + ~ BRANCH (END) (9) + ~ TAIL (END) (25) + ~ OPEN1 (END) (26) + ~ BRANCH (END) (28) + ~ BRANCH (END) (31) + ~ CLOSE1 (END) (34) + ~ EOL (END) (36) + ~ attach to END (37) offset to 1
+ +Resulting in the program + + 1: EXACT (3) + 3: BRANCH(9) + 4: EXACT (6) + 6: STAR(26) + 7: EXACT (0) + 9: BRANCH(25) + 10: EXACT (14) + 12: OPTIMIZED (2 nodes) + 14: ANYOF[Rr](26) + 25: TAIL(26) + 26: OPEN1(28) + 28: TRIE-EXACT(34) + [StS:1 Wds:2 Cs:6 Uq:5 #Sts:7 Mn:3 Mx:3 Stcls:bf] + + + 30: OPTIMIZED (4 nodes) + 34: CLOSE1(36) + 36: EOL(37) + 37: END(0) + +Here we can see a much more complex program, with various optimisations in +play. At regnode 10 we can see an example where a char class with only +one character in it was turned into an EXACT node. We can also see where +an entire alternation was turned into a TRIE-EXACT node. As a consequence +some of the regnodes have been marked as optimised away. We can see that +the C<$> symbol has been converted into an EOL regop, a special piece of +code that looks for \n or the end of a string. + +The next pointer for BRANCHes is interesting in that it points at where +execution should go if the branch fails. When executing if the engine +tries to traverse from a branch to a regnext that isnt a branch then +the engine will know the overall series of branches have failed. + +=head3 Peep-hole Optimisation and Analysis + +The regular expression engine can be a weighty tool to wield. On long +strings and complex patterns it can end up having to do a lot of work +to find a match, and even more to decide that no match is possible. +Consider a situation like the following pattern. + + 'ababababababababababab' =~ /(a|b)*z/ + +The C<(a|b)*> part can match at every char in the string, and then fail +every time because there is no C in the string. So obviously we can +not bother to use the regex engine unless there is a 'z' in the string. +Likewise in a pattern like: + + /foo(\w+)bar/ + +In this case we know that the string must contain a C which must be +followed by C. We can use Fast Boyer-More matching as implemented +in fbm_instr() to find the location of these strings. If they dont exist +then we dont need to resort to the much more expensive regex engine. +Even better if they do exist then we can use their positions to +reduce the search space that the regex engine needs to cover to determine +if the entire pattern does match. + +There are various aspects of the pattern that can be used to facilitate +optimisations along these lines: + + * anchored fixed strings + * floating fixed strings + * minimum and maximum length requirements + * start class + * Beginning/End of line positions + +Another form of optimisation that can occur is post-parse "peep-hole" +optimisations, where inefficient constructs are modified so that they +are more efficient. An example of this is TAIL regops which are used +during parsing to mark the end of branches and the end of groups. These +regops are used as place holders during construction and "always match" +so they can be "optimised away" by making the things that point to the +TAIL point to thing the TAIL points to, in essence "skipping" the node. + +Another optimisation that can occur is that of "EXACT merging" which is +where two consecutive EXACT nodes are merged into a single more efficient +to execute regop. An even more agressive form of this is that a branch +sequence of the form EXACT BRANCH ... EXACT can be converted into a TRIE +regop. + +All of this occurs in the routine study_chunk() which uses a special +structure scan_data_t to store the analysis that it has performed, and +as it goes does the "peep-hole" optimisations. + +The code involved in study_chunk() is extremely cryptic. Be careful. :-) + +=head2 Execution + +Execution of a regex generally involves two phases, the first being +finding the start point in the string where we should match from, +and the second being running the regop interpreter. + +If we can tell that there is no valid start point we don't bother running +interpreter at all. Likewise if we know from the analysis phase that we +can not optimise detection of the start position we go straight to the +interpreter. + +The two entry points are re_intuit_start() and pregexec(). These routines +have a somewhat incestuous relationship with overlap between their functions, +and pregexec() may even call re_intuit_start() on its own. Nevertheless +the perl source code may call into either, or both. + +Execution of the interpreter itself used to be recursive. Due to the +efforts of Dave Mitchel in blead perl it no longer is. Instead an +internal stack is maintained on the heap and the routine is fully +iterative. This can make it tricky as the code is quite conservative +about what state it stores which means that two consecutive lines in the +code can actually be running in totally different contexts due to the +simulated recursion. + +=head3 Start position and no-match optimisations + +re_intuit_start() is responsible for handling start point and no match +optimisations as determined by the results of the analysis done by +study_chunk() (and described in L). + +The basic structure of this routine is to try to find the start and/or +end points of where the pattern could match, and to ensure that the string +is long enough to match the pattern. It tries to use more efficent +methods over less efficient methods and may involve considerable cross +checking of constraints to find the place in the string that matches. +For instance it may try to determine that a given fixed string must be +not only present but a certain number of chars before the end of the +string, or whatever. + +It calls out into several other routines, like fbm_instr() which does +"Fast Boyer More" matching and find_byclass() which is responsible for +finding the start using the first mandatory regop in the program. + +When the optimisation criteria have been satisfied reg_try() is called +to perform the match. + +=head3 Program execution + +C is the main entry point for running a regex. It contains +support for initializing the regex interpreters state, running +re_intuit_start() if needed, and running the intepreter on the string +from various start positions as needed. When its necessary to use +the regex interpreter C calls C. + +C is the entry point into the regex interpreter. It expects +as arguments a pointer to a regmatch_info structure and a pointer to +a string. It returns an integer 1 for success and a 0 for failure. +It is basically a setup wrapper around C. + +C is the main "recursive loop" of the interpreter. It is +basically a giant switch statement that executes the regops based on +their type. A few of the regops are implemented as subroutines but +the bulk are inline code. + +=head1 MISCELLANEOUS + +=head2 UNICODE and Localization Support + +No matter how you look at it unicode support is going to be a pain in a +regex engine. Tricks that might be fine when you have 256 possible +characters often won't scale to handle the size of the 'utf8' character +set. Things you can take for granted with ASCII may not be true with +unicode. For instance in ASCII its safe to assume that +C, in utf8 it isn't. Unicode case folding is +vastly more complex than the simple rules of English, and even when not +using unicode but only localized single byte encodings things can get +tricky (technically GERMAN-SHARP-ESS should match 'ss' in localized case +insensitive matching.) + +Making things worse is that C support was a later addition to the +regex engine (as it was to perl) and necessarily this made things a lot +more complicated. Obviously its easier to design a regex engine with +unicode support from the beginning than it is to retrofit one that +wasn't designed with it in mind. + +Pretty well every regop that involves looking at the input string has +two cases, one for 'utf8' and one not. In fact its often more complex +than that, as the pattern may be 'utf8' as well. + +Care must be taken when making changes to make sure that you handle +utf8 properly both at compile time and at execution time, including +when the string and pattern are mismatched. + +The following comment in regcomp.h gives an example of exactly how +tricky this can be: + + Two problematic code points in Unicode casefolding of EXACT nodes: + + U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + + which casefold to + + Unicode UTF-8 + + U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + + This means that in case-insensitive matching (or "loose matching", + as Unicode calls it), an EXACTF of length six (the UTF-8 encoded + byte length of the above casefolded versions) can match a target + string of length two (the byte length of UTF-8 encoded U+0390 or + U+03B0). This would rather mess up the minimum length computation. + + What we'll do is to look for the tail four bytes, and then peek + at the preceding two bytes to see whether we need to decrease + the minimum length by four (six minus two). + + Thanks to the design of UTF-8, there cannot be false matches: + A sequence of valid UTF-8 bytes cannot be a subsequence of + another valid sequence of UTF-8 bytes. + +=head1 AUTHOR + +by Yves Orton, 2006. + +With excerpts from Perl, and contributions and suggestions from +Ronald J. Kimball, Dave Mitchell, Dominic Dunlop, Mark Jason Dominus, +and Stephen McCamant. + +=head1 LICENSE + +Same terms as Perl. + +=head1 REFERENCES + +[1] http://perl.plover.com/Rx/paper/ + +=cut diff --git a/pod/perltoc.pod b/pod/perltoc.pod index 1439bd9..e611429 100644 --- a/pod/perltoc.pod +++ b/pod/perltoc.pod @@ -3774,12 +3774,15 @@ autodetected, C needed to upgrade non-Latin-1 byte strings =item Effects of Character Semantics -=item Scripts +=item Unicode Character Properties -=item Blocks +General Category, Bidirectional Character Types, Scripts, Extended property +classes, Use of "Is" Prefix, Blocks =item User-Defined Character Properties +=item User-Defined Case Mappings + =item Character Encodings for Input and Output =item Unicode Regular Expression Support Level @@ -5289,6 +5292,53 @@ callback =back +=head2 perlreguts - Description of the Perl regular expression engine. + +=over 4 + +=item DESCRIPTION + +=item OVERVIEW + +=over 4 + +=item A quick note on terms + +=item What is a regular expression engine? + +=item Structure of a Regexp Program + +regnode_1, regnode_2, regnode_string, regnode_charclass, +regnode_charclass_class + +=back + +=item PROCESS OVERVIEW + +=over 4 + +=item Compilation + +=item Execution + +=back + +=item MISCELLANEOUS + +=over 4 + +=item UNICODE and Localization Support + +=back + +=item AUTHOR + +=item LICENSE + +=item REFERENCES + +=back + =head2 perlapi - autogenerated documentation for the perl public API =over 4 @@ -11193,6 +11243,8 @@ upgrading for byte-strings =item the 'err' feature +=item the 'dor' feature + =item the 'state' feature =back @@ -21417,6 +21469,10 @@ handling =item DESCRIPTION +=item See Also + +L, L + =item WARNING =back diff --git a/vms/descrip_mms.template b/vms/descrip_mms.template index 0902516..f6e6f48 100644 --- a/vms/descrip_mms.template +++ b/vms/descrip_mms.template @@ -407,13 +407,13 @@ pod17 = [.lib.pods]perlmodinstall.pod [.lib.pods]perlmodlib.pod [.lib.pods]perlm pod18 = [.lib.pods]perlnewmod.pod [.lib.pods]perlnumber.pod [.lib.pods]perlobj.pod [.lib.pods]perlop.pod [.lib.pods]perlopenbsd.pod pod19 = [.lib.pods]perlopentut.pod [.lib.pods]perlos2.pod [.lib.pods]perlos390.pod [.lib.pods]perlos400.pod [.lib.pods]perlothrtut.pod pod20 = [.lib.pods]perlpacktut.pod [.lib.pods]perlplan9.pod [.lib.pods]perlpod.pod [.lib.pods]perlpodspec.pod [.lib.pods]perlport.pod -pod21 = [.lib.pods]perlpragma.pod [.lib.pods]perlqnx.pod [.lib.pods]perlre.pod [.lib.pods]perlref.pod [.lib.pods]perlreftut.pod [.lib.pods]perlrequick.pod -pod22 = [.lib.pods]perlreref.pod [.lib.pods]perlretut.pod [.lib.pods]perlriscos.pod [.lib.pods]perlrun.pod [.lib.pods]perlsec.pod [.lib.pods]perlsolaris.pod -pod23 = [.lib.pods]perlstyle.pod [.lib.pods]perlsub.pod [.lib.pods]perlsymbian.pod [.lib.pods]perlsyn.pod [.lib.pods]perlthrtut.pod [.lib.pods]perltie.pod -pod24 = [.lib.pods]perltoc.pod [.lib.pods]perltodo.pod [.lib.pods]perltooc.pod [.lib.pods]perltoot.pod [.lib.pods]perltrap.pod [.lib.pods]perltru64.pod -pod25 = [.lib.pods]perltw.pod [.lib.pods]perlunicode.pod [.lib.pods]perluniintro.pod [.lib.pods]perlunitut.pod [.lib.pods]perlutil.pod [.lib.pods]perluts.pod -pod26 = [.lib.pods]perlvar.pod [.lib.pods]perlvmesa.pod [.lib.pods]perlvms.pod [.lib.pods]perlvos.pod [.lib.pods]perlwin32.pod [.lib.pods]perlxs.pod -pod27 = [.lib.pods]perlxstut.pod +pod21 = [.lib.pods]perlpragma.pod [.lib.pods]perlqnx.pod [.lib.pods]perlre.pod [.lib.pods]perlref.pod [.lib.pods]perlreftut.pod [.lib.pods]perlreguts.pod +pod22 = [.lib.pods]perlrequick.pod [.lib.pods]perlreref.pod [.lib.pods]perlretut.pod [.lib.pods]perlriscos.pod [.lib.pods]perlrun.pod [.lib.pods]perlsec.pod +pod23 = [.lib.pods]perlsolaris.pod [.lib.pods]perlstyle.pod [.lib.pods]perlsub.pod [.lib.pods]perlsymbian.pod [.lib.pods]perlsyn.pod +pod24 = [.lib.pods]perlthrtut.pod [.lib.pods]perltie.pod [.lib.pods]perltoc.pod [.lib.pods]perltodo.pod [.lib.pods]perltooc.pod [.lib.pods]perltoot.pod +pod25 = [.lib.pods]perltrap.pod [.lib.pods]perltru64.pod [.lib.pods]perltw.pod [.lib.pods]perlunicode.pod [.lib.pods]perluniintro.pod +pod26 = [.lib.pods]perlunitut.pod [.lib.pods]perlutil.pod [.lib.pods]perluts.pod [.lib.pods]perlvar.pod [.lib.pods]perlvmesa.pod [.lib.pods]perlvms.pod +pod27 = [.lib.pods]perlvos.pod [.lib.pods]perlwin32.pod [.lib.pods]perlxs.pod [.lib.pods]perlxstut.pod pod = $(pod0) $(pod1) $(pod2) $(pod3) $(pod4) $(pod5) $(pod6) $(pod7) $(pod8) $(pod9) $(pod10) $(pod11) $(pod12) $(pod13) $(pod14) $(pod15) $(pod16) $(pod17) $(pod18) $(pod19) $(pod20) $(pod21) $(pod22) $(pod23) $(pod24) $(pod25) $(pod26) $(pod27) # Would be useful to automate the generation of this rule from pod/buildtoc @@ -1167,6 +1167,10 @@ preplibrary : $(MINIPERL_EXE) $(LIBPREREQ) @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods] Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods] +[.lib.pods]perlreguts.pod : [.pod]perlreguts.pod + @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods] + Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods] + [.lib.pods]perlrequick.pod : [.pod]perlrequick.pod @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods] Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods] diff --git a/win32/pod.mak b/win32/pod.mak index 5f3bf61..07181de 100644 --- a/win32/pod.mak +++ b/win32/pod.mak @@ -103,6 +103,7 @@ POD = \ perlre.pod \ perlref.pod \ perlreftut.pod \ + perlreguts.pod \ perlrequick.pod \ perlreref.pod \ perlretut.pod \ @@ -215,6 +216,7 @@ MAN = \ perlre.man \ perlref.man \ perlreftut.man \ + perlreguts.man \ perlrequick.man \ perlreref.man \ perlretut.man \ @@ -232,6 +234,7 @@ MAN = \ perltrap.man \ perlunicode.man \ perluniintro.man \ + perlunitut.man \ perlutil.man \ perlvar.man \ perlxs.man \ @@ -326,6 +329,7 @@ HTML = \ perlre.html \ perlref.html \ perlreftut.html \ + perlreguts.html \ perlrequick.html \ perlreref.html \ perlretut.html \ @@ -342,6 +346,7 @@ HTML = \ perltrap.html \ perlunicode.html \ perluniintro.html \ + perlunitut.html \ perlutil.html \ perlvar.html \ perlxs.html \ @@ -437,6 +442,7 @@ TEX = \ perlre.tex \ perlref.tex \ perlreftut.tex \ + perlreguts.tex \ perlrequick.tex \ perlreref.tex \ perlretut.tex \ @@ -454,6 +460,7 @@ TEX = \ perltrap.tex \ perlunicode.tex \ perluniintro.tex \ + perlunitut.tex \ perlutil.tex \ perlvar.tex \ perlxs.tex \