[ANNOUNCE] Math::BigInt v1.69

[p5sagit/p5-mst-13.2.git] / lib / Text / Balanced.pm
diff --git a/lib/Text/Balanced.pm b/lib/Text/Balanced.pm

index f50e2f5..820ae25 100644 (file)
--- a/lib/Text/Balanced.pm
+++ b/lib/Text/Balanced.pm
@@ -10,7 +10,7 @@ use Exporter;
 use SelfLoader;
 use vars qw { $VERSION @ISA %EXPORT_TAGS };
 
-$VERSION = '1.86';
+$VERSION = '1.95';
 @ISA           = qw ( Exporter );
                     
 %EXPORT_TAGS   = ( ALL => [ qw(
@@ -338,7 +338,16 @@ sub _match_tagged  # ($$$$$$$)
        }
        else
        {
-               $rdelspec = eval "qq{$rdel}";
+               $rdelspec = eval "qq{$rdel}" || do {
+                       my $del;
+                       for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
+                               { next if $rdel =~ /\Q$_/; $del = $_; last }
+                       unless ($del) {
+                               use Carp;
+                               croak "Can't interpolate right delimiter $rdel"
+                       }
+                       eval "qq$del$rdel$del";
+               };
        }
 
        while (pos($$textref) < length($$textref))
@@ -429,6 +438,9 @@ sub extract_variable (;$$)
 
 sub _match_variable($$)
 {
+#  $#
+#  $^
+#  $$
        my ($textref, $pre) = @_;
        my $startpos = pos($$textref) = pos($$textref)||0;
        unless ($$textref =~ m/\G($pre)/gc)
@@ -437,23 +449,29 @@ sub _match_variable($$)
                return;
        }
        my $varpos = pos($$textref);
-       unless ($$textref =~ m/\G(\$#?|[*\@\%]|\\&)+/gc)
+        unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
        {
+           unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
+           {
                _failmsg "Did not find leading dereferencer", pos $$textref;
                pos $$textref = $startpos;
                return;
-       }
+           }
+           my $deref = $1;
 
-       unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
-               or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0))
-       {
+           unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
+               or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
+               or $deref eq '$#' or $deref eq '$$' )
+           {
                _failmsg "Bad identifier after dereferencer", pos $$textref;
                pos $$textref = $startpos;
                return;
+           }
        }
 
        while (1)
        {
+               next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
                next if _match_codeblock($textref,
                                         qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
                                         qr/[({[]/, qr/[)}\]]/,
@@ -565,11 +583,13 @@ sub _match_codeblock($$$$$$$)
 
 
                # NEED TO COVER MANY MORE CASES HERE!!!
-               if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=?
+               if ($$textref =~ m#\G\s*(?!$ldel_inner)
+                                       ( [-+*x/%^&|.]=?
                                        | [!=]~
                                        | =(?!>)
                                        | (\*\*|&&|\|\||<<|>>)=?
                                        | split|grep|map|return
+                                       | [([]
                                        )#gcx)
                {
                        $patvalid = 1;
@@ -699,7 +719,7 @@ sub _match_quotelike($$$$)  # ($textref, $prepat, $allow_raw_match)
                       );
        }
 
-       unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
+       unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
        {
                _failmsg q{No quotelike operator found after prefix at "} .
                             substr($$textref, pos($$textref), 20) .
@@ -854,13 +874,13 @@ sub extract_multiple (;$$$$)      # ($text, $functions_ref, $max_fields, $ignoreunkno
        my ($lastpos, $firstpos);
        my @fields = ();
 
-       for ($$textref)
+       #for ($$textref)
        {
                my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
                my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
                my $igunk = $_[3];
 
-               pos ||= 0;
+               pos $$textref ||= 0;
 
                unless (wantarray)
                {
@@ -888,51 +908,59 @@ sub extract_multiple (;$$$$)      # ($text, $functions_ref, $max_fields, $ignoreunkno
                        }
                }
 
-               FIELD: while (pos() < length())
+               FIELD: while (pos($$textref) < length($$textref))
                {
-                       my $field;
+                       my ($field, $rem);
+                       my @bits;
                        foreach my $i ( 0..$#func )
                        {
+                               my $pref;
                                $func = $func[$i];
                                $class = $class[$i];
-                               $lastpos = pos;
+                               $lastpos = pos $$textref;
                                if (ref($func) eq 'CODE')
-                                       { ($field) = $func->($_) }
+                                       { ($field,$rem,$pref) = @bits = $func->($$textref);
+                                       # print "[$field|$rem]" if $field;
+                                       }
                                elsif (ref($func) eq 'Text::Balanced::Extractor')
-                                       { $field = $func->extract($_) }
-                               elsif( m/\G$func/gc )
-                                       { $field = defined($1) ? $1 : $& }
-
+                                       { @bits = $field = $func->extract($$textref) }
+                               elsif( $$textref =~ m/\G$func/gc )
+                                       { @bits = $field = defined($1) ? $1 : $& }
+                               $pref ||= "";
                                if (defined($field) && length($field))
                                {
-                                       if (defined($unkpos) && !$igunk)
-                                       {
-                                               push @fields, substr($_, $unkpos, $lastpos-$unkpos);
-                                               $firstpos = $unkpos unless defined $firstpos;
-                                               undef $unkpos;
-                                               last FIELD if @fields == $max;
+                                       if (!$igunk) {
+                                               $unkpos = pos $$textref
+                                                       if length($pref) && !defined($unkpos);
+                                               if (defined $unkpos)
+                                               {
+                                                       push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
+                                                       $firstpos = $unkpos unless defined $firstpos;
+                                                       undef $unkpos;
+                                                       last FIELD if @fields == $max;
+                                               }
                                        }
-                                       push @fields, $class 
-                                               ? bless(\$field, $class)
+                                       push @fields, $class
+                                               ? bless (\$field, $class)
                                                : $field;
                                        $firstpos = $lastpos unless defined $firstpos;
-                                       $lastpos = pos;
+                                       $lastpos = pos $$textref;
                                        last FIELD if @fields == $max;
                                        next FIELD;
                                }
                        }
-                       if (/\G(.)/gcs)
+                       if ($$textref =~ /\G(.)/gcs)
                        {
-                               $unkpos = pos()-1
+                               $unkpos = pos($$textref)-1
                                        unless $igunk || defined $unkpos;
                        }
                }
                
                if (defined $unkpos)
                {
-                       push @fields, substr($_, $unkpos);
+                       push @fields, substr($$textref, $unkpos);
                        $firstpos = $unkpos unless defined $firstpos;
-                       $lastpos = length;
+                       $lastpos = length $$textref;
                }
                last;
        }
@@ -1032,7 +1060,7 @@ Text::Balanced - Extract delimited text sequences from strings.
 
 
  # Extract the initial substring of $text that is bounded by
- # an HTML/XML tag.
+ # an XML tag.
 
        ($extracted, $remainder) = extract_tagged($text);
 
@@ -1088,11 +1116,23 @@ Text::Balanced - Extract delimited text sequences from strings.
 
 =head1 DESCRIPTION
 
-The various C<extract_...> subroutines may be used to extract a 
-delimited string (possibly after skipping a specified prefix string).
-The search for the string always begins at the current C<pos>
-location of the string's variable (or at index zero, if no C<pos>
-position is defined).
+The various C<extract_...> subroutines may be used to
+extract a delimited substring, possibly after skipping a
+specified prefix string. By default, that prefix is
+optional whitespace (C</\s*/>), but you can change it to whatever
+you wish (see below).
+
+The substring to be extracted must appear at the
+current C<pos> location of the string's variable
+(or at index zero, if no C<pos> position is defined).
+In other words, the C<extract_...> subroutines I<don't>
+extract the first occurance of a substring anywhere
+in a string (like an unanchored regex would). Rather,
+they extract an occurance of the substring appearing
+immediately at the current matching position in the
+string (like a C<\G>-anchored regex would).
+
+
 
 =head2 General behaviour in list contexts
 
@@ -1350,6 +1390,58 @@ would correctly match something like this:
 See also: C<"extract_quotelike"> and C<"extract_codeblock">.
 
 
+=head2 C<extract_variable>
+
+C<extract_variable> extracts any valid Perl variable or
+variable-involved expression, including scalars, arrays, hashes, array
+accesses, hash look-ups, method calls through objects, subroutine calles
+through subroutine references, etc.
+
+The subroutine takes up to two optional arguments:
+
+=over 4
+
+=item 1.
+
+A string to be processed (C<$_> if the string is omitted or C<undef>)
+
+=item 2.
+
+A string specifying a pattern to be matched as a prefix (which is to be
+skipped). If omitted, optional whitespace is skipped.
+
+=back
+
+On success in a list context, an array of 3 elements is returned. The
+elements are:
+
+=over 4
+
+=item [0]
+
+the extracted variable, or variablish expression
+
+=item [1]
+
+the remainder of the input text,
+
+=item [2]
+
+the prefix substring (if any),
+
+=back
+
+On failure, all of these values (except the remaining text) are C<undef>.
+
+In a scalar context, C<extract_variable> returns just the complete
+substring that matched a variablish expression. C<undef> is returned on
+failure. In addition, the original input text has the returned substring
+(and any prefix) removed from it.
+
+In a void context, the input text just has the matched substring (and
+any specified prefix) removed.
+
+
 =head2 C<extract_tagged>
 
 C<extract_tagged> extracts and segments text between (balanced)
@@ -1367,7 +1459,7 @@ A string to be processed (C<$_> if the string is omitted or C<undef>)
 
 A string specifying a pattern to be matched as the opening tag.
 If the pattern string is omitted (or C<undef>) then a pattern
-that matches any standard HTML/XML tag is used.
+that matches any standard XML tag is used.
 
 =item 3.
 
@@ -1402,7 +1494,7 @@ that must I<not> appear within the tagged text.
 For example, to extract
 an HTML link (which should not contain nested links) use:
 
-       extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
+        extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
 
 =item C<ignore =E<gt> $listref>
 
@@ -1412,7 +1504,7 @@ that are I<not> be be treated as nested tags within the tagged text
 
 For example, to extract an arbitrary XML tag, but ignore "empty" elements:
 
-       extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
+        extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
 
 (also see L<"gen_delimited_pat"> below).
 
@@ -1435,22 +1527,22 @@ For example, suppose the start tag "/para" introduces a paragraph, which then
 continues until the next "/endpara" tag or until another "/para" tag is
 encountered:
 
-       $text = "/para line 1\n\nline 3\n/para line 4";
+        $text = "/para line 1\n\nline 3\n/para line 4";
 
-       extract_tagged($text, '/para', '/endpara', undef,
-                               {reject => '/para', fail => MAX );
+        extract_tagged($text, '/para', '/endpara', undef,
+                                {reject => '/para', fail => MAX );
 
-       # EXTRACTED: "/para line 1\n\nline 3\n"
+        # EXTRACTED: "/para line 1\n\nline 3\n"
 
 Suppose instead, that if no matching "/endpara" tag is found, the "/para"
 tag refers only to the immediately following paragraph:
 
-       $text = "/para line 1\n\nline 3\n/para line 4";
+        $text = "/para line 1\n\nline 3\n/para line 4";
 
-       extract_tagged($text, '/para', '/endpara', undef,
-                       {reject => '/para', fail => MAX );
+        extract_tagged($text, '/para', '/endpara', undef,
+                        {reject => '/para', fail => MAX );
 
-       # EXTRACTED: "/para line 1\n"
+        # EXTRACTED: "/para line 1\n"
 
 Note that the specified C<fail> behaviour applies to nested tags as well.
 
@@ -1533,12 +1625,12 @@ be extracted from).
 In other words, the implementation of C<extract_tagged> is exactly
 equivalent to:
 
-       sub extract_tagged
-       {
-               my $text = shift;
-               $extractor = gen_extract_tagged(@_);
-               return $extractor->($text);
-       }
+        sub extract_tagged
+        {
+                my $text = shift;
+                $extractor = gen_extract_tagged(@_);
+                return $extractor->($text);
+        }
 
 (although C<extract_tagged> is not currently implemented that way, in order
 to preserve pre-5.005 compatibility).
@@ -1557,13 +1649,13 @@ L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
 delimiters (for the quotelike operators), and trailing modifiers are
 all caught. For example, in:
 
-       extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
-       
-       extract_quotelike '  "You said, \"Use sed\"."  '
+        extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
+        
+        extract_quotelike '  "You said, \"Use sed\"."  '
 
-       extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
+        extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
 
-       extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
+        extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
 
 the full Perl quotelike operations are all extracted correctly.
 
@@ -1571,17 +1663,17 @@ Note too that, when using the /x modifier on a regex, any comment
 containing the current pattern delimiter will cause the regex to be
 immediately terminated. In other words:
 
-       'm /
-               (?i)            # CASE INSENSITIVE
-               [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
-               [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
-          /x'
+        'm /
+                (?i)            # CASE INSENSITIVE
+                [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
+                [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
+           /x'
 
 will be extracted as if it were:
 
-       'm /
-               (?i)            # CASE INSENSITIVE
-               [a-z_]          # LEADING ALPHABETIC/'
+        'm /
+                (?i)            # CASE INSENSITIVE
+                [a-z_]          # LEADING ALPHABETIC/'
 
 This behaviour is identical to that of the actual compiler.
 
@@ -1658,27 +1750,27 @@ prefix) removed.
 
 Examples:
 
-       # Remove the first quotelike literal that appears in text
+        # Remove the first quotelike literal that appears in text
 
-               $quotelike = extract_quotelike($text,'.*?');
+                $quotelike = extract_quotelike($text,'.*?');
 
-       # Replace one or more leading whitespace-separated quotelike
-       # literals in $_ with "<QLL>"
+        # Replace one or more leading whitespace-separated quotelike
+        # literals in $_ with "<QLL>"
 
-               do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
+                do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
 
 
-       # Isolate the search pattern in a quotelike operation from $text
+        # Isolate the search pattern in a quotelike operation from $text
 
-               ($op,$pat) = (extract_quotelike $text)[3,5];
-               if ($op =~ /[ms]/)
-               {
-                       print "search pattern: $pat\n";
-               }
-               else
-               {
-                       print "$op is not a pattern matching operation\n";
-               }
+                ($op,$pat) = (extract_quotelike $text)[3,5];
+                if ($op =~ /[ms]/)
+                {
+                        print "search pattern: $pat\n";
+                }
+                else
+                {
+                        print "$op is not a pattern matching operation\n";
+                }
 
 
 =head2 C<extract_quotelike> and "here documents"
@@ -1693,7 +1785,7 @@ here document might look like this:
         <<'EOMSG' || die;
         This is the message.
         EOMSG
-       exit;
+        exit;
 
 Given this as an input string in a scalar context, C<extract_quotelike>
 would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
@@ -1754,7 +1846,7 @@ rearranges the string to an equivalent piece of Perl:
         This is the message.
         EOMSG
         || die;
-       exit;
+        exit;
 
 in which the here document I<is> contiguous. It still leaves the
 matching position after the here document, but now the rest of the line
@@ -1925,13 +2017,18 @@ such substrings are skipped. Otherwise, they are returned.
 =back
 
 The extraction process works by applying each extractor in
-sequence to the text string. If the extractor is a subroutine it
-is called in a list
-context and is expected to return a list of a single element, namely
-the extracted text.
-Note that the value returned by an extractor subroutine need not bear any
-relationship to the corresponding substring of the original text (see
-examples below).
+sequence to the text string.
+
+If the extractor is a subroutine it is called in a list context and is
+expected to return a list of a single element, namely the extracted
+text. It may optionally also return two further arguments: a string
+representing the text left after extraction (like $' for a pattern
+match), and a string representing any prefix skipped before the
+extraction (like $` in a pattern match). Note that this is designed
+to facilitate the use of other Text::Balanced subroutines with
+C<extract_multiple>. Note too that the value returned by an extractor
+subroutine need not bear any relationship to the corresponding substring
+of the original text (see examples below).
 
 If the extractor is a precompiled regular expression or a string,
 it is matched against the text in a scalar context with a leading