Integrate from perlio:

[p5sagit/p5-mst-13.2.git] / lib / Text / Balanced.pm
diff --git a/lib/Text/Balanced.pm b/lib/Text/Balanced.pm

index f50e2f5..ee83e54 100644 (file)
--- a/lib/Text/Balanced.pm
+++ b/lib/Text/Balanced.pm
@@ -10,7 +10,7 @@ use Exporter;
 use SelfLoader;
 use vars qw { $VERSION @ISA %EXPORT_TAGS };
 
-$VERSION = '1.86';
+$VERSION = '1.89';
 @ISA           = qw ( Exporter );
                     
 %EXPORT_TAGS   = ( ALL => [ qw(
@@ -30,6 +30,15 @@ $VERSION = '1.86';
 
 Exporter::export_ok_tags('ALL');
 
+##
+## These shenanagins are to avoid using $& in perl5.6+
+##
+my $GetMatchedText = ($] < 5.006) ? eval 'sub { $& } '
+                                  : eval 'sub { 
+                                           substr($_[0], $-[0], $+[0] - $-[0])
+                                          }';
+
+
 # PROTOTYPES
 
 sub _match_bracketed($$$$$$);
@@ -328,7 +337,8 @@ sub _match_tagged   # ($$$$$$$)
 
        if (!defined $rdel)
        {
-               $rdelspec = $&;
+               $rdelspec = &$GetMatchedText($$textref);
+
                unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
                {
                        _failmsg "Unable to construct closing tag to match: $rdel",
@@ -429,6 +439,9 @@ sub extract_variable (;$$)
 
 sub _match_variable($$)
 {
+#  $#
+#  $^
+#  $$
        my ($textref, $pre) = @_;
        my $startpos = pos($$textref) = pos($$textref)||0;
        unless ($$textref =~ m/\G($pre)/gc)
@@ -437,19 +450,24 @@ sub _match_variable($$)
                return;
        }
        my $varpos = pos($$textref);
-       unless ($$textref =~ m/\G(\$#?|[*\@\%]|\\&)+/gc)
+        unless ($$textref =~ m{\G\$\s*(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
        {
+           unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
+           {
                _failmsg "Did not find leading dereferencer", pos $$textref;
                pos $$textref = $startpos;
                return;
-       }
+           }
+           my $deref = $1;
 
-       unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
-               or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0))
-       {
+           unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
+               or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
+               or $deref eq '$#' or $deref eq '$$' )
+           {
                _failmsg "Bad identifier after dereferencer", pos $$textref;
                pos $$textref = $startpos;
                return;
+           }
        }
 
        while (1)
@@ -854,13 +872,13 @@ sub extract_multiple (;$$$$)      # ($text, $functions_ref, $max_fields, $ignoreunkno
        my ($lastpos, $firstpos);
        my @fields = ();
 
-       for ($$textref)
+       #for ($$textref)
        {
                my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
                my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
                my $igunk = $_[3];
 
-               pos ||= 0;
+               pos $$textref ||= 0;
 
                unless (wantarray)
                {
@@ -888,51 +906,58 @@ sub extract_multiple (;$$$$)      # ($text, $functions_ref, $max_fields, $ignoreunkno
                        }
                }
 
-               FIELD: while (pos() < length())
+               FIELD: while (pos($$textref) < length($$textref))
                {
                        my $field;
+                       my @bits;
                        foreach my $i ( 0..$#func )
                        {
+                               my $pref;
                                $func = $func[$i];
                                $class = $class[$i];
-                               $lastpos = pos;
+                               $lastpos = pos $$textref;
                                if (ref($func) eq 'CODE')
-                                       { ($field) = $func->($_) }
+                                       { ($field,undef,$pref) = @bits = $func->($$textref) }
                                elsif (ref($func) eq 'Text::Balanced::Extractor')
-                                       { $field = $func->extract($_) }
-                               elsif( m/\G$func/gc )
-                                       { $field = defined($1) ? $1 : $& }
-
+                                       { @bits = $field = $func->extract($$textref) }
+                               elsif( $$textref =~ m/\G$func/gc )
+                                       { @bits = $field = defined($1) ? $1 : &$GetMatchedText($$textref) }
+                                       # substr() on previous line is "$&", without the pain
+                               $pref ||= "";
                                if (defined($field) && length($field))
                                {
-                                       if (defined($unkpos) && !$igunk)
-                                       {
-                                               push @fields, substr($_, $unkpos, $lastpos-$unkpos);
-                                               $firstpos = $unkpos unless defined $firstpos;
-                                               undef $unkpos;
-                                               last FIELD if @fields == $max;
+                                       if (!$igunk) {
+                                               $unkpos = pos $$textref
+                                                       if length($pref) && !defined($unkpos);
+                                               if (defined $unkpos)
+                                               {
+                                                       push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
+                                                       $firstpos = $unkpos unless defined $firstpos;
+                                                       undef $unkpos;
+                                                       last FIELD if @fields == $max;
+                                               }
                                        }
-                                       push @fields, $class 
-                                               ? bless(\$field, $class)
+                                       push @fields, $class
+                                               ? bless (\$field, $class)
                                                : $field;
                                        $firstpos = $lastpos unless defined $firstpos;
-                                       $lastpos = pos;
+                                       $lastpos = pos $$textref;
                                        last FIELD if @fields == $max;
                                        next FIELD;
                                }
                        }
-                       if (/\G(.)/gcs)
+                       if ($$textref =~ /\G(.)/gcs)
                        {
-                               $unkpos = pos()-1
+                               $unkpos = pos($$textref)-1
                                        unless $igunk || defined $unkpos;
                        }
                }
                
                if (defined $unkpos)
                {
-                       push @fields, substr($_, $unkpos);
+                       push @fields, substr($$textref, $unkpos);
                        $firstpos = $unkpos unless defined $firstpos;
-                       $lastpos = length;
+                       $lastpos = length $$textref;
                }
                last;
        }
@@ -1194,7 +1219,7 @@ pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
 is also not specified, the set C</["'`]/> is used. If the text to be processed
 is not specified either, C<$_> is used.
 
-In list context, C<extract_delimited> returns a array of three
+In list context, C<extract_delimited> returns an array of three
 elements, the extracted substring (I<including the surrounding
 delimiters>), the remainder of the text, and the skipped prefix (if
 any). If a suitable delimited substring is not found, the first
@@ -1429,7 +1454,7 @@ C<extract_tagged> returns the complete text up to the point of failure.
 If the string is "PARA", C<extract_tagged> returns only the first paragraph
 after the tag (up to the first line that is either empty or contains
 only whitespace characters).
-If the string is "", the the default behaviour (i.e. failure) is reinstated.
+If the string is "", the default behaviour (i.e. failure) is reinstated.
 
 For example, suppose the start tag "/para" introduces a paragraph, which then
 continues until the next "/endpara" tag or until another "/para" tag is
@@ -1628,7 +1653,7 @@ the right delimiter of the first block of the operation,
 =item [7]
 
 the left delimiter of the second block of the operation
-(that is, if it is a C<s>, C<tr>, or C<y>),
+(that is, if it is an C<s>, C<tr>, or C<y>),
 
 =item [8]
 
@@ -1746,7 +1771,7 @@ However, the matching position of the input variable would be set to
 which would cause the earlier " || die;\nexit;" to be skipped in any
 sequence of code fragment extractions.
 
-To avoid this problem, when it encounters a here document whilst
+To avoid this problem, when it encounters a here document while
 extracting from a modifiable string, C<extract_quotelike> silently
 rearranges the string to an equivalent piece of Perl:
 
@@ -1786,7 +1811,7 @@ Omitting the third argument (prefix argument) implies optional whitespace at the
 Omitting the fourth argument (outermost delimiter brackets) indicates that the
 value of the second argument is to be used for the outermost delimiters.
 
-Once the prefix an dthe outermost opening delimiter bracket have been
+Once the prefix an the outermost opening delimiter bracket have been
 recognized, code blocks are extracted by stepping through the input text and
 trying the following alternatives in sequence:
 
@@ -1873,7 +1898,7 @@ extracted substring removed from it. In all contexts
 C<extract_multiple> starts at the current C<pos> of the string, and
 sets that C<pos> appropriately after it matches.
 
-Hence, the aim of of a call to C<extract_multiple> in a list context
+Hence, the aim of a call to C<extract_multiple> in a list context
 is to split the processed string into as many non-overlapping fields as
 possible, by repeatedly applying each of the specified extractors
 to the remainder of the string. Thus C<extract_multiple> is
@@ -1905,7 +1930,7 @@ is used.
 
 =item 3.
 
-An number specifying the maximum number of fields to return. If this
+A number specifying the maximum number of fields to return. If this
 argument is omitted (or C<undef>), split continues as long as possible.
 
 If the third argument is I<N>, then extraction continues until I<N> fields
@@ -1925,13 +1950,18 @@ such substrings are skipped. Otherwise, they are returned.
 =back
 
 The extraction process works by applying each extractor in
-sequence to the text string. If the extractor is a subroutine it
-is called in a list
-context and is expected to return a list of a single element, namely
-the extracted text.
-Note that the value returned by an extractor subroutine need not bear any
-relationship to the corresponding substring of the original text (see
-examples below).
+sequence to the text string.
+
+If the extractor is a subroutine it is called in a list context and is
+expected to return a list of a single element, namely the extracted
+text. It may optionally also return two further arguments: a string
+representing the text left after extraction (like $' for a pattern
+match), and a string representing any prefix skipped before the
+extraction (like $` in a pattern match). Note that this is designed
+to facilitate the use of other Text::Balanced subroutines with
+C<extract_multiple>. Note too that the value returned by an extractor
+subroutine need not bear any relationship to the corresponding substring
+of the original text (see examples below).
 
 If the extractor is a precompiled regular expression or a string,
 it is matched against the text in a scalar context with a leading
@@ -1956,7 +1986,7 @@ If none of the extractor subroutines succeeds, then one
 character is extracted from the start of the text and the extraction
 subroutines reapplied. Characters which are thus removed are accumulated and
 eventually become the next field (unless the fourth argument is true, in which
-case they are disgarded).
+case they are discarded).
 
 For example, the following extracts substrings that are valid Perl variables: