use SelfLoader;
use vars qw { $VERSION @ISA %EXPORT_TAGS };
-$VERSION = '1.86';
+$VERSION = '1.95';
@ISA = qw ( Exporter );
%EXPORT_TAGS = ( ALL => [ qw(
}
else
{
- $rdelspec = eval "qq{$rdel}";
+ $rdelspec = eval "qq{$rdel}" || do {
+ my $del;
+ for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
+ { next if $rdel =~ /\Q$_/; $del = $_; last }
+ unless ($del) {
+ use Carp;
+ croak "Can't interpolate right delimiter $rdel"
+ }
+ eval "qq$del$rdel$del";
+ };
}
while (pos($$textref) < length($$textref))
sub _match_variable($$)
{
+# $#
+# $^
+# $$
my ($textref, $pre) = @_;
my $startpos = pos($$textref) = pos($$textref)||0;
unless ($$textref =~ m/\G($pre)/gc)
return;
}
my $varpos = pos($$textref);
- unless ($$textref =~ m/\G(\$#?|[*\@\%]|\\&)+/gc)
+ unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
{
+ unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
+ {
_failmsg "Did not find leading dereferencer", pos $$textref;
pos $$textref = $startpos;
return;
- }
+ }
+ my $deref = $1;
- unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
- or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0))
- {
+ unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
+ or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
+ or $deref eq '$#' or $deref eq '$$' )
+ {
_failmsg "Bad identifier after dereferencer", pos $$textref;
pos $$textref = $startpos;
return;
+ }
}
while (1)
{
+ next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
next if _match_codeblock($textref,
qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
qr/[({[]/, qr/[)}\]]/,
# NEED TO COVER MANY MORE CASES HERE!!!
- if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=?
+ if ($$textref =~ m#\G\s*(?!$ldel_inner)
+ ( [-+*x/%^&|.]=?
| [!=]~
| =(?!>)
| (\*\*|&&|\|\||<<|>>)=?
| split|grep|map|return
+ | [([]
)#gcx)
{
$patvalid = 1;
);
}
- unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
+ unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
{
_failmsg q{No quotelike operator found after prefix at "} .
substr($$textref, pos($$textref), 20) .
my ($lastpos, $firstpos);
my @fields = ();
- for ($$textref)
+ #for ($$textref)
{
my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
my $igunk = $_[3];
- pos ||= 0;
+ pos $$textref ||= 0;
unless (wantarray)
{
}
}
- FIELD: while (pos() < length())
+ FIELD: while (pos($$textref) < length($$textref))
{
- my $field;
+ my ($field, $rem);
+ my @bits;
foreach my $i ( 0..$#func )
{
+ my $pref;
$func = $func[$i];
$class = $class[$i];
- $lastpos = pos;
+ $lastpos = pos $$textref;
if (ref($func) eq 'CODE')
- { ($field) = $func->($_) }
+ { ($field,$rem,$pref) = @bits = $func->($$textref);
+ # print "[$field|$rem]" if $field;
+ }
elsif (ref($func) eq 'Text::Balanced::Extractor')
- { $field = $func->extract($_) }
- elsif( m/\G$func/gc )
- { $field = defined($1) ? $1 : $& }
-
+ { @bits = $field = $func->extract($$textref) }
+ elsif( $$textref =~ m/\G$func/gc )
+ { @bits = $field = defined($1) ? $1 : $& }
+ $pref ||= "";
if (defined($field) && length($field))
{
- if (defined($unkpos) && !$igunk)
- {
- push @fields, substr($_, $unkpos, $lastpos-$unkpos);
- $firstpos = $unkpos unless defined $firstpos;
- undef $unkpos;
- last FIELD if @fields == $max;
+ if (!$igunk) {
+ $unkpos = pos $$textref
+ if length($pref) && !defined($unkpos);
+ if (defined $unkpos)
+ {
+ push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
+ $firstpos = $unkpos unless defined $firstpos;
+ undef $unkpos;
+ last FIELD if @fields == $max;
+ }
}
- push @fields, $class
- ? bless(\$field, $class)
+ push @fields, $class
+ ? bless (\$field, $class)
: $field;
$firstpos = $lastpos unless defined $firstpos;
- $lastpos = pos;
+ $lastpos = pos $$textref;
last FIELD if @fields == $max;
next FIELD;
}
}
- if (/\G(.)/gcs)
+ if ($$textref =~ /\G(.)/gcs)
{
- $unkpos = pos()-1
+ $unkpos = pos($$textref)-1
unless $igunk || defined $unkpos;
}
}
if (defined $unkpos)
{
- push @fields, substr($_, $unkpos);
+ push @fields, substr($$textref, $unkpos);
$firstpos = $unkpos unless defined $firstpos;
- $lastpos = length;
+ $lastpos = length $$textref;
}
last;
}
# Extract the initial substring of $text that is bounded by
- # an HTML/XML tag.
+ # an XML tag.
($extracted, $remainder) = extract_tagged($text);
=head1 DESCRIPTION
-The various C<extract_...> subroutines may be used to extract a
-delimited string (possibly after skipping a specified prefix string).
-The search for the string always begins at the current C<pos>
-location of the string's variable (or at index zero, if no C<pos>
-position is defined).
+The various C<extract_...> subroutines may be used to
+extract a delimited substring, possibly after skipping a
+specified prefix string. By default, that prefix is
+optional whitespace (C</\s*/>), but you can change it to whatever
+you wish (see below).
+
+The substring to be extracted must appear at the
+current C<pos> location of the string's variable
+(or at index zero, if no C<pos> position is defined).
+In other words, the C<extract_...> subroutines I<don't>
+extract the first occurance of a substring anywhere
+in a string (like an unanchored regex would). Rather,
+they extract an occurance of the substring appearing
+immediately at the current matching position in the
+string (like a C<\G>-anchored regex would).
+
+
=head2 General behaviour in list contexts
See also: C<"extract_quotelike"> and C<"extract_codeblock">.
+=head2 C<extract_variable>
+
+C<extract_variable> extracts any valid Perl variable or
+variable-involved expression, including scalars, arrays, hashes, array
+accesses, hash look-ups, method calls through objects, subroutine calles
+through subroutine references, etc.
+
+The subroutine takes up to two optional arguments:
+
+=over 4
+
+=item 1.
+
+A string to be processed (C<$_> if the string is omitted or C<undef>)
+
+=item 2.
+
+A string specifying a pattern to be matched as a prefix (which is to be
+skipped). If omitted, optional whitespace is skipped.
+
+=back
+
+On success in a list context, an array of 3 elements is returned. The
+elements are:
+
+=over 4
+
+=item [0]
+
+the extracted variable, or variablish expression
+
+=item [1]
+
+the remainder of the input text,
+
+=item [2]
+
+the prefix substring (if any),
+
+=back
+
+On failure, all of these values (except the remaining text) are C<undef>.
+
+In a scalar context, C<extract_variable> returns just the complete
+substring that matched a variablish expression. C<undef> is returned on
+failure. In addition, the original input text has the returned substring
+(and any prefix) removed from it.
+
+In a void context, the input text just has the matched substring (and
+any specified prefix) removed.
+
+
=head2 C<extract_tagged>
C<extract_tagged> extracts and segments text between (balanced)
A string specifying a pattern to be matched as the opening tag.
If the pattern string is omitted (or C<undef>) then a pattern
-that matches any standard HTML/XML tag is used.
+that matches any standard XML tag is used.
=item 3.
For example, to extract
an HTML link (which should not contain nested links) use:
- extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
+ extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
=item C<ignore =E<gt> $listref>
For example, to extract an arbitrary XML tag, but ignore "empty" elements:
- extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
+ extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
(also see L<"gen_delimited_pat"> below).
continues until the next "/endpara" tag or until another "/para" tag is
encountered:
- $text = "/para line 1\n\nline 3\n/para line 4";
+ $text = "/para line 1\n\nline 3\n/para line 4";
- extract_tagged($text, '/para', '/endpara', undef,
- {reject => '/para', fail => MAX );
+ extract_tagged($text, '/para', '/endpara', undef,
+ {reject => '/para', fail => MAX );
- # EXTRACTED: "/para line 1\n\nline 3\n"
+ # EXTRACTED: "/para line 1\n\nline 3\n"
Suppose instead, that if no matching "/endpara" tag is found, the "/para"
tag refers only to the immediately following paragraph:
- $text = "/para line 1\n\nline 3\n/para line 4";
+ $text = "/para line 1\n\nline 3\n/para line 4";
- extract_tagged($text, '/para', '/endpara', undef,
- {reject => '/para', fail => MAX );
+ extract_tagged($text, '/para', '/endpara', undef,
+ {reject => '/para', fail => MAX );
- # EXTRACTED: "/para line 1\n"
+ # EXTRACTED: "/para line 1\n"
Note that the specified C<fail> behaviour applies to nested tags as well.
In other words, the implementation of C<extract_tagged> is exactly
equivalent to:
- sub extract_tagged
- {
- my $text = shift;
- $extractor = gen_extract_tagged(@_);
- return $extractor->($text);
- }
+ sub extract_tagged
+ {
+ my $text = shift;
+ $extractor = gen_extract_tagged(@_);
+ return $extractor->($text);
+ }
(although C<extract_tagged> is not currently implemented that way, in order
to preserve pre-5.005 compatibility).
delimiters (for the quotelike operators), and trailing modifiers are
all caught. For example, in:
- extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
-
- extract_quotelike ' "You said, \"Use sed\"." '
+ extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
+
+ extract_quotelike ' "You said, \"Use sed\"." '
- extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
+ extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
- extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
+ extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
the full Perl quotelike operations are all extracted correctly.
containing the current pattern delimiter will cause the regex to be
immediately terminated. In other words:
- 'm /
- (?i) # CASE INSENSITIVE
- [a-z_] # LEADING ALPHABETIC/UNDERSCORE
- [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
- /x'
+ 'm /
+ (?i) # CASE INSENSITIVE
+ [a-z_] # LEADING ALPHABETIC/UNDERSCORE
+ [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
+ /x'
will be extracted as if it were:
- 'm /
- (?i) # CASE INSENSITIVE
- [a-z_] # LEADING ALPHABETIC/'
+ 'm /
+ (?i) # CASE INSENSITIVE
+ [a-z_] # LEADING ALPHABETIC/'
This behaviour is identical to that of the actual compiler.
Examples:
- # Remove the first quotelike literal that appears in text
+ # Remove the first quotelike literal that appears in text
- $quotelike = extract_quotelike($text,'.*?');
+ $quotelike = extract_quotelike($text,'.*?');
- # Replace one or more leading whitespace-separated quotelike
- # literals in $_ with "<QLL>"
+ # Replace one or more leading whitespace-separated quotelike
+ # literals in $_ with "<QLL>"
- do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
+ do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
- # Isolate the search pattern in a quotelike operation from $text
+ # Isolate the search pattern in a quotelike operation from $text
- ($op,$pat) = (extract_quotelike $text)[3,5];
- if ($op =~ /[ms]/)
- {
- print "search pattern: $pat\n";
- }
- else
- {
- print "$op is not a pattern matching operation\n";
- }
+ ($op,$pat) = (extract_quotelike $text)[3,5];
+ if ($op =~ /[ms]/)
+ {
+ print "search pattern: $pat\n";
+ }
+ else
+ {
+ print "$op is not a pattern matching operation\n";
+ }
=head2 C<extract_quotelike> and "here documents"
<<'EOMSG' || die;
This is the message.
EOMSG
- exit;
+ exit;
Given this as an input string in a scalar context, C<extract_quotelike>
would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
This is the message.
EOMSG
|| die;
- exit;
+ exit;
in which the here document I<is> contiguous. It still leaves the
matching position after the here document, but now the rest of the line
=back
The extraction process works by applying each extractor in
-sequence to the text string. If the extractor is a subroutine it
-is called in a list
-context and is expected to return a list of a single element, namely
-the extracted text.
-Note that the value returned by an extractor subroutine need not bear any
-relationship to the corresponding substring of the original text (see
-examples below).
+sequence to the text string.
+
+If the extractor is a subroutine it is called in a list context and is
+expected to return a list of a single element, namely the extracted
+text. It may optionally also return two further arguments: a string
+representing the text left after extraction (like $' for a pattern
+match), and a string representing any prefix skipped before the
+extraction (like $` in a pattern match). Note that this is designed
+to facilitate the use of other Text::Balanced subroutines with
+C<extract_multiple>. Note too that the value returned by an extractor
+subroutine need not bear any relationship to the corresponding substring
+of the original text (see examples below).
If the extractor is a precompiled regular expression or a string,
it is matched against the text in a scalar context with a leading