From: Jarkko Hietaniemi Date: Fri, 4 Jul 2003 13:20:51 +0000 (+0000) Subject: Upgrade to Text::Balanced 1.94. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=48f821bf9e0932f4bcb9a9fde6c937aaf95d95db;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Text::Balanced 1.94. p4raw-id: //depot/perl@19989 --- diff --git a/lib/Text/Balanced.pm b/lib/Text/Balanced.pm index ee83e54..820ae25 100644 --- a/lib/Text/Balanced.pm +++ b/lib/Text/Balanced.pm @@ -10,7 +10,7 @@ use Exporter; use SelfLoader; use vars qw { $VERSION @ISA %EXPORT_TAGS }; -$VERSION = '1.89'; +$VERSION = '1.95'; @ISA = qw ( Exporter ); %EXPORT_TAGS = ( ALL => [ qw( @@ -30,15 +30,6 @@ $VERSION = '1.89'; Exporter::export_ok_tags('ALL'); -## -## These shenanagins are to avoid using $& in perl5.6+ -## -my $GetMatchedText = ($] < 5.006) ? eval 'sub { $& } ' - : eval 'sub { - substr($_[0], $-[0], $+[0] - $-[0]) - }'; - - # PROTOTYPES sub _match_bracketed($$$$$$); @@ -337,8 +328,7 @@ sub _match_tagged # ($$$$$$$) if (!defined $rdel) { - $rdelspec = &$GetMatchedText($$textref); - + $rdelspec = $&; unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes) { _failmsg "Unable to construct closing tag to match: $rdel", @@ -348,7 +338,16 @@ sub _match_tagged # ($$$$$$$) } else { - $rdelspec = eval "qq{$rdel}"; + $rdelspec = eval "qq{$rdel}" || do { + my $del; + for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',) + { next if $rdel =~ /\Q$_/; $del = $_; last } + unless ($del) { + use Carp; + croak "Can't interpolate right delimiter $rdel" + } + eval "qq$del$rdel$del"; + }; } while (pos($$textref) < length($$textref)) @@ -450,7 +449,7 @@ sub _match_variable($$) return; } my $varpos = pos($$textref); - unless ($$textref =~ m{\G\$\s*(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) + unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) { unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc) { @@ -472,6 +471,7 @@ sub _match_variable($$) while (1) { + next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc; next if _match_codeblock($textref, qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/, qr/[({[]/, qr/[)}\]]/, @@ -583,11 +583,13 @@ sub _match_codeblock($$$$$$$) # NEED TO COVER MANY MORE CASES HERE!!! - if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=? + if ($$textref =~ m#\G\s*(?!$ldel_inner) + ( [-+*x/%^&|.]=? | [!=]~ | =(?!>) | (\*\*|&&|\|\||<<|>>)=? | split|grep|map|return + | [([] )#gcx) { $patvalid = 1; @@ -717,7 +719,7 @@ sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match) ); } - unless ($$textref =~ m{\G((?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc) + unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc) { _failmsg q{No quotelike operator found after prefix at "} . substr($$textref, pos($$textref), 20) . @@ -908,7 +910,7 @@ sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunkno FIELD: while (pos($$textref) < length($$textref)) { - my $field; + my ($field, $rem); my @bits; foreach my $i ( 0..$#func ) { @@ -917,12 +919,13 @@ sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunkno $class = $class[$i]; $lastpos = pos $$textref; if (ref($func) eq 'CODE') - { ($field,undef,$pref) = @bits = $func->($$textref) } + { ($field,$rem,$pref) = @bits = $func->($$textref); + # print "[$field|$rem]" if $field; + } elsif (ref($func) eq 'Text::Balanced::Extractor') { @bits = $field = $func->extract($$textref) } elsif( $$textref =~ m/\G$func/gc ) - { @bits = $field = defined($1) ? $1 : &$GetMatchedText($$textref) } - # substr() on previous line is "$&", without the pain + { @bits = $field = defined($1) ? $1 : $& } $pref ||= ""; if (defined($field) && length($field)) { @@ -1057,7 +1060,7 @@ Text::Balanced - Extract delimited text sequences from strings. # Extract the initial substring of $text that is bounded by - # an HTML/XML tag. + # an XML tag. ($extracted, $remainder) = extract_tagged($text); @@ -1113,11 +1116,23 @@ Text::Balanced - Extract delimited text sequences from strings. =head1 DESCRIPTION -The various C subroutines may be used to extract a -delimited string (possibly after skipping a specified prefix string). -The search for the string always begins at the current C -location of the string's variable (or at index zero, if no C -position is defined). +The various C subroutines may be used to +extract a delimited substring, possibly after skipping a +specified prefix string. By default, that prefix is +optional whitespace (C), but you can change it to whatever +you wish (see below). + +The substring to be extracted must appear at the +current C location of the string's variable +(or at index zero, if no C position is defined). +In other words, the C subroutines I +extract the first occurance of a substring anywhere +in a string (like an unanchored regex would). Rather, +they extract an occurance of the substring appearing +immediately at the current matching position in the +string (like a C<\G>-anchored regex would). + + =head2 General behaviour in list contexts @@ -1219,7 +1234,7 @@ pattern C<'\s*'> - optional whitespace - is used. If the delimiter set is also not specified, the set C is used. If the text to be processed is not specified either, C<$_> is used. -In list context, C returns an array of three +In list context, C returns a array of three elements, the extracted substring (I), the remainder of the text, and the skipped prefix (if any). If a suitable delimited substring is not found, the first @@ -1375,6 +1390,58 @@ would correctly match something like this: See also: C<"extract_quotelike"> and C<"extract_codeblock">. +=head2 C + +C extracts any valid Perl variable or +variable-involved expression, including scalars, arrays, hashes, array +accesses, hash look-ups, method calls through objects, subroutine calles +through subroutine references, etc. + +The subroutine takes up to two optional arguments: + +=over 4 + +=item 1. + +A string to be processed (C<$_> if the string is omitted or C) + +=item 2. + +A string specifying a pattern to be matched as a prefix (which is to be +skipped). If omitted, optional whitespace is skipped. + +=back + +On success in a list context, an array of 3 elements is returned. The +elements are: + +=over 4 + +=item [0] + +the extracted variable, or variablish expression + +=item [1] + +the remainder of the input text, + +=item [2] + +the prefix substring (if any), + +=back + +On failure, all of these values (except the remaining text) are C. + +In a scalar context, C returns just the complete +substring that matched a variablish expression. C is returned on +failure. In addition, the original input text has the returned substring +(and any prefix) removed from it. + +In a void context, the input text just has the matched substring (and +any specified prefix) removed. + + =head2 C C extracts and segments text between (balanced) @@ -1392,7 +1459,7 @@ A string to be processed (C<$_> if the string is omitted or C) A string specifying a pattern to be matched as the opening tag. If the pattern string is omitted (or C) then a pattern -that matches any standard HTML/XML tag is used. +that matches any standard XML tag is used. =item 3. @@ -1427,7 +1494,7 @@ that must I appear within the tagged text. For example, to extract an HTML link (which should not contain nested links) use: - extract_tagged($text, '', '', undef, {reject => ['']} ); + extract_tagged($text, '', '', undef, {reject => ['']} ); =item C $listref> @@ -1437,7 +1504,7 @@ that are I be be treated as nested tags within the tagged text For example, to extract an arbitrary XML tag, but ignore "empty" elements: - extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); + extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); (also see L<"gen_delimited_pat"> below). @@ -1454,28 +1521,28 @@ C returns the complete text up to the point of failure. If the string is "PARA", C returns only the first paragraph after the tag (up to the first line that is either empty or contains only whitespace characters). -If the string is "", the default behaviour (i.e. failure) is reinstated. +If the string is "", the the default behaviour (i.e. failure) is reinstated. For example, suppose the start tag "/para" introduces a paragraph, which then continues until the next "/endpara" tag or until another "/para" tag is encountered: - $text = "/para line 1\n\nline 3\n/para line 4"; + $text = "/para line 1\n\nline 3\n/para line 4"; - extract_tagged($text, '/para', '/endpara', undef, - {reject => '/para', fail => MAX ); + extract_tagged($text, '/para', '/endpara', undef, + {reject => '/para', fail => MAX ); - # EXTRACTED: "/para line 1\n\nline 3\n" + # EXTRACTED: "/para line 1\n\nline 3\n" Suppose instead, that if no matching "/endpara" tag is found, the "/para" tag refers only to the immediately following paragraph: - $text = "/para line 1\n\nline 3\n/para line 4"; + $text = "/para line 1\n\nline 3\n/para line 4"; - extract_tagged($text, '/para', '/endpara', undef, - {reject => '/para', fail => MAX ); + extract_tagged($text, '/para', '/endpara', undef, + {reject => '/para', fail => MAX ); - # EXTRACTED: "/para line 1\n" + # EXTRACTED: "/para line 1\n" Note that the specified C behaviour applies to nested tags as well. @@ -1558,12 +1625,12 @@ be extracted from). In other words, the implementation of C is exactly equivalent to: - sub extract_tagged - { - my $text = shift; - $extractor = gen_extract_tagged(@_); - return $extractor->($text); - } + sub extract_tagged + { + my $text = shift; + $extractor = gen_extract_tagged(@_); + return $extractor->($text); + } (although C is not currently implemented that way, in order to preserve pre-5.005 compatibility). @@ -1582,13 +1649,13 @@ L) Nested backslashed delimiters, embedded balanced bracket delimiters (for the quotelike operators), and trailing modifiers are all caught. For example, in: - extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' - - extract_quotelike ' "You said, \"Use sed\"." ' + extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' + + extract_quotelike ' "You said, \"Use sed\"." ' - extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' + extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' - extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' + extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' the full Perl quotelike operations are all extracted correctly. @@ -1596,17 +1663,17 @@ Note too that, when using the /x modifier on a regex, any comment containing the current pattern delimiter will cause the regex to be immediately terminated. In other words: - 'm / - (?i) # CASE INSENSITIVE - [a-z_] # LEADING ALPHABETIC/UNDERSCORE - [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS - /x' + 'm / + (?i) # CASE INSENSITIVE + [a-z_] # LEADING ALPHABETIC/UNDERSCORE + [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS + /x' will be extracted as if it were: - 'm / - (?i) # CASE INSENSITIVE - [a-z_] # LEADING ALPHABETIC/' + 'm / + (?i) # CASE INSENSITIVE + [a-z_] # LEADING ALPHABETIC/' This behaviour is identical to that of the actual compiler. @@ -1653,7 +1720,7 @@ the right delimiter of the first block of the operation, =item [7] the left delimiter of the second block of the operation -(that is, if it is an C, C, or C), +(that is, if it is a C, C, or C), =item [8] @@ -1683,27 +1750,27 @@ prefix) removed. Examples: - # Remove the first quotelike literal that appears in text + # Remove the first quotelike literal that appears in text - $quotelike = extract_quotelike($text,'.*?'); + $quotelike = extract_quotelike($text,'.*?'); - # Replace one or more leading whitespace-separated quotelike - # literals in $_ with "" + # Replace one or more leading whitespace-separated quotelike + # literals in $_ with "" - do { $_ = join '', (extract_quotelike)[2,1] } until $@; + do { $_ = join '', (extract_quotelike)[2,1] } until $@; - # Isolate the search pattern in a quotelike operation from $text + # Isolate the search pattern in a quotelike operation from $text - ($op,$pat) = (extract_quotelike $text)[3,5]; - if ($op =~ /[ms]/) - { - print "search pattern: $pat\n"; - } - else - { - print "$op is not a pattern matching operation\n"; - } + ($op,$pat) = (extract_quotelike $text)[3,5]; + if ($op =~ /[ms]/) + { + print "search pattern: $pat\n"; + } + else + { + print "$op is not a pattern matching operation\n"; + } =head2 C and "here documents" @@ -1718,7 +1785,7 @@ here document might look like this: <<'EOMSG' || die; This is the message. EOMSG - exit; + exit; Given this as an input string in a scalar context, C would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", @@ -1771,7 +1838,7 @@ However, the matching position of the input variable would be set to which would cause the earlier " || die;\nexit;" to be skipped in any sequence of code fragment extractions. -To avoid this problem, when it encounters a here document while +To avoid this problem, when it encounters a here document whilst extracting from a modifiable string, C silently rearranges the string to an equivalent piece of Perl: @@ -1779,7 +1846,7 @@ rearranges the string to an equivalent piece of Perl: This is the message. EOMSG || die; - exit; + exit; in which the here document I contiguous. It still leaves the matching position after the here document, but now the rest of the line @@ -1811,7 +1878,7 @@ Omitting the third argument (prefix argument) implies optional whitespace at the Omitting the fourth argument (outermost delimiter brackets) indicates that the value of the second argument is to be used for the outermost delimiters. -Once the prefix an the outermost opening delimiter bracket have been +Once the prefix an dthe outermost opening delimiter bracket have been recognized, code blocks are extracted by stepping through the input text and trying the following alternatives in sequence: @@ -1898,7 +1965,7 @@ extracted substring removed from it. In all contexts C starts at the current C of the string, and sets that C appropriately after it matches. -Hence, the aim of a call to C in a list context +Hence, the aim of of a call to C in a list context is to split the processed string into as many non-overlapping fields as possible, by repeatedly applying each of the specified extractors to the remainder of the string. Thus C is @@ -1930,7 +1997,7 @@ is used. =item 3. -A number specifying the maximum number of fields to return. If this +An number specifying the maximum number of fields to return. If this argument is omitted (or C), split continues as long as possible. If the third argument is I, then extraction continues until I fields @@ -1986,7 +2053,7 @@ If none of the extractor subroutines succeeds, then one character is extracted from the start of the text and the extraction subroutines reapplied. Characters which are thus removed are accumulated and eventually become the next field (unless the fourth argument is true, in which -case they are discarded). +case they are disgarded). For example, the following extracts substrings that are valid Perl variables: diff --git a/lib/Text/Balanced/Changes b/lib/Text/Balanced/Changes index 2b42f94..c8c79fb 100644 --- a/lib/Text/Balanced/Changes +++ b/lib/Text/Balanced/Changes @@ -261,3 +261,41 @@ Revision history for Perl extension Text::Balanced. 1.89 Sun Nov 18 22:49:50 2001 - Fixed extvar.t tests + + +1.90 Tue Mar 25 11:14:38 2003 + + - Fixed subtle bug in gen_extract_tagged (thanks Martin) + + - Doc fix: removed suggestion that extract_tagged defaults + to matching HTML tags + + - Doc fix: clarified general matching behaviour + + - Fixed bug in parsing /.../ after a ( + + - Doc fix: documented extract_variable + + - Fixed extract_variable handling of $h{qr}, $h{tr}, etc. + (thanks, Briac) + + - Fixed incorrect handling of $::var (thanks Tim) + + +1.91 Fri Mar 28 23:19:17 2003 + + - Fixed error count on t/extract_variable.t + + - Fixed bug in extract_codelike when non-standard delimiters used + + +1.94 Sun Apr 13 02:18:41 2003 + + - rereleased in attempt to fix CPAN problems + + +1.95 Mon Apr 28 00:22:04 2003 + + - Constrainted _match_quote to only match at word boundaries + (so "exemplum(hic)" doesn't match "m(hic)") + (thanks Craig) diff --git a/lib/Text/Balanced/README b/lib/Text/Balanced/README index ef2f376..032bb23 100755 --- a/lib/Text/Balanced/README +++ b/lib/Text/Balanced/README @@ -1,5 +1,5 @@ ============================================================================== - Release of version 1.89 of Text::Balanced + Release of version 1.95 of Text::Balanced ============================================================================== @@ -66,10 +66,12 @@ COPYRIGHT ============================================================================== -CHANGES IN VERSION 1.89 +CHANGES IN VERSION 1.95 - - Fixed extvar.t tests + - Constrainted _match_quote to only match at word boundaries + (so "exemplum(hic)" doesn't match "m(hic)") + (thanks Craig) ============================================================================== @@ -77,8 +79,5 @@ CHANGES IN VERSION 1.89 AVAILABILITY Text::Balanced has been uploaded to the CPAN -and is also available from: - - http://www.csse.monash.edu.au/~damian/CPAN/Text-Balanced.tar.gz ============================================================================== diff --git a/lib/Text/Balanced/t/extcbk.t b/lib/Text/Balanced/t/extcbk.t index 69957ed..47b0045 100644 --- a/lib/Text/Balanced/t/extcbk.t +++ b/lib/Text/Balanced/t/extcbk.t @@ -59,12 +59,12 @@ while (defined($str = )) __DATA__ -# USING: extract_codeblock($str); -{ $data[4] =~ /['"]/; }; - # USING: extract_codeblock($str,'(){}',undef,'()'); (Foo(')')); +# USING: extract_codeblock($str); +{ $data[4] =~ /['"]/; }; + # USING: extract_codeblock($str,'<>'); < %x = ( try => "this") >; < %x = () >; diff --git a/lib/Text/Balanced/t/extvar.t b/lib/Text/Balanced/t/extvar.t index f8a46bb..2bda381 100644 --- a/lib/Text/Balanced/t/extvar.t +++ b/lib/Text/Balanced/t/extvar.t @@ -13,7 +13,7 @@ BEGIN { # Change 1..1 below to 1..last_test_to_print . # (It may become useful if the test is moved to ./t subdirectory.) -BEGIN { $| = 1; print "1..181\n"; } +BEGIN { $| = 1; print "1..183\n"; } END {print "not ok 1\n" unless $loaded;} use Text::Balanced qw ( extract_variable ); $loaded = 1; @@ -65,6 +65,7 @@ $a->; $a (1..3) { print $a }; # USING: extract_variable($str); +$::obj; $obj->nextval; *var; *$var; diff --git a/lib/Text/Balanced/t/gentag.t b/lib/Text/Balanced/t/gentag.t index ae94c54..7b150a6 100644 --- a/lib/Text/Balanced/t/gentag.t +++ b/lib/Text/Balanced/t/gentag.t @@ -13,7 +13,7 @@ BEGIN { # Change 1..1 below to 1..last_test_to_print . # (It may become useful if the test is moved to ./t subdirectory.) -BEGIN { $| = 1; print "1..35\n"; } +BEGIN { $| = 1; print "1..37\n"; } END {print "not ok 1\n" unless $loaded;} use Text::Balanced qw ( gen_extract_tagged ); $loaded = 1; @@ -65,6 +65,9 @@ while (defined($str = )) __DATA__ +# USING: gen_extract_tagged('{','}'); + { a test }; + # USING: gen_extract_tagged(qr/<[A-Z]+>/,undef, undef, {ignore=>["
"]});
aaabbb
ccc
ddd
;