X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FParseWords.pm;h=1411986cd9a7e4f305925a13ff88378fb603b96c;hb=46f0e7a52641befa6f3ed588e017a5454a12be04;hp=f2e151497255718906071725c131be7f288c9b50;hpb=456e8aa72f2beaf704043243be43dba10a5d648f;p=p5sagit%2Fp5-mst-13.2.git

diff --git a/lib/Text/ParseWords.pm b/lib/Text/ParseWords.pm
index f2e1514..1411986 100644
--- a/lib/Text/ParseWords.pm
+++ b/lib/Text/ParseWords.pm
@@ -1,133 +1,104 @@
 package Text::ParseWords;
 
+use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE);
+$VERSION = "3.25";
+
 require 5.000;
-require Exporter;
-require AutoLoader;
-use Carp;
 
-@ISA = qw(Exporter AutoLoader);
-@EXPORT = qw(shellwords quotewords);
+use Exporter;
+@ISA = qw(Exporter);
+@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
 @EXPORT_OK = qw(old_shellwords);
 
-=head1 NAME
-
-Text::ParseWords - parse text into an array of tokens
 
-=head1 SYNOPSIS
+sub shellwords {
+    my (@lines) = @_;
+    my @allwords;
 
-  use Text::ParseWords;
-  @words = &quotewords($delim, $keep, @lines);
-  @words = &shellwords(@lines);
-  @words = &old_shellwords(@lines);
+    foreach my $line (@lines) {
+	$line =~ s/^\s+//;
+	my @words = parse_line('\s+', 0, $line);
+	pop @words if (@words and !defined $words[-1]);
+	return() unless (@words || !length($line));
+	push(@allwords, @words);
+    }
+    return(@allwords);
+}
 
-=head1 DESCRIPTION
 
-&quotewords() accepts a delimiter (which can be a regular expression)
-and a list of lines and then breaks those lines up into a list of
-words ignoring delimiters that appear inside quotes.
 
-The $keep argument is a boolean flag.  If true, the quotes are kept
-with each word, otherwise quotes are stripped in the splitting process.
-$keep also defines whether unprotected backslashes are retained.
+sub quotewords {
+    my($delim, $keep, @lines) = @_;
+    my($line, @words, @allwords);
 
-A &shellwords() replacement is included to demonstrate the new package.
-This version differs from the original in that it will _NOT_ default
-to using $_ if no arguments are given.  I personally find the old behavior
-to be a mis-feature.
+    foreach $line (@lines) {
+	@words = parse_line($delim, $keep, $line);
+	return() unless (@words || !length($line));
+	push(@allwords, @words);
+    }
+    return(@allwords);
+}
 
 
-&quotewords() works by simply jamming all of @lines into a single
-string in $_ and then pulling off words a bit at a time until $_
-is exhausted.
 
-=head1 AUTHORS
+sub nested_quotewords {
+    my($delim, $keep, @lines) = @_;
+    my($i, @allwords);
 
-Hal Pomeranz (pomeranz@netcom.com), 23 March 1994
+    for ($i = 0; $i < @lines; $i++) {
+	@{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
+	return() unless (@{$allwords[$i]} || !length($lines[$i]));
+    }
+    return(@allwords);
+}
 
-Basically an update and generalization of the old shellwords.pl.
-Much code shamelessly stolen from the old version (author unknown).
 
-=cut
 
-1;
-__END__
+sub parse_line {
+    my($delimiter, $keep, $line) = @_;
+    my($word, @pieces);
 
-sub shellwords {
-    local(@lines) = @_;
-    $lines[$#lines] =~ s/\s+$//;
-    &quotewords('\s+', 0, @lines);
-}
+    no warnings 'uninitialized';	# we will be testing undef strings
 
+    while (length($line)) {
+	$line =~ s/^(["'])			# a $quote
+        	    ((?:\\.|(?!\1)[^\\])*)	# and $quoted text
+		    \1				# followed by the same quote
+		   |				# --OR--
+		   ^((?:\\.|[^\\"'])*?)		# an $unquoted text
+		    (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))  
+		    				# plus EOL, delimiter, or quote
+		  //xs or return;		# extended layout
+	my($quote, $quoted, $unquoted, $delim) = ($1, $2, $3, $4);
+	return() unless( defined($quote) || length($unquoted) || length($delim));
 
-
-sub quotewords {
-
-# The inner "for" loop builds up each word (or $field) one $snippet
-# at a time.  A $snippet is a quoted string, a backslashed character,
-# or an unquoted string.  We fall out of the "for" loop when we reach
-# the end of $_ or when we hit a delimiter.  Falling out of the "for"
-# loop, we push the $field we've been building up onto the list of
-# @words we'll be returning, and then loop back and pull another word
-# off of $_.
-#
-# The first two cases inside the "for" loop deal with quoted strings.
-# The first case matches a double quoted string, removes it from $_,
-# and assigns the double quoted string to $snippet in the body of the
-# conditional.  The second case handles single quoted strings.  In
-# the third case we've found a quote at the current beginning of $_,
-# but it didn't match the quoted string regexps in the first two cases,
-# so it must be an unbalanced quote and we croak with an error (which can
-# be caught by eval()).
-#
-# The next case handles backslashed characters, and the next case is the
-# exit case on reaching the end of the string or finding a delimiter.
-#
-# Otherwise, we've found an unquoted thing and we pull of characters one
-# at a time until we reach something that could start another $snippet--
-# a quote of some sort, a backslash, or the delimiter.  This one character
-# at a time behavior was necessary if the delimiter was going to be a
-# regexp (love to hear it if you can figure out a better way).
-
-    local($delim, $keep, @lines) = @_;
-    local(@words,$snippet,$field,$_);
-
-    $_ = join('', @lines);
-    while (length($_)) {
-	$field = '';
-	for (;;) {
-	    $snippet = '';
-	    if (s/^"(([^"\\]|\\.)*)"//) {
-		$snippet = $1;
-                $snippet = "\"$snippet\"" if ($keep);
-	    }
-	    elsif (s/^'(([^'\\]|\\.)*)'//) {
-		$snippet = $1;
-                $snippet = "'$snippet'" if ($keep);
-	    }
-	    elsif (/^["']/) {
-		croak "Unmatched quote";
-	    }
-            elsif (s/^\\(.)//) {
-                $snippet = $1;
-                $snippet = "\\$snippet" if ($keep);
+        if ($keep) {
+	    $quoted = "$quote$quoted$quote";
+	}
+        else {
+	    $unquoted =~ s/\\(.)/$1/sg;
+	    if (defined $quote) {
+		$quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
+		$quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
             }
-	    elsif (!length($_) || s/^$delim//) {
-               last;
-	    }
-	    else {
-                while ($_ ne '' && !(/^$delim/ || /^['"\\]/)) {
-		   $snippet .=  substr($_, 0, 1);
-                   substr($_, 0, 1) = '';
-                }
-	    }
-	    $field .= $snippet;
 	}
-	push(@words, $field);
+        $word .= substr($line, 0, 0);	# leave results tainted
+        $word .= defined $quote ? $quoted : $unquoted;
+ 
+        if (length($delim)) {
+            push(@pieces, $word);
+            push(@pieces, $delim) if ($keep eq 'delimiters');
+            undef $word;
+        }
+        if (!length($line)) {
+            push(@pieces, $word);
+	}
     }
-    @words;
+    return(@pieces);
 }
 
 
+
 sub old_shellwords {
 
     # Usage:
@@ -135,39 +106,166 @@ sub old_shellwords {
     #	@words = old_shellwords($line);
     #	or
     #	@words = old_shellwords(@lines);
+    #	or
+    #	@words = old_shellwords();	# defaults to $_ (and clobbers it)
 
-    local($_) = join('', @_);
-    my(@words,$snippet,$field);
+    no warnings 'uninitialized';	# we will be testing undef strings
+    local *_ = \join('', @_) if @_;
+    my (@words, $snippet);
 
-    s/^\s+//;
+    s/\A\s+//;
     while ($_ ne '') {
-	$field = '';
+	my $field = substr($_, 0, 0);	# leave results tainted
 	for (;;) {
-	    if (s/^"(([^"\\]|\\.)*)"//) {
-		($snippet = $1) =~ s#\\(.)#$1#g;
+	    if (s/\A"(([^"\\]|\\.)*)"//s) {
+		($snippet = $1) =~ s#\\(.)#$1#sg;
 	    }
-	    elsif (/^"/) {
-		croak "Unmatched double quote: $_";
+	    elsif (/\A"/) {
+		require Carp;
+		Carp::carp("Unmatched double quote: $_");
+		return();
 	    }
-	    elsif (s/^'(([^'\\]|\\.)*)'//) {
-		($snippet = $1) =~ s#\\(.)#$1#g;
+	    elsif (s/\A'(([^'\\]|\\.)*)'//s) {
+		($snippet = $1) =~ s#\\(.)#$1#sg;
 	    }
-	    elsif (/^'/) {
-		croak "Unmatched single quote: $_";
+	    elsif (/\A'/) {
+		require Carp;
+		Carp::carp("Unmatched single quote: $_");
+		return();
 	    }
-	    elsif (s/^\\(.)//) {
+	    elsif (s/\A\\(.)//s) {
 		$snippet = $1;
 	    }
-	    elsif (s/^([^\s\\'"]+)//) {
+	    elsif (s/\A([^\s\\'"]+)//) {
 		$snippet = $1;
 	    }
 	    else {
-		s/^\s+//;
+		s/\A\s+//;
 		last;
 	    }
 	    $field .= $snippet;
 	}
 	push(@words, $field);
     }
-    @words;
+    return @words;
 }
+
+1;
+
+__END__
+
+=head1 NAME
+
+Text::ParseWords - parse text into an array of tokens or array of arrays
+
+=head1 SYNOPSIS
+
+  use Text::ParseWords;
+  @lists = &nested_quotewords($delim, $keep, @lines);
+  @words = &quotewords($delim, $keep, @lines);
+  @words = &shellwords(@lines);
+  @words = &parse_line($delim, $keep, $line);
+  @words = &old_shellwords(@lines); # DEPRECATED!
+
+=head1 DESCRIPTION
+
+The &nested_quotewords() and &quotewords() functions accept a delimiter 
+(which can be a regular expression)
+and a list of lines and then breaks those lines up into a list of
+words ignoring delimiters that appear inside quotes.  &quotewords()
+returns all of the tokens in a single long list, while &nested_quotewords()
+returns a list of token lists corresponding to the elements of @lines.
+&parse_line() does tokenizing on a single string.  The &*quotewords()
+functions simply call &parse_line(), so if you're only splitting
+one line you can call &parse_line() directly and save a function
+call.
+
+The $keep argument is a boolean flag.  If true, then the tokens are
+split on the specified delimiter, but all other characters (quotes,
+backslashes, etc.) are kept in the tokens.  If $keep is false then the
+&*quotewords() functions remove all quotes and backslashes that are
+not themselves backslash-escaped or inside of single quotes (i.e.,
+&quotewords() tries to interpret these characters just like the Bourne
+shell).  NB: these semantics are significantly different from the
+original version of this module shipped with Perl 5.000 through 5.004.
+As an additional feature, $keep may be the keyword "delimiters" which
+causes the functions to preserve the delimiters in each string as
+tokens in the token lists, in addition to preserving quote and
+backslash characters.
+
+&shellwords() is written as a special case of &quotewords(), and it
+does token parsing with whitespace as a delimiter-- similar to most
+Unix shells.
+
+=head1 EXAMPLES
+
+The sample program:
+
+  use Text::ParseWords;
+  @words = &quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
+  $i = 0;
+  foreach (@words) {
+      print "$i: <$_>\n";
+      $i++;
+  }
+
+produces:
+
+  0: <this>
+  1: <is>
+  2: <a test>
+  3: <of quotewords>
+  4: <"for>
+  5: <you>
+
+demonstrating:
+
+=over 4
+
+=item 0
+
+a simple word
+
+=item 1
+
+multiple spaces are skipped because of our $delim
+
+=item 2
+
+use of quotes to include a space in a word
+
+=item 3
+
+use of a backslash to include a space in a word
+
+=item 4
+
+use of a backslash to remove the special meaning of a double-quote
+
+=item 5
+
+another simple word (note the lack of effect of the
+backslashed double-quote)
+
+=back
+
+Replacing C<&quotewords('\s+', 0, q{this   is...})>
+with C<&shellwords(q{this   is...})>
+is a simpler way to accomplish the same thing.
+
+=head1 AUTHORS
+
+Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
+author unknown).  Much of the code for &parse_line() (including the
+primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
+
+Examples section another documentation provided by John Heidemann 
+<johnh@ISI.EDU>
+
+Bug reports, patches, and nagging provided by lots of folks-- thanks
+everybody!  Special thanks to Michael Schwern <schwern@envirolink.org>
+for assuring me that a &nested_quotewords() would be useful, and to 
+Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
+error-checking (sort of-- you had to be there).
+
+=cut