1 package Text::ParseWords;
3 use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE);
10 @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
11 @EXPORT_OK = qw(old_shellwords);
18 foreach my $line (@lines) {
20 my @words = parse_line('\s+', 0, $line);
21 pop @words if (@words and !defined $words[-1]);
22 return() unless (@words || !length($line));
23 push(@allwords, @words);
31 my($delim, $keep, @lines) = @_;
32 my($line, @words, @allwords);
34 foreach $line (@lines) {
35 @words = parse_line($delim, $keep, $line);
36 return() unless (@words || !length($line));
37 push(@allwords, @words);
44 sub nested_quotewords {
45 my($delim, $keep, @lines) = @_;
48 for ($i = 0; $i < @lines; $i++) {
49 @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
50 return() unless (@{$allwords[$i]} || !length($lines[$i]));
58 my($delimiter, $keep, $line) = @_;
61 no warnings 'uninitialized'; # we will be testing undef strings
63 while (length($line)) {
64 # This pattern is optimised to be stack conservative on older perls.
65 # Do not refactor without being careful and testing it on very long strings.
66 # See Perl bug #42980 for an example of a stack busting input.
69 # double quoted string
71 ((?>[^\\"]*(?:\\.[^\\"]*)*))" # $quoted
75 ((?>[^\\']*(?:\\.[^\\']*)*))' # $quoted
85 (?-x:$delimiter) # delimiter
87 (?!^)(?=["']) # a quote
89 )//xs or return; # extended layout
90 my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);
93 return() unless( defined($quote) || length($unquoted) || length($delim));
96 $quoted = "$quote$quoted$quote";
99 $unquoted =~ s/\\(.)/$1/sg;
100 if (defined $quote) {
101 $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
102 $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
105 $word .= substr($line, 0, 0); # leave results tainted
106 $word .= defined $quote ? $quoted : $unquoted;
108 if (length($delim)) {
109 push(@pieces, $word);
110 push(@pieces, $delim) if ($keep eq 'delimiters');
113 if (!length($line)) {
114 push(@pieces, $word);
126 # @words = old_shellwords($line);
128 # @words = old_shellwords(@lines);
130 # @words = old_shellwords(); # defaults to $_ (and clobbers it)
132 no warnings 'uninitialized'; # we will be testing undef strings
133 local *_ = \join('', @_) if @_;
134 my (@words, $snippet);
138 my $field = substr($_, 0, 0); # leave results tainted
140 if (s/\A"(([^"\\]|\\.)*)"//s) {
141 ($snippet = $1) =~ s#\\(.)#$1#sg;
145 Carp::carp("Unmatched double quote: $_");
148 elsif (s/\A'(([^'\\]|\\.)*)'//s) {
149 ($snippet = $1) =~ s#\\(.)#$1#sg;
153 Carp::carp("Unmatched single quote: $_");
156 elsif (s/\A\\(.?)//s) {
159 elsif (s/\A([^\s\\'"]+)//) {
168 push(@words, $field);
179 Text::ParseWords - parse text into an array of tokens or array of arrays
183 use Text::ParseWords;
184 @lists = &nested_quotewords($delim, $keep, @lines);
185 @words = &quotewords($delim, $keep, @lines);
186 @words = &shellwords(@lines);
187 @words = &parse_line($delim, $keep, $line);
188 @words = &old_shellwords(@lines); # DEPRECATED!
192 The &nested_quotewords() and &quotewords() functions accept a delimiter
193 (which can be a regular expression)
194 and a list of lines and then break those lines up into a list of
195 words ignoring delimiters that appear inside quotes. &quotewords()
196 returns all of the tokens in a single long list, while &nested_quotewords()
197 returns a list of token lists corresponding to the elements of @lines.
198 &parse_line() does tokenizing on a single string. The &*quotewords()
199 functions simply call &parse_line(), so if you're only splitting
200 one line you can call &parse_line() directly and save a function
203 The $keep argument is a boolean flag. If true, then the tokens are
204 split on the specified delimiter, but all other characters (quotes,
205 backslashes, etc.) are kept in the tokens. If $keep is false then the
206 &*quotewords() functions remove all quotes and backslashes that are
207 not themselves backslash-escaped or inside of single quotes (i.e.,
208 &quotewords() tries to interpret these characters just like the Bourne
209 shell). NB: these semantics are significantly different from the
210 original version of this module shipped with Perl 5.000 through 5.004.
211 As an additional feature, $keep may be the keyword "delimiters" which
212 causes the functions to preserve the delimiters in each string as
213 tokens in the token lists, in addition to preserving quote and
214 backslash characters.
216 &shellwords() is written as a special case of &quotewords(), and it
217 does token parsing with whitespace as a delimiter-- similar to most
224 use Text::ParseWords;
225 @words = &quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
251 multiple spaces are skipped because of our $delim
255 use of quotes to include a space in a word
259 use of a backslash to include a space in a word
263 use of a backslash to remove the special meaning of a double-quote
267 another simple word (note the lack of effect of the
268 backslashed double-quote)
272 Replacing C<&quotewords('\s+', 0, q{this is...})>
273 with C<&shellwords(q{this is...})>
274 is a simpler way to accomplish the same thing.
278 Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
279 author unknown). Much of the code for &parse_line() (including the
280 primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
282 Examples section and other documentation provided by John Heidemann
285 Bug reports, patches, and nagging provided by lots of folks-- thanks
286 everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
287 for assuring me that a &nested_quotewords() would be useful, and to
288 Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
289 error-checking (sort of-- you had to be there).