1 package Text::ParseWords;
9 our @ISA = qw(Exporter);
10 our @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
11 our @EXPORT_OK = qw(old_shellwords);
12 our $PERL_SINGLE_QUOTE;
19 foreach my $line (@lines) {
21 my @words = parse_line('\s+', 0, $line);
22 pop @words if (@words and !defined $words[-1]);
23 return() unless (@words || !length($line));
24 push(@allwords, @words);
32 my($delim, $keep, @lines) = @_;
33 my($line, @words, @allwords);
35 foreach $line (@lines) {
36 @words = parse_line($delim, $keep, $line);
37 return() unless (@words || !length($line));
38 push(@allwords, @words);
45 sub nested_quotewords {
46 my($delim, $keep, @lines) = @_;
49 for ($i = 0; $i < @lines; $i++) {
50 @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
51 return() unless (@{$allwords[$i]} || !length($lines[$i]));
59 my($delimiter, $keep, $line) = @_;
62 no warnings 'uninitialized'; # we will be testing undef strings
64 while (length($line)) {
65 # This pattern is optimised to be stack conservative on older perls.
66 # Do not refactor without being careful and testing it on very long strings.
67 # See Perl bug #42980 for an example of a stack busting input.
70 # double quoted string
72 ((?>[^\\"]*(?:\\.[^\\"]*)*))" # $quoted
76 ((?>[^\\']*(?:\\.[^\\']*)*))' # $quoted
86 (?-x:$delimiter) # delimiter
88 (?!^)(?=["']) # a quote
90 )//xs or return; # extended layout
91 my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);
94 return() unless( defined($quote) || length($unquoted) || length($delim));
97 $quoted = "$quote$quoted$quote";
100 $unquoted =~ s/\\(.)/$1/sg;
101 if (defined $quote) {
102 $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
103 $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
106 $word .= substr($line, 0, 0); # leave results tainted
107 $word .= defined $quote ? $quoted : $unquoted;
109 if (length($delim)) {
110 push(@pieces, $word);
111 push(@pieces, $delim) if ($keep eq 'delimiters');
114 if (!length($line)) {
115 push(@pieces, $word);
127 # @words = old_shellwords($line);
129 # @words = old_shellwords(@lines);
131 # @words = old_shellwords(); # defaults to $_ (and clobbers it)
133 no warnings 'uninitialized'; # we will be testing undef strings
134 local *_ = \join('', @_) if @_;
135 my (@words, $snippet);
139 my $field = substr($_, 0, 0); # leave results tainted
141 if (s/\A"(([^"\\]|\\.)*)"//s) {
142 ($snippet = $1) =~ s#\\(.)#$1#sg;
146 Carp::carp("Unmatched double quote: $_");
149 elsif (s/\A'(([^'\\]|\\.)*)'//s) {
150 ($snippet = $1) =~ s#\\(.)#$1#sg;
154 Carp::carp("Unmatched single quote: $_");
157 elsif (s/\A\\(.?)//s) {
160 elsif (s/\A([^\s\\'"]+)//) {
169 push(@words, $field);
180 Text::ParseWords - parse text into an array of tokens or array of arrays
184 use Text::ParseWords;
185 @lists = nested_quotewords($delim, $keep, @lines);
186 @words = quotewords($delim, $keep, @lines);
187 @words = shellwords(@lines);
188 @words = parse_line($delim, $keep, $line);
189 @words = old_shellwords(@lines); # DEPRECATED!
193 The &nested_quotewords() and &quotewords() functions accept a delimiter
194 (which can be a regular expression)
195 and a list of lines and then break those lines up into a list of
196 words ignoring delimiters that appear inside quotes. &quotewords()
197 returns all of the tokens in a single long list, while &nested_quotewords()
198 returns a list of token lists corresponding to the elements of @lines.
199 &parse_line() does tokenizing on a single string. The &*quotewords()
200 functions simply call &parse_line(), so if you're only splitting
201 one line you can call &parse_line() directly and save a function
204 The $keep argument is a boolean flag. If true, then the tokens are
205 split on the specified delimiter, but all other characters (quotes,
206 backslashes, etc.) are kept in the tokens. If $keep is false then the
207 &*quotewords() functions remove all quotes and backslashes that are
208 not themselves backslash-escaped or inside of single quotes (i.e.,
209 &quotewords() tries to interpret these characters just like the Bourne
210 shell). NB: these semantics are significantly different from the
211 original version of this module shipped with Perl 5.000 through 5.004.
212 As an additional feature, $keep may be the keyword "delimiters" which
213 causes the functions to preserve the delimiters in each string as
214 tokens in the token lists, in addition to preserving quote and
215 backslash characters.
217 &shellwords() is written as a special case of &quotewords(), and it
218 does token parsing with whitespace as a delimiter-- similar to most
225 use Text::ParseWords;
226 @words = quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
252 multiple spaces are skipped because of our $delim
256 use of quotes to include a space in a word
260 use of a backslash to include a space in a word
264 use of a backslash to remove the special meaning of a double-quote
268 another simple word (note the lack of effect of the
269 backslashed double-quote)
273 Replacing C<quotewords('\s+', 0, q{this is...})>
274 with C<shellwords(q{this is...})>
275 is a simpler way to accomplish the same thing.
279 Maintainer: Alexandr Ciornii <alexchornyATgmail.com>.
281 Previous maintainer: Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
282 author unknown). Much of the code for &parse_line() (including the
283 primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
285 Examples section and other documentation provided by John Heidemann
288 Bug reports, patches, and nagging provided by lots of folks-- thanks
289 everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
290 for assuring me that a &nested_quotewords() would be useful, and to
291 Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
292 error-checking (sort of-- you had to be there).