# Fix for: [perl #30442] Text::ParseWords does not handle backslashed newline inside...
# [p5sagit/p5-mst-13.2.git] / lib / Text / ParseWords.pm
# Blame listing; columns: Commit, Line, Data
package Text::ParseWords;

# Package globals: the module version, the Exporter configuration, and
# $PERL_SINGLE_QUOTE, the switch parse_line() consults when unescaping
# single-quoted text.
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $PERL_SINGLE_QUOTE);
$VERSION = "3.22";

require 5.000;

use Exporter;
@ISA = qw(Exporter);

# Exported by default.
@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);

# Exported only on request.
@EXPORT_OK = qw(old_shellwords);

# shellwords(@lines)
#
# Splits @lines into words using runs of whitespace as the delimiter by
# delegating to quotewords('\s+', 0, ...).  Returns whatever quotewords()
# returns, or the empty list when called without any lines.
sub shellwords {
    # Work on a lexical copy so that neither the caller's strings nor a
    # package-level @lines are modified.
    my @lines = @_;

    # Without arguments there is no last element to trim, and $lines[-1]
    # on an empty array is not a valid lvalue.
    return() unless @lines;

    # Drop trailing whitespace from the final line: parse_line() would
    # otherwise append an undefined word after the last delimiter.
    $lines[-1] =~ s/\s+$//;

    return(quotewords('\s+', 0, @lines));
}


# quotewords($delim, $keep, @lines)
#
# Runs parse_line($delim, $keep, $line) on every element of @lines and
# returns all resulting tokens as one flat list.  Returns the empty list
# as soon as a non-empty line yields no tokens.
sub quotewords {
    my ($delim, $keep, @lines) = @_;
    my @allwords;

    foreach my $line (@lines) {
        my @words = parse_line($delim, $keep, $line);

        # No tokens from a non-empty line means parse_line() gave up on it.
        return() if !@words && length($line);

        push(@allwords, @words);
    }
    return(@allwords);
}


# nested_quotewords($delim, $keep, @lines)
#
# Runs parse_line($delim, $keep, $line) on every element of @lines and
# returns one array reference of tokens per line, in the order of @lines.
# Returns the empty list as soon as a non-empty line yields no tokens.
sub nested_quotewords {
    my ($delim, $keep, @lines) = @_;
    my @allwords;

    foreach my $i (0 .. $#lines) {
        @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);

        # No tokens from a non-empty line means parse_line() gave up on it.
        return() if !@{$allwords[$i]} && length($lines[$i]);
    }
    return(@allwords);
}


# parse_line($delimiter, $keep, $line)
#
# Tokenizes the single string $line.
#
#   $delimiter  regular expression source separating the tokens
#   $keep       false:        quotes are removed and backslash escapes are
#                             resolved (inside single quotes only when
#                             $PERL_SINGLE_QUOTE is true, and then only for
#                             \\ and \')
#               true:         quotes and backslashes are kept in the tokens
#               'delimiters': as for true, and each matched delimiter is
#                             returned as a token of its own
#
# Returns the list of tokens.  The empty list is returned when $line is
# empty or when the pattern cannot consume the rest of it (for instance an
# unbalanced quote).  If $line ends with a delimiter, the final token is
# undef.
sub parse_line {
    # We will be testing undef strings
    no warnings;
    use re 'taint'; # if it's tainted, leave it as such

    my($delimiter, $keep, $line) = @_;
    my($word, @pieces);

    while (length($line)) {
        my($quote, $quoted, $unquoted, $delim);

        # Either a quoted chunk (captures 1-3) or an unquoted chunk
        # followed by its terminator (captures 4-6) is taken off the front
        # of $line; the captures of the alternative that did not match are
        # undef.
        ($quote, $quoted, undef, $unquoted, $delim, undef) =
            $line =~ m/^(["'])                 # a $quote
                        ((?:\\[\000-\377]|(?!\1)[^\\])*)    # and $quoted text
                        \1                     # followed by the same quote
                        ([\000-\377]*)         # and the rest
                       |                       # --OR--
                        ^((?:\\[\000-\377]|[^\\"'])*?)      # an $unquoted text
                        (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))
                                               # plus EOL, delimiter, or quote
                        ([\000-\377]*)         # the rest
                       /x;                     # extended layout
        return() unless( $quote || length($unquoted) || length($delim));

        # $+ is the last group that matched, i.e. "the rest" of whichever
        # alternative succeeded.  It must be read before any other match.
        $line = $+;

        if ($keep) {
            $quoted = "$quote$quoted$quote";
        }
        else {
            $unquoted =~ s/\\([\000-\377])/$1/g;
            if (defined $quote) {
                $quoted =~ s/\\([\000-\377])/$1/g if ($quote eq '"');
                $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
            }
        }

        # Adjacent quoted and unquoted chunks accumulate into one word
        # until a delimiter or the end of the line closes it.
        $word .= defined $quote ? $quoted : $unquoted;

        if (length($delim)) {
            push(@pieces, $word);
            push(@pieces, $delim) if ($keep eq 'delimiters');
            undef $word;
        }
        if (!length($line)) {
            push(@pieces, $word);
        }
    }
    return(@pieces);
}


# old_shellwords(@lines)
#
# Usage:
#     use Text::ParseWords qw(old_shellwords);
#     @words = old_shellwords($line);
#     or
#     @words = old_shellwords(@lines);
#
# Joins its arguments into one string and splits it into whitespace
# separated words.  Double- and single-quoted sections and backslash
# escapes are resolved, and each escaped character is taken literally.
# Returns the list of words, or the empty list if a quote is left
# unbalanced.
sub old_shellwords {
    local($_) = join('', @_);
    my(@words, $snippet, $field);

    s/\A\s+//;
    while ($_ ne '') {
        $field = '';
        for (;;) {
            # /s lets "." match a newline, so a backslashed newline inside
            # quotes is an ordinary escaped character.
            if (s/\A"(([^"\\]|\\.)*)"//s) {
                ($snippet = $1) =~ s#\\(.)#$1#sg;
            }
            elsif (/\A"/) {
                # Opening double quote without a closing one.
                return();
            }
            elsif (s/\A'(([^'\\]|\\.)*)'//s) {
                ($snippet = $1) =~ s#\\(.)#$1#sg;
            }
            elsif (/\A'/) {
                # Opening single quote without a closing one.
                return();
            }
            elsif (s/\A\\(.?)//s) {
                # ".?" also consumes a lone trailing backslash; otherwise
                # nothing would ever remove it and the outer loop would
                # never terminate.
                $snippet = $1;
            }
            elsif (s/\A([^\s\\'"]+)//) {
                $snippet = $1;
            }
            else {
                # End of the current word: skip the separating whitespace.
                s/\A\s+//;
                last;
            }
            $field .= $snippet;
        }
        push(@words, $field);
    }
    return @words;
}

1;

__END__

=head1 NAME

Text::ParseWords - parse text into an array of tokens or array of arrays

=head1 SYNOPSIS

  use Text::ParseWords;
  @lists = &nested_quotewords($delim, $keep, @lines);
  @words = &quotewords($delim, $keep, @lines);
  @words = &shellwords(@lines);
  @words = &parse_line($delim, $keep, $line);
  @words = &old_shellwords(@lines); # DEPRECATED!

=head1 DESCRIPTION

The &nested_quotewords() and &quotewords() functions accept a delimiter
(which can be a regular expression)
and a list of lines and then break those lines up into a list of
words ignoring delimiters that appear inside quotes.  &quotewords()
returns all of the tokens in a single long list, while &nested_quotewords()
returns a list of token lists corresponding to the elements of @lines.
&parse_line() does tokenizing on a single string.  The &*quotewords()
functions simply call &parse_line(), so if you're only splitting
one line you can call &parse_line() directly and save a function
call.

The $keep argument is a boolean flag.  If true, then the tokens are
split on the specified delimiter, but all other characters (quotes,
backslashes, etc.) are kept in the tokens.  If $keep is false then the
&*quotewords() functions remove all quotes and backslashes that are
not themselves backslash-escaped or inside of single quotes (i.e.,
&quotewords() tries to interpret these characters just like the Bourne
shell).  NB: these semantics are significantly different from the
original version of this module shipped with Perl 5.000 through 5.004.
As an additional feature, $keep may be the keyword "delimiters" which
causes the functions to preserve the delimiters in each string as
tokens in the token lists, in addition to preserving quote and
backslash characters.

&shellwords() is written as a special case of &quotewords(), and it
does token parsing with whitespace as a delimiter-- similar to most
Unix shells.

=head1 EXAMPLES

The sample program:

  use Text::ParseWords;
  @words = &quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
  $i = 0;
  foreach (@words) {
      print "$i: <$_>\n";
      $i++;
  }

produces:

  0: <this>
  1: <is>
  2: <a test>
  3: <of quotewords>
  4: <"for>
  5: <you>

demonstrating:

=over 4

=item 0

a simple word

=item 1

multiple spaces are skipped because of our $delim

=item 2

use of quotes to include a space in a word

=item 3

use of a backslash to include a space in a word

=item 4

use of a backslash to remove the special meaning of a double-quote

=item 5

another simple word (note the lack of effect of the
backslashed double-quote)

=back

Replacing C<&quotewords('\s+', 0, q{this   is...})>
with C<&shellwords(q{this   is...})>
is a simpler way to accomplish the same thing.

=head1 AUTHORS

Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
author unknown).  Much of the code for &parse_line() (including the
primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.

Examples section and other documentation provided by John Heidemann
<johnh@ISI.EDU>

Bug reports, patches, and nagging provided by lots of folks-- thanks
everybody!  Special thanks to Michael Schwern <schwern@envirolink.org>
for assuring me that a &nested_quotewords() would be useful, and to
Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
error-checking (sort of-- you had to be there).

=cut