Cleanup the main regex in Text::ParseWords and make the
[p5sagit/p5-mst-13.2.git] / lib / Text / ParseWords.pm
CommitLineData
a0d0e21e 1package Text::ParseWords;
2
b174585d 3use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE);
429b060a 4$VERSION = "3.23";
a0d0e21e 5
9b599b2a 6require 5.000;
dc848c6f 7
9b599b2a 8use Exporter;
dc848c6f 9@ISA = qw(Exporter);
9b599b2a 10@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
a0d0e21e 11@EXPORT_OK = qw(old_shellwords);
12
a5f75d66 13
9b599b2a 14sub shellwords {
15 local(@lines) = @_;
16 $lines[$#lines] =~ s/\s+$//;
17 return(quotewords('\s+', 0, @lines));
18}
a5f75d66 19
a5f75d66 20
a5f75d66 21
9b599b2a 22sub quotewords {
23 my($delim, $keep, @lines) = @_;
24 my($line, @words, @allwords);
25
26
27 foreach $line (@lines) {
28 @words = parse_line($delim, $keep, $line);
29 return() unless (@words || !length($line));
30 push(@allwords, @words);
31 }
32 return(@allwords);
33}
a5f75d66 34
a5f75d66 35
a5f75d66 36
9b599b2a 37sub nested_quotewords {
38 my($delim, $keep, @lines) = @_;
39 my($i, @allwords);
40
41 for ($i = 0; $i < @lines; $i++) {
42 @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
43 return() unless (@{$allwords[$i]} || !length($lines[$i]));
44 }
45 return(@allwords);
2304df62 46}
47
48
a0d0e21e 49
9b599b2a 50sub parse_line {
b174585d 51 # We will be testing undef strings
db376a24 52 no warnings;
86a5040c 53 use re 'taint'; # if it's tainted, leave it as such
b174585d 54
9b599b2a 55 my($delimiter, $keep, $line) = @_;
429b060a 56 my($word, @pieces);
936c8837 57
9b599b2a 58 while (length($line)) {
429b060a 59 $line =~ s/^(["']) # a $quote
60 ((?:\\.|(?!\1)[^\\])*) # and $quoted text
61 \1 # followed by the same quote
62 | # --OR--
63 ^((?:\\.|[^\\"'])*?) # an $unquoted text
64 (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))
65 # plus EOL, delimiter, or quote
66 //xs; # extended layout
67 my($quote, $quoted, $unquoted, $delim) = ($1, $2, $3, $4);
68 return() unless( defined($quote) || length($unquoted) || length($delim));
936c8837 69
9b599b2a 70 if ($keep) {
71 $quoted = "$quote$quoted$quote";
72 }
73 else {
429b060a 74 $unquoted =~ s/\\(.)/$1/sg;
167b9ebc 75 if (defined $quote) {
429b060a 76 $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
167b9ebc 77 $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
78 }
9b599b2a 79 }
167b9ebc 80 $word .= defined $quote ? $quoted : $unquoted;
9b599b2a 81
82 if (length($delim)) {
83 push(@pieces, $word);
84 push(@pieces, $delim) if ($keep eq 'delimiters');
85 undef $word;
86 }
87 if (!length($line)) {
88 push(@pieces, $word);
2304df62 89 }
2304df62 90 }
9b599b2a 91 return(@pieces);
2304df62 92}
2304df62 93
94
9b599b2a 95
a0d0e21e 96sub old_shellwords {
97
98 # Usage:
99 # use ParseWords;
100 # @words = old_shellwords($line);
101 # or
102 # @words = old_shellwords(@lines);
103
104 local($_) = join('', @_);
105 my(@words,$snippet,$field);
106
107 s/^\s+//;
108 while ($_ ne '') {
109 $field = '';
110 for (;;) {
111 if (s/^"(([^"\\]|\\.)*)"//) {
112 ($snippet = $1) =~ s#\\(.)#$1#g;
113 }
114 elsif (/^"/) {
9b599b2a 115 return();
a0d0e21e 116 }
117 elsif (s/^'(([^'\\]|\\.)*)'//) {
118 ($snippet = $1) =~ s#\\(.)#$1#g;
119 }
120 elsif (/^'/) {
9b599b2a 121 return();
a0d0e21e 122 }
123 elsif (s/^\\(.)//) {
124 $snippet = $1;
125 }
126 elsif (s/^([^\s\\'"]+)//) {
127 $snippet = $1;
128 }
129 else {
130 s/^\s+//;
131 last;
132 }
133 $field .= $snippet;
134 }
135 push(@words, $field);
136 }
137 @words;
138}
9b599b2a 139
1401;
141
142__END__
143
144=head1 NAME
145
146Text::ParseWords - parse text into an array of tokens or array of arrays
147
148=head1 SYNOPSIS
149
150 use Text::ParseWords;
151 @lists = &nested_quotewords($delim, $keep, @lines);
152 @words = &quotewords($delim, $keep, @lines);
153 @words = &shellwords(@lines);
154 @words = &parse_line($delim, $keep, $line);
155 @words = &old_shellwords(@lines); # DEPRECATED!
156
157=head1 DESCRIPTION
158
159The &nested_quotewords() and &quotewords() functions accept a delimiter
160(which can be a regular expression)
161and a list of lines and then breaks those lines up into a list of
162words ignoring delimiters that appear inside quotes. &quotewords()
163returns all of the tokens in a single long list, while &nested_quotewords()
164returns a list of token lists corresponding to the elements of @lines.
165&parse_line() does tokenizing on a single string. The &*quotewords()
cf18bebb 166functions simply call &parse_line(), so if you're only splitting
167one line you can call &parse_line() directly and save a function
9b599b2a 168call.
169
170The $keep argument is a boolean flag. If true, then the tokens are
171split on the specified delimiter, but all other characters (quotes,
172backslashes, etc.) are kept in the tokens. If $keep is false then the
173&*quotewords() functions remove all quotes and backslashes that are
174not themselves backslash-escaped or inside of single quotes (i.e.,
175&quotewords() tries to interpret these characters just like the Bourne
176shell). NB: these semantics are significantly different from the
177original version of this module shipped with Perl 5.000 through 5.004.
178As an additional feature, $keep may be the keyword "delimiters" which
179causes the functions to preserve the delimiters in each string as
180tokens in the token lists, in addition to preserving quote and
181backslash characters.
182
183&shellwords() is written as a special case of &quotewords(), and it
184does token parsing with whitespace as a delimiter-- similar to most
185Unix shells.
186
187=head1 EXAMPLES
188
189The sample program:
190
191 use Text::ParseWords;
192 @words = &quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
193 $i = 0;
194 foreach (@words) {
195 print "$i: <$_>\n";
196 $i++;
197 }
198
199produces:
200
201 0: <this>
202 1: <is>
203 2: <a test>
204 3: <of quotewords>
205 4: <"for>
206 5: <you>
207
208demonstrating:
209
210=over 4
211
212=item 0
551e1d92 213
9b599b2a 214a simple word
215
216=item 1
551e1d92 217
9b599b2a 218multiple spaces are skipped because of our $delim
219
220=item 2
551e1d92 221
9b599b2a 222use of quotes to include a space in a word
223
224=item 3
551e1d92 225
9b599b2a 226use of a backslash to include a space in a word
227
228=item 4
551e1d92 229
9b599b2a 230use of a backslash to remove the special meaning of a double-quote
231
232=item 5
551e1d92 233
9b599b2a 234another simple word (note the lack of effect of the
235backslashed double-quote)
236
237=back
238
239Replacing C<&quotewords('\s+', 0, q{this is...})>
240with C<&shellwords(q{this is...})>
241is a simpler way to accomplish the same thing.
242
243=head1 AUTHORS
244
245Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
246author unknown). Much of the code for &parse_line() (including the
247primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
248
249Examples section another documentation provided by John Heidemann
250<johnh@ISI.EDU>
251
252Bug reports, patches, and nagging provided by lots of folks-- thanks
253everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
254for assuring me that a &nested_quotewords() would be useful, and to
255Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
256error-checking (sort of-- you had to be there).
257
258=cut