inc version (for CPAN.pm sanity)
[p5sagit/p5-mst-13.2.git] / lib / Text / ParseWords.pm
CommitLineData
package Text::ParseWords;

# Package globals are predeclared with "use vars" (rather than "our") so the
# module still loads on the old perls permitted by the "require" below.
# @EXPORT_OK is listed alongside the others so that every global assigned in
# this preamble is declared.  $PERL_SINGLE_QUOTE is a user-settable switch
# read by parse_line(): when true, \\ and \' are unescaped inside
# single-quoted tokens.
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $PERL_SINGLE_QUOTE);
$VERSION = "3.2";

require 5.000;

use Exporter;
@ISA = qw(Exporter);

# Exported by default.
@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);

# Exported only on request.
@EXPORT_OK = qw(old_shellwords);
12
a5f75d66 13
# shellwords(@lines)
#
# Split @lines into words on runs of whitespace by delegating to
# quotewords('\s+', 0, @lines), i.e. quotes and backslashes are interpreted
# and removed from the returned words.
#
# Trailing whitespace is stripped from the last line first, so that a
# trailing delimiter there does not produce an extra trailing token.
#
# Returns the list of words (whatever quotewords() returns), or the empty
# list when called without any lines.
sub shellwords {
    # Work on a private lexical copy; the caller's strings are not modified.
    my @lines = @_;

    # Nothing to split.  Without this guard the substitution below would
    # address element -1 of an empty array, which is a fatal error.
    return() unless @lines;

    $lines[-1] =~ s/\s+$//;
    return(quotewords('\s+', 0, @lines));
}
a5f75d66 19
a5f75d66 20
a5f75d66 21
# quotewords($delim, $keep, @lines)
#
# Tokenize every line with parse_line($delim, $keep, $line) and return all
# tokens as one flat list, in line order.
#
# If parse_line() yields no tokens for a line that is not empty, the line is
# treated as unparseable and the empty list is returned for the whole call.
sub quotewords {
    my ($delim, $keep, @lines) = @_;
    my @collected;

    for my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # No tokens out of a non-empty line means the parse failed.
        if (!@tokens && length($text)) {
            return();
        }

        push @collected, @tokens;
    }

    return(@collected);
}
a5f75d66 34
a5f75d66 35
a5f75d66 36
# nested_quotewords($delim, $keep, @lines)
#
# Tokenize every line with parse_line($delim, $keep, $line) and return one
# array reference per input line, each holding that line's tokens.
#
# If parse_line() yields no tokens for a line that is not empty, the line is
# treated as unparseable and the empty list is returned for the whole call.
sub nested_quotewords {
    my ($delim, $keep, @lines) = @_;
    my @token_lists;

    foreach my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # No tokens out of a non-empty line means the parse failed.
        return() if !@tokens && length($text);

        push @token_lists, [ @tokens ];
    }

    return(@token_lists);
}
47
48
a0d0e21e 49
# parse_line($delimiter, $keep, $line)
#
# Split a single string into tokens.
#
#   $delimiter  regular expression (as a string) that separates tokens;
#               delimiters inside quotes or preceded by a backslash do not
#               split.
#   $keep       false: quotes are removed and backslash escapes resolved
#                      (inside double quotes and in unquoted text; inside
#                      single quotes only when $PERL_SINGLE_QUOTE is true,
#                      and then only for \\ and \').
#               true:  quotes and backslashes are left in the tokens.
#               the string 'delimiters': as true, and each matched
#                      delimiter is also returned as its own token.
#   $line       the string to tokenize.
#
# Returns the list of tokens.  A delimiter at the very end of the line is
# followed by one undef element.  Returns the empty list when the line
# cannot be parsed (e.g. an unbalanced quote) or is empty.
sub parse_line {
    # We will be testing undef strings
    local($^W) = 0;

    my($delimiter, $keep, $line) = @_;
    my($quote, $quoted, $unquoted, $delim, $word, @pieces);

    while (length($line)) {

        # Peel one piece off the front of the line: either a complete quoted
        # string, or a run of unquoted text up to a delimiter, an opening
        # quote, or the end of the line.  The remainder is matched with
        # (?s:.*) so that it covers every character, including newlines and
        # characters above 0xFF; a [\000-\377] class would stop at the first
        # wide character and silently drop the rest of the input.
        ($quote, $quoted, undef, $unquoted, $delim, undef) =
            $line =~ m/^(["'])                      # an opening quote
                        ((?:\\.|(?!\1)[^\\])*)      # and the quoted text
                        \1                          # followed by the same quote
                        ((?s:.*))                   # and the rest
                       |                            # --OR--
                       ^((?:\\.|[^\\"'])*?)         # some unquoted text
                        (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))
                                                    # plus EOL, delimiter, or quote
                        ((?s:.*))                   # and the rest
                      /x;                           # extended layout

        # Nothing matched at all (all captures undef): unparseable input.
        return() unless( $quote || length($unquoted) || length($delim));

        # The remainder is always the last capture group that took part in
        # the match.  Using $+ instead of a fixed position keeps this correct
        # even when the caller's delimiter contains capture groups of its own.
        $line = $+;

        if ($keep) {
            # Put the quotes back; this is the empty string for unquoted text.
            $quoted = "$quote$quoted$quote";
        }
        else {
            $unquoted =~ s/\\(.)/$1/g;
            if (defined $quote) {
                $quoted =~ s/\\(.)/$1/g if ($quote eq '"');
                $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
            }
        }

        # Adjacent quoted and unquoted pieces accumulate into one token.
        $word .= defined $quote ? $quoted : $unquoted;

        if (length($delim)) {
            # A delimiter ends the current token.
            push(@pieces, $word);
            push(@pieces, $delim) if ($keep eq 'delimiters');
            undef $word;
        }
        if (!length($line)) {
            # End of input: flush whatever is pending (undef directly after
            # a trailing delimiter).
            push(@pieces, $word);
        }
    }
    return(@pieces);
}
2304df62 97
98
9b599b2a 99
# old_shellwords(@lines)    -- DEPRECATED
#
# Usage:
#     use Text::ParseWords qw(old_shellwords);
#     @words = old_shellwords($line);
#   or
#     @words = old_shellwords(@lines);
#
# Concatenates its arguments and splits the result into words on
# whitespace.  Double- and single-quoted sections and backslash-escaped
# characters become part of the current word, with the quotes and the
# escaping backslashes removed.
#
# Returns the list of words, or the empty list if the input cannot be
# parsed: an unbalanced quote, or a backslash with nothing to escape after
# it (end of input or a newline).
sub old_shellwords {
    my $text = join('', @_);
    my(@words, $snippet, $field);

    $text =~ s/^\s+//;
    while ($text ne '') {
        # Remember how much input is left so a stalled parse can be detected.
        my $remaining = length($text);

        $field = '';
        for (;;) {
            if ($text =~ s/^"(([^"\\]|\\.)*)"//) {
                ($snippet = $1) =~ s#\\(.)#$1#g;
            }
            elsif ($text =~ /^"/) {
                # Opening double quote without a matching close.
                return();
            }
            elsif ($text =~ s/^'(([^'\\]|\\.)*)'//) {
                ($snippet = $1) =~ s#\\(.)#$1#g;
            }
            elsif ($text =~ /^'/) {
                # Opening single quote without a matching close.
                return();
            }
            elsif ($text =~ s/^\\(.)//) {
                $snippet = $1;
            }
            elsif ($text =~ s/^([^\s\\'"]+)//) {
                $snippet = $1;
            }
            else {
                # End of the current word: skip the separating whitespace.
                $text =~ s/^\s+//;
                last;
            }
            $field .= $snippet;
        }

        # Every pass must consume something.  If nothing was removed, the
        # input starts with a backslash that none of the rules above accept;
        # bail out instead of pushing empty words forever.
        return() if length($text) == $remaining;

        push(@words, $field);
    }
    @words;
}
9b599b2a 143
1441;
145
146__END__
147
148=head1 NAME
149
150Text::ParseWords - parse text into an array of tokens or array of arrays
151
152=head1 SYNOPSIS
153
154 use Text::ParseWords;
155 @lists = &nested_quotewords($delim, $keep, @lines);
156 @words = &quotewords($delim, $keep, @lines);
157 @words = &shellwords(@lines);
158 @words = &parse_line($delim, $keep, $line);
159 @words = &old_shellwords(@lines); # DEPRECATED!
160
161=head1 DESCRIPTION
162
163The &nested_quotewords() and &quotewords() functions accept a delimiter
164(which can be a regular expression)
165and a list of lines and then break those lines up into a list of
166words ignoring delimiters that appear inside quotes. &quotewords()
167returns all of the tokens in a single long list, while &nested_quotewords()
168returns a list of token lists corresponding to the elements of @lines.
169&parse_line() does tokenizing on a single string. The &*quotewords()
170functions simply call &parse_line(), so if you're only splitting
171one line you can call &parse_line() directly and save a function
172call.
173
174The $keep argument is a boolean flag. If true, then the tokens are
175split on the specified delimiter, but all other characters (quotes,
176backslashes, etc.) are kept in the tokens. If $keep is false then the
177&*quotewords() functions remove all quotes and backslashes that are
178not themselves backslash-escaped or inside of single quotes (i.e.,
179&quotewords() tries to interpret these characters just like the Bourne
180shell). NB: these semantics are significantly different from the
181original version of this module shipped with Perl 5.000 through 5.004.
182As an additional feature, $keep may be the keyword "delimiters" which
183causes the functions to preserve the delimiters in each string as
184tokens in the token lists, in addition to preserving quote and
185backslash characters.
186
187&shellwords() is written as a special case of &quotewords(), and it
188does token parsing with whitespace as a delimiter-- similar to most
189Unix shells.
190
191=head1 EXAMPLES
192
193The sample program:
194
195 use Text::ParseWords;
196 @words = &quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
197 $i = 0;
198 foreach (@words) {
199 print "$i: <$_>\n";
200 $i++;
201 }
202
203produces:
204
205 0: <this>
206 1: <is>
207 2: <a test>
208 3: <of quotewords>
209 4: <"for>
210 5: <you>
211
212demonstrating:
213
214=over 4
215
216=item 0
217a simple word
218
219=item 1
220multiple spaces are skipped because of our $delim
221
222=item 2
223use of quotes to include a space in a word
224
225=item 3
226use of a backslash to include a space in a word
227
228=item 4
229use of a backslash to remove the special meaning of a double-quote
230
231=item 5
232another simple word (note the lack of effect of the
233backslashed double-quote)
234
235=back
236
237Replacing C<&quotewords('\s+', 0, q{this is...})>
238with C<&shellwords(q{this is...})>
239is a simpler way to accomplish the same thing.
240
241=head1 AUTHORS
242
243Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
244author unknown). Much of the code for &parse_line() (including the
245primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
246
247Examples section and other documentation provided by John Heidemann
248<johnh@ISI.EDU>
249
250Bug reports, patches, and nagging provided by lots of folks-- thanks
251everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
252for assuring me that a &nested_quotewords() would be useful, and to
253Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
254error-checking (sort of-- you had to be there).
255
256=cut