manual integration of all outstanding ansi branch stuff into mainline
[p5sagit/p5-mst-13.2.git] / lib / Text / ParseWords.pm
CommitLineData
package Text::ParseWords;

# Package globals.  Every variable assigned below is declared here,
# including @EXPORT_OK, so that none of them is an undeclared global.
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
$VERSION = "3.0";

require 5.000;

# Exporter supplies import(); it reads @EXPORT for the names exported by
# default and @EXPORT_OK for the names exported only on request.
use Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
@EXPORT_OK = qw(old_shellwords);
12
a5f75d66 13
# shellwords(@lines)
#
# Split the given lines into words using whitespace as the delimiter,
# with quotes and backslashes removed (it calls quotewords() with a
# delimiter of '\s+' and a false $keep flag).
#
# Returns the list of words, or an empty list when called without
# arguments or when quotewords() returns nothing.
sub shellwords {
    # A lexical copy: the caller's strings are not modified, and no
    # package variable is involved.
    my(@lines) = @_;

    # With no input there is no last element to trim; indexing
    # $lines[$#lines] on an empty array would be a fatal error.
    return() unless @lines;

    # Drop trailing whitespace from the final line so that it does not
    # count as a delimiter followed by one more (undefined) word.
    $lines[$#lines] =~ s/\s+$//;
    return(quotewords('\s+', 0, @lines));
}
a5f75d66 19
a5f75d66 20
a5f75d66 21
# quotewords($delim, $keep, @lines)
#
# Run parse_line() over each line in turn and return every token in one
# flat list.  If a non-empty line produces no tokens, the whole call
# gives up and returns an empty list.
sub quotewords {
    my ($delim, $keep, @lines) = @_;
    my @collected;

    for my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # An empty token list is acceptable only for an empty line.
        if (!@tokens && length($text)) {
            return ();
        }
        push @collected, @tokens;
    }

    return @collected;
}
a5f75d66 34
a5f75d66 35
a5f75d66 36
# nested_quotewords($delim, $keep, @lines)
#
# Run parse_line() over each line and return a list of array
# references, one per input line, each holding that line's tokens.
# If a non-empty line produces no tokens, the whole call gives up and
# returns an empty list.
sub nested_quotewords {
    my ($delim, $keep, @lines) = @_;
    my @rows;

    for my $idx (0 .. $#lines) {
        my @tokens = parse_line($delim, $keep, $lines[$idx]);

        # An empty token list is acceptable only for an empty line.
        return () if !@tokens && length($lines[$idx]);

        $rows[$idx] = [@tokens];
    }

    return @rows;
}
47
48
a0d0e21e 49
# parse_line($delimiter, $keep, $line)
#
# Split one string into tokens.
#
#   $delimiter  regular expression (as a string) that separates tokens
#   $keep       false: strip quotes, and strip backslashes from unquoted
#                      and double-quoted text;
#               true:  leave quotes and backslashes in the tokens;
#               the string 'delimiters': as true, and additionally
#                      return each matched delimiter as its own token
#   $line       the text to split
#
# Returns the list of tokens.  Returns an empty list if the text cannot
# be parsed (for example an unterminated quote) or if a step would
# consume nothing.  When the text ends with a delimiter, the final
# element of the returned list is undef.
sub parse_line {
    my($delimiter, $keep, $line) = @_;
    my($quote, $quoted, $unquoted, $delim, $word, @pieces);

    while (length($line)) {
        # Remove one leading chunk from the line.  Using the result of
        # the substitution itself to detect failure avoids relying on
        # the match variables, which keep the values of the previous
        # successful match when the current one fails.
        #
        # Inside the quoted alternative the closing quote is excluded
        # with a lookahead, because a backreference is not recognised
        # inside a character class.  The caller's delimiter is wrapped
        # so that whitespace in it is matched literally.
        $line =~ s/^(?:
                       (["'])                    # an opening quote
                       ((?:\\.|(?!\1)[^\\])*)    # and the quoted text
                       \1                        # followed by the same quote
                   |                             # --OR--
                       ((?:\\.|[^\\"'])*?)       # an unquoted text
                       (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))
                                                 # plus EOL, delimiter, or quote
                   )//xs
            or return();
        ($quote, $quoted, $unquoted, $delim) = ($1, $2, $3, $4);

        # A successful but empty match (possible when the delimiter
        # pattern can match the empty string) would never make progress.
        return() unless (defined($quote) || length($unquoted) || length($delim));

        if (defined($quote)) {
            if ($keep) {
                $quoted = "$quote$quoted$quote";
            }
            elsif ($quote eq '"') {
                # Backslash escapes are only processed in double quotes.
                $quoted =~ s/\\(.)/$1/sg;
            }
            $word .= $quoted;
        }
        else {
            $unquoted =~ s/\\(.)/$1/sg unless ($keep);
            $word .= $unquoted;
        }

        # A non-empty delimiter ends the current word.  A quote
        # boundary or end of line matches with an empty delimiter, so
        # adjacent quoted and unquoted pieces are joined into one word.
        if (defined($delim) && length($delim)) {
            push(@pieces, $word);
            push(@pieces, $delim) if (defined($keep) && $keep eq 'delimiters');
            undef $word;
        }
        if (!length($line)) {
            push(@pieces, $word);
        }
    }
    return(@pieces);
}
2304df62 88
89
9b599b2a 90
# old_shellwords(@lines)
#
# Usage:
#     use Text::ParseWords qw(old_shellwords);
#     @words = old_shellwords($line);
#     or
#     @words = old_shellwords(@lines);
#
# The arguments are concatenated and split into whitespace-separated
# words.  Double- and single-quoted sections may contain whitespace; a
# backslash makes the following character literal (inside either kind
# of quote as well as outside).  Returns the list of words, or an
# empty list if a quote is left unterminated.
sub old_shellwords {
    local($_) = join('', @_);
    my(@words,$snippet,$field);

    s/^\s+//;
    while ($_ ne '') {
        $field = '';
        # Accumulate adjacent pieces (quoted, escaped, plain) into one
        # field until whitespace or the end of the text is reached.
        for (;;) {
            if (s/^"(([^"\\]|\\.)*)"//) {
                ($snippet = $1) =~ s#\\(.)#$1#g;
            }
            elsif (/^"/) {
                # Opening double quote with no matching close.
                return();
            }
            elsif (s/^'(([^'\\]|\\.)*)'//) {
                ($snippet = $1) =~ s#\\(.)#$1#g;
            }
            elsif (/^'/) {
                # Opening single quote with no matching close.
                return();
            }
            elsif (s/^\\(.?)//s) {
                # The escaped character is optional and may be a
                # newline, so a backslash at the very end of the text
                # is still consumed.  Otherwise no branch would match
                # it and the outer loop would never terminate.
                $snippet = $1;
            }
            elsif (s/^([^\s\\'"]+)//) {
                $snippet = $1;
            }
            else {
                s/^\s+//;
                last;
            }
            $field .= $snippet;
        }
        push(@words, $field);
    }
    @words;
}
9b599b2a 134
1351;
136
137__END__
138
139=head1 NAME
140
141Text::ParseWords - parse text into an array of tokens or array of arrays
142
143=head1 SYNOPSIS
144
145 use Text::ParseWords;
146 @lists = &nested_quotewords($delim, $keep, @lines);
147 @words = &quotewords($delim, $keep, @lines);
148 @words = &shellwords(@lines);
149 @words = &parse_line($delim, $keep, $line);
150 @words = &old_shellwords(@lines); # DEPRECATED!
151
152=head1 DESCRIPTION
153
The &nested_quotewords() and &quotewords() functions accept a delimiter
(which can be a regular expression)
and a list of lines and then break those lines up into a list of
words ignoring delimiters that appear inside quotes.  &quotewords()
returns all of the tokens in a single long list, while &nested_quotewords()
returns a list of token lists corresponding to the elements of @lines.
&parse_line() does tokenizing on a single string.  The &*quotewords()
functions simply call &parse_line(), so if you're only splitting
one line you can call &parse_line() directly and save a function
call.
164
165The $keep argument is a boolean flag. If true, then the tokens are
166split on the specified delimiter, but all other characters (quotes,
167backslashes, etc.) are kept in the tokens. If $keep is false then the
168&*quotewords() functions remove all quotes and backslashes that are
169not themselves backslash-escaped or inside of single quotes (i.e.,
170&quotewords() tries to interpret these characters just like the Bourne
171shell). NB: these semantics are significantly different from the
172original version of this module shipped with Perl 5.000 through 5.004.
173As an additional feature, $keep may be the keyword "delimiters" which
174causes the functions to preserve the delimiters in each string as
175tokens in the token lists, in addition to preserving quote and
176backslash characters.
177
178&shellwords() is written as a special case of &quotewords(), and it
179does token parsing with whitespace as a delimiter-- similar to most
180Unix shells.
181
182=head1 EXAMPLES
183
184The sample program:
185
    use Text::ParseWords;
    @words = &quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
    $i = 0;
    foreach (@words) {
        print "$i: <$_>\n";
        $i++;
    }
193
194produces:
195
196 0: <this>
197 1: <is>
198 2: <a test>
199 3: <of quotewords>
200 4: <"for>
201 5: <you>
202
203demonstrating:
204
205=over 4
206
207=item 0
208a simple word
209
210=item 1
211multiple spaces are skipped because of our $delim
212
213=item 2
214use of quotes to include a space in a word
215
216=item 3
217use of a backslash to include a space in a word
218
219=item 4
220use of a backslash to remove the special meaning of a double-quote
221
222=item 5
223another simple word (note the lack of effect of the
224backslashed double-quote)
225
226=back
227
228Replacing C<&quotewords('\s+', 0, q{this is...})>
229with C<&shellwords(q{this is...})>
230is a simpler way to accomplish the same thing.
231
232=head1 AUTHORS
233
234Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
235author unknown). Much of the code for &parse_line() (including the
236primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
237
Examples section and other documentation provided by John Heidemann
<johnh@ISI.EDU>
240
241Bug reports, patches, and nagging provided by lots of folks-- thanks
242everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
243for assuring me that a &nested_quotewords() would be useful, and to
244Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
245error-checking (sort of-- you had to be there).
246
247=cut