Replace a call to utf8::encode by a pack/unpack combination,
[p5sagit/p5-mst-13.2.git] / lib / Text / ParseWords.pm
CommitLineData
a0d0e21e 1package Text::ParseWords;
2
b174585d 3use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE);
d5c14ab2 4$VERSION = "3.25";
a0d0e21e 5
9b599b2a 6require 5.000;
dc848c6f 7
9b599b2a 8use Exporter;
dc848c6f 9@ISA = qw(Exporter);
9b599b2a 10@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
a0d0e21e 11@EXPORT_OK = qw(old_shellwords);
12
a5f75d66 13
9b599b2a 14sub shellwords {
d5c14ab2 15 my (@lines) = @_;
16 my @allwords;
17
18 foreach my $line (@lines) {
19 $line =~ s/^\s+//;
20 my @words = parse_line('\s+', 0, $line);
21 pop @words if (@words and !defined $words[-1]);
22 return() unless (@words || !length($line));
23 push(@allwords, @words);
24 }
25 return(@allwords);
9b599b2a 26}
a5f75d66 27
a5f75d66 28
a5f75d66 29
9b599b2a 30sub quotewords {
31 my($delim, $keep, @lines) = @_;
32 my($line, @words, @allwords);
9b599b2a 33
34 foreach $line (@lines) {
35 @words = parse_line($delim, $keep, $line);
36 return() unless (@words || !length($line));
37 push(@allwords, @words);
38 }
39 return(@allwords);
40}
a5f75d66 41
a5f75d66 42
a5f75d66 43
9b599b2a 44sub nested_quotewords {
45 my($delim, $keep, @lines) = @_;
46 my($i, @allwords);
6a724e38 47
9b599b2a 48 for ($i = 0; $i < @lines; $i++) {
49 @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
50 return() unless (@{$allwords[$i]} || !length($lines[$i]));
51 }
52 return(@allwords);
2304df62 53}
54
55
a0d0e21e 56
9b599b2a 57sub parse_line {
58 my($delimiter, $keep, $line) = @_;
429b060a 59 my($word, @pieces);
936c8837 60
6a724e38 61 no warnings 'uninitialized'; # we will be testing undef strings
62
9b599b2a 63 while (length($line)) {
429b060a 64 $line =~ s/^(["']) # a $quote
65 ((?:\\.|(?!\1)[^\\])*) # and $quoted text
66 \1 # followed by the same quote
67 | # --OR--
68 ^((?:\\.|[^\\"'])*?) # an $unquoted text
69 (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))
70 # plus EOL, delimiter, or quote
30799d55 71 //xs or return; # extended layout
429b060a 72 my($quote, $quoted, $unquoted, $delim) = ($1, $2, $3, $4);
73 return() unless( defined($quote) || length($unquoted) || length($delim));
936c8837 74
9b599b2a 75 if ($keep) {
76 $quoted = "$quote$quoted$quote";
77 }
78 else {
429b060a 79 $unquoted =~ s/\\(.)/$1/sg;
167b9ebc 80 if (defined $quote) {
429b060a 81 $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
167b9ebc 82 $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
83 }
9b599b2a 84 }
6a724e38 85 $word .= substr($line, 0, 0); # leave results tainted
167b9ebc 86 $word .= defined $quote ? $quoted : $unquoted;
9b599b2a 87
88 if (length($delim)) {
89 push(@pieces, $word);
90 push(@pieces, $delim) if ($keep eq 'delimiters');
91 undef $word;
92 }
93 if (!length($line)) {
94 push(@pieces, $word);
2304df62 95 }
2304df62 96 }
9b599b2a 97 return(@pieces);
2304df62 98}
2304df62 99
100
9b599b2a 101
a0d0e21e 102sub old_shellwords {
103
104 # Usage:
105 # use ParseWords;
106 # @words = old_shellwords($line);
107 # or
108 # @words = old_shellwords(@lines);
6a724e38 109 # or
110 # @words = old_shellwords(); # defaults to $_ (and clobbers it)
a0d0e21e 111
6a724e38 112 no warnings 'uninitialized'; # we will be testing undef strings
113 local *_ = \join('', @_) if @_;
114 my (@words, $snippet);
a0d0e21e 115
6a724e38 116 s/\A\s+//;
a0d0e21e 117 while ($_ ne '') {
6a724e38 118 my $field = substr($_, 0, 0); # leave results tainted
a0d0e21e 119 for (;;) {
6a724e38 120 if (s/\A"(([^"\\]|\\.)*)"//s) {
121 ($snippet = $1) =~ s#\\(.)#$1#sg;
a0d0e21e 122 }
6a724e38 123 elsif (/\A"/) {
124 require Carp;
125 Carp::carp("Unmatched double quote: $_");
9b599b2a 126 return();
a0d0e21e 127 }
6a724e38 128 elsif (s/\A'(([^'\\]|\\.)*)'//s) {
129 ($snippet = $1) =~ s#\\(.)#$1#sg;
a0d0e21e 130 }
6a724e38 131 elsif (/\A'/) {
132 require Carp;
133 Carp::carp("Unmatched single quote: $_");
9b599b2a 134 return();
a0d0e21e 135 }
9983eac8 136 elsif (s/\A\\(.?)//s) {
a0d0e21e 137 $snippet = $1;
138 }
6a724e38 139 elsif (s/\A([^\s\\'"]+)//) {
a0d0e21e 140 $snippet = $1;
141 }
142 else {
6a724e38 143 s/\A\s+//;
a0d0e21e 144 last;
145 }
146 $field .= $snippet;
147 }
148 push(@words, $field);
149 }
6a724e38 150 return @words;
a0d0e21e 151}
9b599b2a 152
1531;
154
155__END__
156
157=head1 NAME
158
159Text::ParseWords - parse text into an array of tokens or array of arrays
160
161=head1 SYNOPSIS
162
163 use Text::ParseWords;
164 @lists = &nested_quotewords($delim, $keep, @lines);
165 @words = &quotewords($delim, $keep, @lines);
166 @words = &shellwords(@lines);
167 @words = &parse_line($delim, $keep, $line);
168 @words = &old_shellwords(@lines); # DEPRECATED!
169
170=head1 DESCRIPTION
171
172The &nested_quotewords() and &quotewords() functions accept a delimiter
173(which can be a regular expression)
174and a list of lines and then breaks those lines up into a list of
175words ignoring delimiters that appear inside quotes. &quotewords()
176returns all of the tokens in a single long list, while &nested_quotewords()
177returns a list of token lists corresponding to the elements of @lines.
178&parse_line() does tokenizing on a single string. The &*quotewords()
cf18bebb 179functions simply call &parse_line(), so if you're only splitting
180one line you can call &parse_line() directly and save a function
9b599b2a 181call.
182
183The $keep argument is a boolean flag. If true, then the tokens are
184split on the specified delimiter, but all other characters (quotes,
185backslashes, etc.) are kept in the tokens. If $keep is false then the
186&*quotewords() functions remove all quotes and backslashes that are
187not themselves backslash-escaped or inside of single quotes (i.e.,
188&quotewords() tries to interpret these characters just like the Bourne
189shell). NB: these semantics are significantly different from the
190original version of this module shipped with Perl 5.000 through 5.004.
191As an additional feature, $keep may be the keyword "delimiters" which
192causes the functions to preserve the delimiters in each string as
193tokens in the token lists, in addition to preserving quote and
194backslash characters.
195
196&shellwords() is written as a special case of &quotewords(), and it
197does token parsing with whitespace as a delimiter-- similar to most
198Unix shells.
199
200=head1 EXAMPLES
201
202The sample program:
203
204 use Text::ParseWords;
205 @words = &quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
206 $i = 0;
207 foreach (@words) {
208 print "$i: <$_>\n";
209 $i++;
210 }
211
212produces:
213
214 0: <this>
215 1: <is>
216 2: <a test>
217 3: <of quotewords>
218 4: <"for>
219 5: <you>
220
221demonstrating:
222
223=over 4
224
225=item 0
551e1d92 226
9b599b2a 227a simple word
228
229=item 1
551e1d92 230
9b599b2a 231multiple spaces are skipped because of our $delim
232
233=item 2
551e1d92 234
9b599b2a 235use of quotes to include a space in a word
236
237=item 3
551e1d92 238
9b599b2a 239use of a backslash to include a space in a word
240
241=item 4
551e1d92 242
9b599b2a 243use of a backslash to remove the special meaning of a double-quote
244
245=item 5
551e1d92 246
9b599b2a 247another simple word (note the lack of effect of the
248backslashed double-quote)
249
250=back
251
252Replacing C<&quotewords('\s+', 0, q{this is...})>
253with C<&shellwords(q{this is...})>
254is a simpler way to accomplish the same thing.
255
256=head1 AUTHORS
257
258Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
259author unknown). Much of the code for &parse_line() (including the
260primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
261
262Examples section another documentation provided by John Heidemann
263<johnh@ISI.EDU>
264
265Bug reports, patches, and nagging provided by lots of folks-- thanks
266everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
267for assuring me that a &nested_quotewords() would be useful, and to
268Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
269error-checking (sort of-- you had to be there).
270
271=cut