Commit | Line | Data |
a0d0e21e |
1 | package Text::ParseWords; |
2 | |
3 | require 5.000; |
4 | require Exporter; |
5 | require AutoLoader; |
6 | use Carp; |
7 | |
8 | @ISA = qw(Exporter AutoLoader); |
9 | @EXPORT = qw(shellwords quotewords); |
10 | @EXPORT_OK = qw(old_shellwords); |
11 | |
a5f75d66 |
12 | =head1 NAME |
2304df62 |
13 | |
a5f75d66 |
14 | Text::ParseWords - parse text into an array of tokens |
a0d0e21e |
15 | |
a5f75d66 |
16 | =head1 SYNOPSIS |
a0d0e21e |
17 | |
a5f75d66 |
18 | use Text::ParseWords; |
19 | @words = "ewords($delim, $keep, @lines); |
20 | @words = &shellwords(@lines); |
21 | @words = &old_shellwords(@lines); |
22 | |
23 | =head1 DESCRIPTION |
24 | |
25 | "ewords() accepts a delimiter (which can be a regular expression) |
26 | and a list of lines and then breaks those lines up into a list of |
27 | words ignoring delimiters that appear inside quotes. |
28 | |
29 | The $keep argument is a boolean flag. If true, the quotes are kept |
30 | with each word, otherwise quotes are stripped in the splitting process. |
31 | $keep also defines whether unprotected backslashes are retained. |
32 | |
33 | A &shellwords() replacement is included to demonstrate the new package. |
34 | This version differs from the original in that it will _NOT_ default |
35 | to using $_ if no arguments are given. I personally find the old behavior |
36 | to be a mis-feature. |
37 | |
38 | |
39 | "ewords() works by simply jamming all of @lines into a single |
40 | string in $_ and then pulling off words a bit at a time until $_ |
41 | is exhausted. |
42 | |
43 | The inner "for" loop builds up each word (or $field) one $snippet |
44 | at a time. A $snippet is a quoted string, a backslashed character, |
45 | or an unquoted string. We fall out of the "for" loop when we reach |
46 | the end of $_ or when we hit a delimiter. Falling out of the "for" |
47 | loop, we push the $field we've been building up onto the list of |
48 | @words we'll be returning, and then loop back and pull another word |
49 | off of $_. |
50 | |
51 | The first two cases inside the "for" loop deal with quoted strings. |
52 | The first case matches a double quoted string, removes it from $_, |
53 | and assigns the double quoted string to $snippet in the body of the |
54 | conditional. The second case handles single quoted strings. In |
55 | the third case we've found a quote at the current beginning of $_, |
56 | but it didn't match the quoted string regexps in the first two cases, |
57 | so it must be an unbalanced quote and we croak with an error (which can |
58 | be caught by eval()). |
a0d0e21e |
59 | |
a5f75d66 |
60 | The next case handles backslashed characters, and the next case is the |
61 | exit case on reaching the end of the string or finding a delimiter. |
2304df62 |
62 | |
a5f75d66 |
63 | Otherwise, we've found an unquoted thing and we pull of characters one |
64 | at a time until we reach something that could start another $snippet-- |
65 | a quote of some sort, a backslash, or the delimiter. This one character |
66 | at a time behavior was necessary if the delimiter was going to be a |
67 | regexp (love to hear it if you can figure out a better way). |
68 | |
69 | =head1 AUTHORS |
70 | |
71 | Hal Pomeranz (pomeranz@netcom.com), 23 March 1994 |
72 | |
73 | Basically an update and generalization of the old shellwords.pl. |
74 | Much code shamelessly stolen from the old version (author unknown). |
75 | |
76 | =cut |
77 | |
78 | 1; |
79 | __END__ |
80 | |
81 | sub shellwords { |
2304df62 |
82 | local(@lines) = @_; |
83 | $lines[$#lines] =~ s/\s+$//; |
a0d0e21e |
84 | "ewords('\s+', 0, @lines); |
2304df62 |
85 | } |
86 | |
87 | |
a0d0e21e |
88 | |
89 | sub quotewords { |
2304df62 |
90 | local($delim, $keep, @lines) = @_; |
91 | local(@words,$snippet,$field,$_); |
92 | |
93 | $_ = join('', @lines); |
94 | while ($_) { |
95 | $field = ''; |
96 | for (;;) { |
97 | $snippet = ''; |
98 | if (s/^"(([^"\\]|\\[\\"])*)"//) { |
99 | $snippet = $1; |
100 | $snippet = "\"$snippet\"" if ($keep); |
101 | } |
102 | elsif (s/^'(([^'\\]|\\[\\'])*)'//) { |
103 | $snippet = $1; |
104 | $snippet = "'$snippet'" if ($keep); |
105 | } |
106 | elsif (/^["']/) { |
a0d0e21e |
107 | croak "Unmatched quote"; |
2304df62 |
108 | } |
109 | elsif (s/^\\(.)//) { |
110 | $snippet = $1; |
111 | $snippet = "\\$snippet" if ($keep); |
112 | } |
113 | elsif (!$_ || s/^$delim//) { |
114 | last; |
115 | } |
116 | else { |
117 | while ($_ && !(/^$delim/ || /^['"\\]/)) { |
118 | $snippet .= substr($_, 0, 1); |
119 | substr($_, 0, 1) = ''; |
120 | } |
121 | } |
122 | $field .= $snippet; |
123 | } |
124 | push(@words, $field); |
125 | } |
126 | @words; |
127 | } |
2304df62 |
128 | |
129 | |
a0d0e21e |
130 | sub old_shellwords { |
131 | |
132 | # Usage: |
133 | # use ParseWords; |
134 | # @words = old_shellwords($line); |
135 | # or |
136 | # @words = old_shellwords(@lines); |
137 | |
138 | local($_) = join('', @_); |
139 | my(@words,$snippet,$field); |
140 | |
141 | s/^\s+//; |
142 | while ($_ ne '') { |
143 | $field = ''; |
144 | for (;;) { |
145 | if (s/^"(([^"\\]|\\.)*)"//) { |
146 | ($snippet = $1) =~ s#\\(.)#$1#g; |
147 | } |
148 | elsif (/^"/) { |
149 | croak "Unmatched double quote: $_"; |
150 | } |
151 | elsif (s/^'(([^'\\]|\\.)*)'//) { |
152 | ($snippet = $1) =~ s#\\(.)#$1#g; |
153 | } |
154 | elsif (/^'/) { |
155 | croak "Unmatched single quote: $_"; |
156 | } |
157 | elsif (s/^\\(.)//) { |
158 | $snippet = $1; |
159 | } |
160 | elsif (s/^([^\s\\'"]+)//) { |
161 | $snippet = $1; |
162 | } |
163 | else { |
164 | s/^\s+//; |
165 | last; |
166 | } |
167 | $field .= $snippet; |
168 | } |
169 | push(@words, $field); |
170 | } |
171 | @words; |
172 | } |