lib/Text/ParseWords.pm

   package Text::ParseWords;

   # Package variables used below.  @EXPORT_OK is declared together with the
   # others so that every package variable assigned in this header is covered
   # by `use vars`.
   use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
   $VERSION = "3.0";

   require 5.000;

   use Exporter;
   @ISA = qw(Exporter);

   # Names exported by default.
   @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);

   # Names exported only when the caller asks for them.
   @EXPORT_OK = qw(old_shellwords);
  12
  13
  # shellwords(@lines)
  #
  # Split each of @lines into words on runs of whitespace, honouring quotes
  # and backslash escapes, by handing every line to quotewords('\s+', 0, ...).
  #
  # Returns the flat list of words from all lines.  Returns an empty list when
  # called without arguments or when any line cannot be parsed (quotewords()
  # returns nothing for a non-empty line it could not tokenise).
  sub shellwords {
      # Lexical copy: the caller's strings are not modified and no package
      # variable is overwritten.
      my @lines = @_;
      my @allwords;

      foreach my $line (@lines) {
          next unless defined $line;

          # Leading whitespace would be read as a delimiter that follows an
          # empty word, producing a spurious '' token.
          $line =~ s/^\s+//;
          next unless length($line);

          my @words = quotewords('\s+', 0, $line);
          return() unless @words;

          # Whitespace at the end of a line is a delimiter with nothing
          # after it, which the parser reports as a final undefined token.
          pop(@words) unless defined $words[-1];

          push(@allwords, @words);
      }
      return(@allwords);
  }
  19
  20
  21
  # quotewords($delim, $keep, @lines)
  #
  # Tokenise every line with parse_line($delim, $keep, ...) and return all of
  # the resulting tokens as one flat list.  If a line that has content yields
  # no tokens, the whole call is abandoned and an empty list is returned.
  sub quotewords {
      my($delim, $keep, @lines) = @_;
      my @collected;

      foreach my $text (@lines) {
          my @tokens = parse_line($delim, $keep, $text);

          # No tokens from a non-empty line: treat the input as unparsable.
          if (!@tokens && length($text)) {
              return();
          }
          push(@collected, @tokens);
      }
      return(@collected);
  }
  34
  35
  36
  # nested_quotewords($delim, $keep, @lines)
  #
  # Tokenise every line with parse_line($delim, $keep, ...) and return one
  # array reference per input line, in input order, each holding that line's
  # tokens.  If a line that has content yields no tokens, an empty list is
  # returned instead.
  sub nested_quotewords {
      my($delim, $keep, @lines) = @_;
      my @token_lists;

      foreach my $text (@lines) {
          my @tokens = parse_line($delim, $keep, $text);

          # No tokens from a non-empty line: treat the input as unparsable.
          return() if (!@tokens && length($text));

          push(@token_lists, [ @tokens ]);
      }
      return(@token_lists);
  }
  47
  48
  49
  # parse_line($delimiter, $keep, $line)
  #
  # Split one string into tokens.
  #
  #   $delimiter  regular expression source that separates tokens; it is
  #               matched with its own whitespace taken literally
  #   $keep       false: remove quotes and backslash escapes from the tokens
  #               (backslashes inside single quotes are left alone);
  #               true: leave quotes and backslashes in the tokens;
  #               'delimiters': as true, and additionally return every
  #               matched delimiter as a token of its own
  #   $line       the text to split
  #
  # Returns the list of tokens.  Text that ends in a delimiter produces a
  # final undefined token.  Returns an empty list when the text cannot be
  # consumed, for example because a quote is never closed.
  sub parse_line {
      my($delimiter, $keep, $line) = @_;
      my($word, @pieces);

      my $keep_delims = (defined($keep) && $keep eq 'delimiters');

      while (defined($line) && length($line)) {
          # Remove one chunk from the front of the text.  The result of the
          # substitution is tested directly: after a failed match the special
          # match variables still describe an earlier successful match, so
          # consulting them could make the loop parse unrelated text or never
          # terminate.
          $line =~ s/^(?:
                        (["'])                    # an opening quote
                        ((?:\\.|(?!\1)[^\\])*)    # the quoted text
                        \1                        # closed by the same quote
                      |                           # --OR--
                        ((?:\\.|[^\\"'])*?)       # unquoted text
                        (                         # ended by one of:
                          \Z(?!\n)                #   end of the text
                        | (?-x:$delimiter)        #   the delimiter, taken literally
                        | (?!^)(?=["'])           #   an upcoming quote
                        )
                      )//x
              or return();

          my($quote, $quoted, $unquoted, $delim) = ($1, $2, $3, $4);

          # A match that consumed nothing (possible when the delimiter can
          # match the empty string) would repeat forever.
          return() unless (defined($quote) || length($unquoted) || length($delim));

          if (defined $quote) {
              if ($keep) {
                  $quoted = "$quote$quoted$quote";
              }
              elsif ($quote eq '"') {
                  # Escapes are only processed inside double quotes.
                  $quoted =~ s/\\(.)/$1/g;
              }
              $word .= $quoted;
          }
          else {
              $unquoted =~ s/\\(.)/$1/g unless $keep;
              $word .= $unquoted;
          }

          # A non-empty delimiter ends the current token.  The zero-width
          # "upcoming quote" case does not, so adjacent quoted and unquoted
          # pieces are glued into one token.
          if (defined($delim) && length($delim)) {
              push(@pieces, $word);
              push(@pieces, $delim) if $keep_delims;
              undef $word;
          }
          if (!length($line)) {
              push(@pieces, $word);
          }
      }
      return(@pieces);
  }
  88
  89
  90
  # old_shellwords(@lines)
  #
  # Older whitespace splitter (marked DEPRECATED in this module's synopsis).
  # All arguments are concatenated without a separator and the result is
  # split into words on whitespace.  Double- and single-quoted sections and
  # backslash-escaped characters become part of the current word, with the
  # quotes and escaping backslashes removed (inside both kinds of quotes).
  #
  # Returns the list of words, or an empty list if a quote is never closed.
  sub old_shellwords {

      # Usage:
      #   use Text::ParseWords qw(old_shellwords);
      #   @words = old_shellwords($line);
      #   or
      #   @words = old_shellwords(@lines);

      # Work on a lexical copy rather than on a localized $_.
      my $text = join('', @_);
      my(@words, $snippet, $field);

      $text =~ s/^\s+//;
      while ($text ne '') {
          $field = '';
          for (;;) {
              if ($text =~ s/^"((?:[^"\\]|\\.)*)"//s) {
                  ($snippet = $1) =~ s#\\(.)#$1#sg;
              }
              elsif ($text =~ /^"/) {
                  # Opening double quote without a partner.
                  return();
              }
              elsif ($text =~ s/^'((?:[^'\\]|\\.)*)'//s) {
                  ($snippet = $1) =~ s#\\(.)#$1#sg;
              }
              elsif ($text =~ /^'/) {
                  # Opening single quote without a partner.
                  return();
              }
              elsif ($text =~ s/^\\(.?)//s) {
                  # The escaped character is optional and may be a newline, so
                  # a backslash is always consumed.  Otherwise a backslash at
                  # the very end (or before a newline) would match no branch
                  # and the outer loop would never finish.
                  $snippet = $1;
              }
              elsif ($text =~ s/^([^\s\\'"]+)//) {
                  $snippet = $1;
              }
              else {
                  # Whitespace (or end of text): the current word is complete.
                  $text =~ s/^\s+//;
                  last;
              }
              $field .= $snippet;
          }
          push(@words, $field);
      }
      return @words;
  }
 134
 135 1;
 136
 137 __END__
 138
 139 =head1 NAME
 140
 141 Text::ParseWords - parse text into an array of tokens or array of arrays
 142
 143 =head1 SYNOPSIS
 144
 145   use Text::ParseWords;
 146   @lists = &nested_quotewords($delim, $keep, @lines);
 147   @words = &quotewords($delim, $keep, @lines);
 148   @words = &shellwords(@lines);
 149   @words = &parse_line($delim, $keep, $line);
 150   @words = &old_shellwords(@lines); # DEPRECATED!
 151
 152 =head1 DESCRIPTION
 153
 154 The &nested_quotewords() and &quotewords() functions accept a delimiter
 155 (which can be a regular expression)
 156 and a list of lines and then break those lines up into a list of
 157 words ignoring delimiters that appear inside quotes.  &quotewords()
 158 returns all of the tokens in a single long list, while &nested_quotewords()
 159 returns a list of token lists corresponding to the elements of @lines.
 160 &parse_line() does tokenizing on a single string.  The &*quotewords()
 161 functions simply call &parse_line(), so if you're only splitting
 162 one line you can call &parse_line() directly and save a function
 163 call.
 164
 165 The $keep argument is a boolean flag.  If true, then the tokens are
 166 split on the specified delimiter, but all other characters (quotes,
 167 backslashes, etc.) are kept in the tokens.  If $keep is false then the
 168 &*quotewords() functions remove all quotes and backslashes that are
 169 not themselves backslash-escaped or inside of single quotes (i.e.,
 170 &quotewords() tries to interpret these characters just like the Bourne
 171 shell).  NB: these semantics are significantly different from the
 172 original version of this module shipped with Perl 5.000 through 5.004.
 173 As an additional feature, $keep may be the keyword "delimiters" which
 174 causes the functions to preserve the delimiters in each string as
 175 tokens in the token lists, in addition to preserving quote and
 176 backslash characters.
 177
 178 &shellwords() is written as a special case of &quotewords(), and it
 179 does token parsing with whitespace as a delimiter-- similar to most
 180 Unix shells.
 181
 182 =head1 EXAMPLES
 183
 184 The sample program:
 185
 186   use Text::ParseWords;
 187   @words = &quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
 188   $i = 0;
 189   foreach (@words) {
 190       print "$i: <$_>\n";
 191       $i++;
 192   }
 193
 194 produces:
 195
 196   0: <this>
 197   1: <is>
 198   2: <a test>
 199   3: <of quotewords>
 200   4: <"for>
 201   5: <you>
 202
 203 demonstrating:
 204
 205 =over 4
 206
 207 =item 0
 208 a simple word
 209
 210 =item 1
 211 multiple spaces are skipped because of our $delim
 212
 213 =item 2
 214 use of quotes to include a space in a word
 215
 216 =item 3
 217 use of a backslash to include a space in a word
 218
 219 =item 4
 220 use of a backslash to remove the special meaning of a double-quote
 221
 222 =item 5
 223 another simple word (note the lack of effect of the
 224 backslashed double-quote)
 225
 226 =back
 227
 228 Replacing C<&quotewords('\s+', 0, q{this   is...})>
 229 with C<&shellwords(q{this   is...})>
 230 is a simpler way to accomplish the same thing.
 231
 232 =head1 AUTHORS
 233
 234 Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
 235 author unknown).  Much of the code for &parse_line() (including the
 236 primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
 237
 238 Examples section and other documentation provided by John Heidemann
 239 <johnh@ISI.EDU>
 240
 241 Bug reports, patches, and nagging provided by lots of folks-- thanks
 242 everybody!  Special thanks to Michael Schwern <schwern@envirolink.org>
 243 for assuring me that a &nested_quotewords() would be useful, and to
 244 Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
 245 error-checking (sort of-- you had to be there).
 246
 247 =cut