lib/Text/ParseWords.pm

   1 package Text::ParseWords;
   2
   3 use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE);
   4 $VERSION = "3.26";
   5
   6 require 5.000;
   7
   8 use Exporter;
   9 @ISA = qw(Exporter);
  10 @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
  11 @EXPORT_OK = qw(old_shellwords);
  12
  13
  14 sub shellwords {
  15     my (@lines) = @_;
  16     my @allwords;
  17
  18     foreach my $line (@lines) {
  19         $line =~ s/^\s+//;
  20         my @words = parse_line('\s+', 0, $line);
  21         pop @words if (@words and !defined $words[-1]);
  22         return() unless (@words || !length($line));
  23         push(@allwords, @words);
  24     }
  25     return(@allwords);
  26 }
  27
  28
  29
  30 sub quotewords {
  31     my($delim, $keep, @lines) = @_;
  32     my($line, @words, @allwords);
  33
  34     foreach $line (@lines) {
  35         @words = parse_line($delim, $keep, $line);
  36         return() unless (@words || !length($line));
  37         push(@allwords, @words);
  38     }
  39     return(@allwords);
  40 }
  41
  42
  43
  44 sub nested_quotewords {
  45     my($delim, $keep, @lines) = @_;
  46     my($i, @allwords);
  47
  48     for ($i = 0; $i < @lines; $i++) {
  49         @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
  50         return() unless (@{$allwords[$i]} || !length($lines[$i]));
  51     }
  52     return(@allwords);
  53 }
  54
  55
  56
  57 sub parse_line {
  58     my($delimiter, $keep, $line) = @_;
  59     my($word, @pieces);
  60
  61     no warnings 'uninitialized';        # we will be testing undef strings
  62
  63     while (length($line)) {
  64         # This pattern is optimised to be stack conservative on older perls.
  65         # Do not refactor without being careful and testing it on very long strings.
  66         # See Perl bug #42980 for an example of a stack busting input.
  67         $line =~ s/^
  68                     (?:
  69                         # double quoted string
  70                         (")                             # $quote
  71                         ((?>[^\\"]*(?:\\.[^\\"]*)*))"   # $quoted
  72                     |   # --OR--
  73                         # singe quoted string
  74                         (')                             # $quote
  75                         ((?>[^\\']*(?:\\.[^\\']*)*))'   # $quoted
  76                     |   # --OR--
  77                         # unquoted string
  78                         (                               # $unquoted
  79                             (?:\\.|[^\\"'])*?
  80                         )
  81                         # followed by
  82                         (                               # $delim
  83                             \Z(?!\n)                    # EOL
  84                         |   # --OR--
  85                             (?-x:$delimiter)            # delimiter
  86                         |   # --OR--
  87                             (?!^)(?=["'])               # a quote
  88                         )
  89                     )//xs or return;            # extended layout
  90         my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);
  91
  92
  93         return() unless( defined($quote) || length($unquoted) || length($delim));
  94
  95         if ($keep) {
  96             $quoted = "$quote$quoted$quote";
  97         }
  98         else {
  99             $unquoted =~ s/\\(.)/$1/sg;
 100             if (defined $quote) {
 101                 $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
 102                 $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
 103             }
 104         }
 105         $word .= substr($line, 0, 0);   # leave results tainted
 106         $word .= defined $quote ? $quoted : $unquoted;
 107
 108         if (length($delim)) {
 109             push(@pieces, $word);
 110             push(@pieces, $delim) if ($keep eq 'delimiters');
 111             undef $word;
 112         }
 113         if (!length($line)) {
 114             push(@pieces, $word);
 115         }
 116     }
 117     return(@pieces);
 118 }
 119
 120
 121
 122 sub old_shellwords {
 123
 124     # Usage:
 125     #   use ParseWords;
 126     #   @words = old_shellwords($line);
 127     #   or
 128     #   @words = old_shellwords(@lines);
 129     #   or
 130     #   @words = old_shellwords();      # defaults to $_ (and clobbers it)
 131
 132     no warnings 'uninitialized';        # we will be testing undef strings
 133     local *_ = \join('', @_) if @_;
 134     my (@words, $snippet);
 135
 136     s/\A\s+//;
 137     while ($_ ne '') {
 138         my $field = substr($_, 0, 0);   # leave results tainted
 139         for (;;) {
 140             if (s/\A"(([^"\\]|\\.)*)"//s) {
 141                 ($snippet = $1) =~ s#\\(.)#$1#sg;
 142             }
 143             elsif (/\A"/) {
 144                 require Carp;
 145                 Carp::carp("Unmatched double quote: $_");
 146                 return();
 147             }
 148             elsif (s/\A'(([^'\\]|\\.)*)'//s) {
 149                 ($snippet = $1) =~ s#\\(.)#$1#sg;
 150             }
 151             elsif (/\A'/) {
 152                 require Carp;
 153                 Carp::carp("Unmatched single quote: $_");
 154                 return();
 155             }
 156             elsif (s/\A\\(.?)//s) {
 157                 $snippet = $1;
 158             }
 159             elsif (s/\A([^\s\\'"]+)//) {
 160                 $snippet = $1;
 161             }
 162             else {
 163                 s/\A\s+//;
 164                 last;
 165             }
 166             $field .= $snippet;
 167         }
 168         push(@words, $field);
 169     }
 170     return @words;
 171 }
 172
 173 1;
 174
 175 __END__
 176
 177 =head1 NAME
 178
 179 Text::ParseWords - parse text into an array of tokens or array of arrays
 180
 181 =head1 SYNOPSIS
 182
 183   use Text::ParseWords;
 184   @lists = &nested_quotewords($delim, $keep, @lines);
 185   @words = &quotewords($delim, $keep, @lines);
 186   @words = &shellwords(@lines);
 187   @words = &parse_line($delim, $keep, $line);
 188   @words = &old_shellwords(@lines); # DEPRECATED!
 189
 190 =head1 DESCRIPTION
 191
 192 The &nested_quotewords() and &quotewords() functions accept a delimiter
 193 (which can be a regular expression)
 194 and a list of lines and then breaks those lines up into a list of
 195 words ignoring delimiters that appear inside quotes.  &quotewords()
 196 returns all of the tokens in a single long list, while &nested_quotewords()
 197 returns a list of token lists corresponding to the elements of @lines.
 198 &parse_line() does tokenizing on a single string.  The &*quotewords()
 199 functions simply call &parse_line(), so if you're only splitting
 200 one line you can call &parse_line() directly and save a function
 201 call.
 202
 203 The $keep argument is a boolean flag.  If true, then the tokens are
 204 split on the specified delimiter, but all other characters (quotes,
 205 backslashes, etc.) are kept in the tokens.  If $keep is false then the
 206 &*quotewords() functions remove all quotes and backslashes that are
 207 not themselves backslash-escaped or inside of single quotes (i.e.,
 208 &quotewords() tries to interpret these characters just like the Bourne
 209 shell).  NB: these semantics are significantly different from the
 210 original version of this module shipped with Perl 5.000 through 5.004.
 211 As an additional feature, $keep may be the keyword "delimiters" which
 212 causes the functions to preserve the delimiters in each string as
 213 tokens in the token lists, in addition to preserving quote and
 214 backslash characters.
 215
 216 &shellwords() is written as a special case of &quotewords(), and it
 217 does token parsing with whitespace as a delimiter-- similar to most
 218 Unix shells.
 219
 220 =head1 EXAMPLES
 221
 222 The sample program:
 223
 224   use Text::ParseWords;
 225   @words = &quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
 226   $i = 0;
 227   foreach (@words) {
 228       print "$i: <$_>\n";
 229       $i++;
 230   }
 231
 232 produces:
 233
 234   0: <this>
 235   1: <is>
 236   2: <a test>
 237   3: <of quotewords>
 238   4: <"for>
 239   5: <you>
 240
 241 demonstrating:
 242
 243 =over 4
 244
 245 =item 0
 246
 247 a simple word
 248
 249 =item 1
 250
 251 multiple spaces are skipped because of our $delim
 252
 253 =item 2
 254
 255 use of quotes to include a space in a word
 256
 257 =item 3
 258
 259 use of a backslash to include a space in a word
 260
 261 =item 4
 262
 263 use of a backslash to remove the special meaning of a double-quote
 264
 265 =item 5
 266
 267 another simple word (note the lack of effect of the
 268 backslashed double-quote)
 269
 270 =back
 271
 272 Replacing C<&quotewords('\s+', 0, q{this   is...})>
 273 with C<&shellwords(q{this   is...})>
 274 is a simpler way to accomplish the same thing.
 275
 276 =head1 AUTHORS
 277
 278 Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
 279 author unknown).  Much of the code for &parse_line() (including the
 280 primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
 281
 282 Examples section another documentation provided by John Heidemann
 283 <johnh@ISI.EDU>
 284
 285 Bug reports, patches, and nagging provided by lots of folks-- thanks
 286 everybody!  Special thanks to Michael Schwern <schwern@envirolink.org>
 287 for assuring me that a &nested_quotewords() would be useful, and to
 288 Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
 289 error-checking (sort of-- you had to be there).
 290
 291 =cut