lib/Text/ParseWords.pm

   1 package Text::ParseWords;
   2
   3 use strict;
   4 require 5.006;
   5 our $VERSION = "3.27";
   6
   7
   8 use Exporter;
   9 our @ISA = qw(Exporter);
  10 our @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
  11 our @EXPORT_OK = qw(old_shellwords);
  12 our $PERL_SINGLE_QUOTE;
  13
  14
  15 sub shellwords {
  16     my (@lines) = @_;
  17     my @allwords;
  18
  19     foreach my $line (@lines) {
  20         $line =~ s/^\s+//;
  21         my @words = parse_line('\s+', 0, $line);
  22         pop @words if (@words and !defined $words[-1]);
  23         return() unless (@words || !length($line));
  24         push(@allwords, @words);
  25     }
  26     return(@allwords);
  27 }
  28
  29
  30
  31 sub quotewords {
  32     my($delim, $keep, @lines) = @_;
  33     my($line, @words, @allwords);
  34
  35     foreach $line (@lines) {
  36         @words = parse_line($delim, $keep, $line);
  37         return() unless (@words || !length($line));
  38         push(@allwords, @words);
  39     }
  40     return(@allwords);
  41 }
  42
  43
  44
  45 sub nested_quotewords {
  46     my($delim, $keep, @lines) = @_;
  47     my($i, @allwords);
  48
  49     for ($i = 0; $i < @lines; $i++) {
  50         @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
  51         return() unless (@{$allwords[$i]} || !length($lines[$i]));
  52     }
  53     return(@allwords);
  54 }
  55
  56
  57
  58 sub parse_line {
  59     my($delimiter, $keep, $line) = @_;
  60     my($word, @pieces);
  61
  62     no warnings 'uninitialized';        # we will be testing undef strings
  63
  64     while (length($line)) {
  65         # This pattern is optimised to be stack conservative on older perls.
  66         # Do not refactor without being careful and testing it on very long strings.
  67         # See Perl bug #42980 for an example of a stack busting input.
  68         $line =~ s/^
  69                     (?:
  70                         # double quoted string
  71                         (")                             # $quote
  72                         ((?>[^\\"]*(?:\\.[^\\"]*)*))"   # $quoted
  73                     |   # --OR--
  74                         # singe quoted string
  75                         (')                             # $quote
  76                         ((?>[^\\']*(?:\\.[^\\']*)*))'   # $quoted
  77                     |   # --OR--
  78                         # unquoted string
  79                         (                               # $unquoted
  80                             (?:\\.|[^\\"'])*?
  81                         )
  82                         # followed by
  83                         (                               # $delim
  84                             \Z(?!\n)                    # EOL
  85                         |   # --OR--
  86                             (?-x:$delimiter)            # delimiter
  87                         |   # --OR--
  88                             (?!^)(?=["'])               # a quote
  89                         )
  90                     )//xs or return;            # extended layout
  91         my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);
  92
  93
  94         return() unless( defined($quote) || length($unquoted) || length($delim));
  95
  96         if ($keep) {
  97             $quoted = "$quote$quoted$quote";
  98         }
  99         else {
 100             $unquoted =~ s/\\(.)/$1/sg;
 101             if (defined $quote) {
 102                 $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
 103                 $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
 104             }
 105         }
 106         $word .= substr($line, 0, 0);   # leave results tainted
 107         $word .= defined $quote ? $quoted : $unquoted;
 108
 109         if (length($delim)) {
 110             push(@pieces, $word);
 111             push(@pieces, $delim) if ($keep eq 'delimiters');
 112             undef $word;
 113         }
 114         if (!length($line)) {
 115             push(@pieces, $word);
 116         }
 117     }
 118     return(@pieces);
 119 }
 120
 121
 122
 123 sub old_shellwords {
 124
 125     # Usage:
 126     #   use ParseWords;
 127     #   @words = old_shellwords($line);
 128     #   or
 129     #   @words = old_shellwords(@lines);
 130     #   or
 131     #   @words = old_shellwords();      # defaults to $_ (and clobbers it)
 132
 133     no warnings 'uninitialized';        # we will be testing undef strings
 134     local *_ = \join('', @_) if @_;
 135     my (@words, $snippet);
 136
 137     s/\A\s+//;
 138     while ($_ ne '') {
 139         my $field = substr($_, 0, 0);   # leave results tainted
 140         for (;;) {
 141             if (s/\A"(([^"\\]|\\.)*)"//s) {
 142                 ($snippet = $1) =~ s#\\(.)#$1#sg;
 143             }
 144             elsif (/\A"/) {
 145                 require Carp;
 146                 Carp::carp("Unmatched double quote: $_");
 147                 return();
 148             }
 149             elsif (s/\A'(([^'\\]|\\.)*)'//s) {
 150                 ($snippet = $1) =~ s#\\(.)#$1#sg;
 151             }
 152             elsif (/\A'/) {
 153                 require Carp;
 154                 Carp::carp("Unmatched single quote: $_");
 155                 return();
 156             }
 157             elsif (s/\A\\(.?)//s) {
 158                 $snippet = $1;
 159             }
 160             elsif (s/\A([^\s\\'"]+)//) {
 161                 $snippet = $1;
 162             }
 163             else {
 164                 s/\A\s+//;
 165                 last;
 166             }
 167             $field .= $snippet;
 168         }
 169         push(@words, $field);
 170     }
 171     return @words;
 172 }
 173
 174 1;
 175
 176 __END__
 177
 178 =head1 NAME
 179
 180 Text::ParseWords - parse text into an array of tokens or array of arrays
 181
 182 =head1 SYNOPSIS
 183
 184   use Text::ParseWords;
 185   @lists = nested_quotewords($delim, $keep, @lines);
 186   @words = quotewords($delim, $keep, @lines);
 187   @words = shellwords(@lines);
 188   @words = parse_line($delim, $keep, $line);
 189   @words = old_shellwords(@lines); # DEPRECATED!
 190
 191 =head1 DESCRIPTION
 192
 193 The &nested_quotewords() and &quotewords() functions accept a delimiter
 194 (which can be a regular expression)
 195 and a list of lines and then breaks those lines up into a list of
 196 words ignoring delimiters that appear inside quotes.  &quotewords()
 197 returns all of the tokens in a single long list, while &nested_quotewords()
 198 returns a list of token lists corresponding to the elements of @lines.
 199 &parse_line() does tokenizing on a single string.  The &*quotewords()
 200 functions simply call &parse_line(), so if you're only splitting
 201 one line you can call &parse_line() directly and save a function
 202 call.
 203
 204 The $keep argument is a boolean flag.  If true, then the tokens are
 205 split on the specified delimiter, but all other characters (quotes,
 206 backslashes, etc.) are kept in the tokens.  If $keep is false then the
 207 &*quotewords() functions remove all quotes and backslashes that are
 208 not themselves backslash-escaped or inside of single quotes (i.e.,
 209 &quotewords() tries to interpret these characters just like the Bourne
 210 shell).  NB: these semantics are significantly different from the
 211 original version of this module shipped with Perl 5.000 through 5.004.
 212 As an additional feature, $keep may be the keyword "delimiters" which
 213 causes the functions to preserve the delimiters in each string as
 214 tokens in the token lists, in addition to preserving quote and
 215 backslash characters.
 216
 217 &shellwords() is written as a special case of &quotewords(), and it
 218 does token parsing with whitespace as a delimiter-- similar to most
 219 Unix shells.
 220
 221 =head1 EXAMPLES
 222
 223 The sample program:
 224
 225   use Text::ParseWords;
 226   @words = quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
 227   $i = 0;
 228   foreach (@words) {
 229       print "$i: <$_>\n";
 230       $i++;
 231   }
 232
 233 produces:
 234
 235   0: <this>
 236   1: <is>
 237   2: <a test>
 238   3: <of quotewords>
 239   4: <"for>
 240   5: <you>
 241
 242 demonstrating:
 243
 244 =over 4
 245
 246 =item 0
 247
 248 a simple word
 249
 250 =item 1
 251
 252 multiple spaces are skipped because of our $delim
 253
 254 =item 2
 255
 256 use of quotes to include a space in a word
 257
 258 =item 3
 259
 260 use of a backslash to include a space in a word
 261
 262 =item 4
 263
 264 use of a backslash to remove the special meaning of a double-quote
 265
 266 =item 5
 267
 268 another simple word (note the lack of effect of the
 269 backslashed double-quote)
 270
 271 =back
 272
 273 Replacing C<quotewords('\s+', 0, q{this   is...})>
 274 with C<shellwords(q{this   is...})>
 275 is a simpler way to accomplish the same thing.
 276
 277 =head1 AUTHORS
 278
 279 Maintainer: Alexandr Ciornii <alexchornyATgmail.com>.
 280
 281 Previous maintainer: Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
 282 author unknown).  Much of the code for &parse_line() (including the
 283 primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
 284
 285 Examples section another documentation provided by John Heidemann
 286 <johnh@ISI.EDU>
 287
 288 Bug reports, patches, and nagging provided by lots of folks-- thanks
 289 everybody!  Special thanks to Michael Schwern <schwern@envirolink.org>
 290 for assuring me that a &nested_quotewords() would be useful, and to
 291 Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
 292 error-checking (sort of-- you had to be there).
 293
 294 =cut