[p5sagit/p5-mst-13.2.git] / lib / Text / ParseWords.pm

package Text::ParseWords;

# Package globals read by Exporter.  @EXPORT_OK is declared alongside the
# others so every package variable assigned below is pre-declared (required
# if the file is ever compiled under strict, and it keeps the name from
# being seen only once at compile time).
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
$VERSION = "3.0";

require 5.000;

use Exporter;
@ISA = qw(Exporter);

# Imported into the caller's namespace by default.
@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);

# Only imported when the caller asks for it by name.
@EXPORT_OK = qw(old_shellwords);


# shellwords(@lines)
#
# Splits the given lines into words using runs of whitespace as the
# delimiter.  This is a thin wrapper around quotewords('\s+', 0, @lines):
# trailing whitespace is trimmed from the last line, then the lines are
# handed to quotewords() with $keep false.
#
# Returns the list produced by quotewords(), or an empty list when called
# with no arguments.
sub shellwords {
    # Lexical copy: the caller's arguments are left unmodified and no
    # package variable named @lines is touched.
    my(@lines) = @_;

    # Nothing to parse.  Without this guard the substitution below would
    # try to modify element -1 of an empty array, which is a fatal error.
    return() unless @lines;

    $lines[$#lines] =~ s/\s+$//;
    return(quotewords('\s+', 0, @lines));
}


# quotewords($delim, $keep, @lines)
#
# Runs parse_line($delim, $keep, ...) over every element of @lines and
# returns all resulting tokens as one flat list.  If a non-empty line
# yields no tokens at all, parsing is abandoned and an empty list is
# returned.
sub quotewords {
    my($delim, $keep, @lines) = @_;
    my @collected;

    for my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # No tokens from a line that had content means the line could not
        # be parsed; give up on the whole call.
        if (!@tokens && length($text)) {
            return();
        }
        push(@collected, @tokens);
    }

    return(@collected);
}


# nested_quotewords($delim, $keep, @lines)
#
# Like quotewords(), but keeps the tokens of each line separate: the result
# is a list of array references, one per element of @lines, in the same
# order.  If a non-empty line yields no tokens, an empty list is returned
# instead.
sub nested_quotewords {
    my($delim, $keep, @lines) = @_;
    my @rows;

    for my $idx (0 .. $#lines) {
        my @tokens = parse_line($delim, $keep, $lines[$idx]);

        # A line with content but no tokens could not be parsed.
        return() if (!@tokens && length($lines[$idx]));

        $rows[$idx] = [@tokens];
    }

    return(@rows);
}


# parse_line($delimiter, $keep, $line)
#
# Tokenizes a single string.  $delimiter is a regular expression (passed as
# a string) that separates tokens; text inside single or double quotes is
# never split.
#
# $keep selects what is preserved in the tokens:
#   false         quotes are dropped, and the backslash of a
#                 backslash-escaped character is removed in unquoted and
#                 double-quoted text (single-quoted text is left untouched)
#   true          quotes and backslashes are kept
#   'delimiters'  as for true, and every matched delimiter is additionally
#                 returned as an element of its own
#
# Returns the list of tokens.  Returns an empty list when the front of the
# remaining text cannot be matched (for example an unbalanced quote) or when
# a match consumes nothing (for example a delimiter that can match the empty
# string), since no further progress would be possible.
sub parse_line {
    my($delimiter, $keep, $line) = @_;
    my($quote, $quoted, $unquoted, $delim, $word, @pieces);

    # Undefined values are deliberately tested and concatenated below.
    local($^W) = 0;

    while (length($line)) {
        my $remaining = length($line);

        # Strip one chunk off the front of the line.  The substitution's own
        # return value reports failure; the match variables for the whole
        # match and the post-match text are not consulted because, after a
        # failed match, they still describe an earlier successful match
        # (possibly one made by the caller).
        #
        # The delimiter is wrapped in (?-x:...) so whitespace or '#' inside
        # it keeps its literal meaning, and (?!\1) is used to exclude the
        # opening quote because a backslash-digit inside a bracketed class
        # is an octal character, not a backreference.
        $line =~ s/^(?:
                       (["'])                    # an opening quote
                       ((?:\\.|(?!\1)[^\\])*?)   # and the quoted text
                       \1                        # followed by the same quote
                     |                           # --OR--
                       ((?:\\.|[^\\"'])*?)       # some unquoted text
                       (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))
                                                 # plus EOL, delimiter, or quote
                    )//x                         # extended layout
            or return();

        ($quote, $quoted, $unquoted, $delim) = ($1, $2, $3, $4);

        # A successful but empty match would loop forever.
        return() if (length($line) == $remaining);

        if ($keep) {
            $quoted = "$quote$quoted$quote";
        }
        else {
            $unquoted =~ s/\\(.)/$1/g;
            $quoted =~ s/\\(.)/$1/g if ($quote eq '"');
        }
        $word .= ($quote) ? $quoted : $unquoted;

        # A delimiter ends the current word; adjacent quoted and unquoted
        # chunks without a delimiter between them accumulate into one word.
        if (length($delim)) {
            push(@pieces, $word);
            push(@pieces, $delim) if ($keep eq 'delimiters');
            undef $word;
        }
        if (!length($line)) {
            push(@pieces, $word);
        }
    }
    return(@pieces);
}


# old_shellwords(@lines)
#
# Deprecated whitespace tokenizer kept for backward compatibility.
#
# Usage:
#     use ParseWords;
#     @words = old_shellwords($line);
#     or
#     @words = old_shellwords(@lines);
#
# All arguments are concatenated (with no separator) and split into fields
# on whitespace.  Double- and single-quoted sections may contain whitespace;
# in both kinds of quotes, and outside quotes, a backslash is removed and
# the character after it is taken literally.  Adjacent quoted and unquoted
# pieces form a single field.
#
# Returns the list of fields, or an empty list if a quote is left unclosed.
sub old_shellwords {
    local($_) = join('', @_);
    my(@words,$snippet,$field);

    s/^\s+//;
    while ($_ ne '') {
        $field = '';
        for (;;) {
            if (s/^"(([^"\\]|\\.)*)"//) {
                ($snippet = $1) =~ s#\\(.)#$1#g;
            }
            elsif (/^"/) {
                # Opening double quote with no matching close.
                return();
            }
            elsif (s/^'(([^'\\]|\\.)*)'//) {
                ($snippet = $1) =~ s#\\(.)#$1#g;
            }
            elsif (/^'/) {
                # Opening single quote with no matching close.
                return();
            }
            elsif (s/^\\(.?)//s) {
                # The escaped character is optional and may be a newline, so
                # a backslash is always consumed.  Otherwise a trailing
                # backslash (or backslash-newline) would match no branch,
                # never be removed, and the outer loop would spin forever.
                $snippet = $1;
            }
            elsif (s/^([^\s\\'"]+)//) {
                $snippet = $1;
            }
            else {
                # Whitespace or end of input: the current field is complete.
                s/^\s+//;
                last;
            }
            $field .= $snippet;
        }
        push(@words, $field);
    }
    @words;
}

1;

__END__

=head1 NAME

Text::ParseWords - parse text into an array of tokens or array of arrays

=head1 SYNOPSIS

  use Text::ParseWords;
  @lists = &nested_quotewords($delim, $keep, @lines);
  @words = &quotewords($delim, $keep, @lines);
  @words = &shellwords(@lines);
  @words = &parse_line($delim, $keep, $line);
  @words = &old_shellwords(@lines); # DEPRECATED!

=head1 DESCRIPTION

The &nested_quotewords() and &quotewords() functions accept a delimiter
(which can be a regular expression)
and a list of lines and then break those lines up into a list of
words, ignoring delimiters that appear inside quotes.  &quotewords()
returns all of the tokens in a single long list, while &nested_quotewords()
returns a list of token lists corresponding to the elements of @lines.
&parse_line() does tokenizing on a single string.  The &*quotewords()
functions simply call &parse_line(), so if you're only splitting
one line you can call &parse_line() directly and save a function
call.

The $keep argument is a boolean flag.  If true, then the tokens are
split on the specified delimiter, but all other characters (quotes,
backslashes, etc.) are kept in the tokens.  If $keep is false then the
&*quotewords() functions remove all quotes and backslashes that are
not themselves backslash-escaped or inside of single quotes (i.e.,
&quotewords() tries to interpret these characters just like the Bourne
shell).  NB: these semantics are significantly different from the
original version of this module shipped with Perl 5.000 through 5.004.
As an additional feature, $keep may be the keyword "delimiters" which
causes the functions to preserve the delimiters in each string as
tokens in the token lists, in addition to preserving quote and
backslash characters.

&shellwords() is written as a special case of &quotewords(), and it
does token parsing with whitespace as a delimiter-- similar to most
Unix shells.

=head1 EXAMPLES

The sample program:

  use Text::ParseWords;
  @words = &quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
  $i = 0;
  foreach (@words) {
      print "$i: <$_>\n";
      $i++;
  }

produces:

  0: <this>
  1: <is>
  2: <a test>
  3: <of quotewords>
  4: <"for>
  5: <you>

demonstrating:

=over 4

=item 0

a simple word

=item 1

multiple spaces are skipped because of our $delim

=item 2

use of quotes to include a space in a word

=item 3

use of a backslash to include a space in a word

=item 4

use of a backslash to remove the special meaning of a double-quote

=item 5

another simple word (note the lack of effect of the
backslashed double-quote)

=back

Replacing C<&quotewords('\s+', 0, q{this   is...})>
with C<&shellwords(q{this   is...})>
is a simpler way to accomplish the same thing.

=head1 AUTHORS

Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
author unknown).  Much of the code for &parse_line() (including the
primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.

Examples section and other documentation provided by John Heidemann
<johnh@ISI.EDU>.

Bug reports, patches, and nagging provided by lots of folks-- thanks
everybody!  Special thanks to Michael Schwern <schwern@envirolink.org>
for assuring me that a &nested_quotewords() would be useful, and to 
Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
error-checking (sort of-- you had to be there).

=cut
Commit	Line	Data
a0d0e21e	1	package Text::ParseWords;
a0d0e21e	2
9b599b2a	3	use vars qw($VERSION @ISA @EXPORT);
9b599b2a	4	$VERSION = "3.0";
a0d0e21e	5
9b599b2a	6	require 5.000;
dc848c6f	7
9b599b2a	8	use Exporter;
dc848c6f	9	@ISA = qw(Exporter);
9b599b2a	10	@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
a0d0e21e	11	@EXPORT_OK = qw(old_shellwords);
a0d0e21e	12
a5f75d66	13
9b599b2a	14	sub shellwords {
	15	local(@lines) = @_;
	16	$lines[$#lines] =~ s/\s+$//;
	17	return(quotewords('\s+', 0, @lines));
	18	}
a5f75d66	19
a5f75d66	20
a5f75d66	21
9b599b2a	22	sub quotewords {
	23	my($delim, $keep, @lines) = @_;
	24	my($line, @words, @allwords);
	25
	26
	27	foreach $line (@lines) {
	28	@words = parse_line($delim, $keep, $line);
	29	return() unless (@words \|\| !length($line));
	30	push(@allwords, @words);
	31	}
	32	return(@allwords);
	33	}
a5f75d66	34
a5f75d66	35
a5f75d66	36
9b599b2a	37	sub nested_quotewords {
	38	my($delim, $keep, @lines) = @_;
	39	my($i, @allwords);
	40
	41	for ($i = 0; $i < @lines; $i++) {
	42	@{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
	43	return() unless (@{$allwords[$i]} \|\| !length($lines[$i]));
	44	}
	45	return(@allwords);
2304df62	46	}
	47
	48
a0d0e21e	49
9b599b2a	50	sub parse_line {
	51	my($delimiter, $keep, $line) = @_;
	52	my($quote, $quoted, $unquoted, $delim, $word, @pieces);
936c8837	53
9b599b2a	54	while (length($line)) {
	55	($quote, $quoted, $unquoted, $delim) =
	56	$line =~ m/^(["']) # a $quote
	57	((?:\\.\|[^\1\\])*?) # and $quoted text
	58	\1 # followed by the same quote
	59	\| # --OR--
	60	^((?:\\.\|[^\\"'])*?) # an $unquoted text
	61	(\Z(?!\n)\|$delimiter\|(?!^)(?=["']))
	62	# plus EOL, delimiter, or quote
	63	/x; # extended layout
936c8837	64
9b599b2a	65	return() unless(length($&));
9b599b2a	66	$line = $';
936c8837	67
9b599b2a	68	if ($keep) {
	69	$quoted = "$quote$quoted$quote";
	70	}
	71	else {
	72	$unquoted =~ s/\\(.)/$1/g;
	73	$quoted =~ s/\\(.)/$1/g if ($quote eq '"');
	74	}
	75	$word .= ($quote) ? $quoted : $unquoted;
	76
	77	if (length($delim)) {
	78	push(@pieces, $word);
	79	push(@pieces, $delim) if ($keep eq 'delimiters');
	80	undef $word;
	81	}
	82	if (!length($line)) {
	83	push(@pieces, $word);
2304df62	84	}
2304df62	85	}
9b599b2a	86	return(@pieces);
2304df62	87	}
2304df62	88
2304df62	89
9b599b2a	90
a0d0e21e	91	sub old_shellwords {
	92
	93	# Usage:
	94	# use ParseWords;
	95	# @words = old_shellwords($line);
	96	# or
	97	# @words = old_shellwords(@lines);
	98
	99	local($_) = join('', @_);
	100	my(@words,$snippet,$field);
	101
	102	s/^\s+//;
	103	while ($_ ne '') {
	104	$field = '';
	105	for (;;) {
	106	if (s/^"(([^"\\]\|\\.)*)"//) {
	107	($snippet = $1) =~ s#\\(.)#$1#g;
	108	}
	109	elsif (/^"/) {
9b599b2a	110	return();
a0d0e21e	111	}
	112	elsif (s/^'(([^'\\]\|\\.)*)'//) {
	113	($snippet = $1) =~ s#\\(.)#$1#g;
	114	}
	115	elsif (/^'/) {
9b599b2a	116	return();
a0d0e21e	117	}
	118	elsif (s/^\\(.)//) {
	119	$snippet = $1;
	120	}
	121	elsif (s/^([^\s\\'"]+)//) {
	122	$snippet = $1;
	123	}
	124	else {
	125	s/^\s+//;
	126	last;
	127	}
	128	$field .= $snippet;
	129	}
	130	push(@words, $field);
	131	}
	132	@words;
	133	}
9b599b2a	134
	135	1;
	136
	137	__END__
	138
	139	=head1 NAME
	140
	141	Text::ParseWords - parse text into an array of tokens or array of arrays
	142
	143	=head1 SYNOPSIS
	144
	145	use Text::ParseWords;
	146	@lists = &nested_quotewords($delim, $keep, @lines);
	147	@words = &quotewords($delim, $keep, @lines);
	148	@words = &shellwords(@lines);
	149	@words = &parse_line($delim, $keep, $line);
	150	@words = &old_shellwords(@lines); # DEPRECATED!
	151
	152	=head1 DESCRIPTION
	153
	154	The &nested_quotewords() and &quotewords() functions accept a delimiter
	155	(which can be a regular expression)
	156	and a list of lines and then breaks those lines up into a list of
	157	words ignoring delimiters that appear inside quotes. &quotewords()
	158	returns all of the tokens in a single long list, while &nested_quotewords()
	159	returns a list of token lists corresponding to the elements of @lines.
	160	&parse_line() does tokenizing on a single string. The &*quotewords()
	161	functions simply call &parse_lines(), so if you're only splitting
	162	one line you can call &parse_lines() directly and save a function
	163	call.
	164
	165	The $keep argument is a boolean flag. If true, then the tokens are
	166	split on the specified delimiter, but all other characters (quotes,
	167	backslashes, etc.) are kept in the tokens. If $keep is false then the
	168	&*quotewords() functions remove all quotes and backslashes that are
	169	not themselves backslash-escaped or inside of single quotes (i.e.,
	170	&quotewords() tries to interpret these characters just like the Bourne
	171	shell). NB: these semantics are significantly different from the
	172	original version of this module shipped with Perl 5.000 through 5.004.
	173	As an additional feature, $keep may be the keyword "delimiters" which
	174	causes the functions to preserve the delimiters in each string as
	175	tokens in the token lists, in addition to preserving quote and
	176	backslash characters.
	177
	178	&shellwords() is written as a special case of &quotewords(), and it
	179	does token parsing with whitespace as a delimiter-- similar to most
	180	Unix shells.
	181
	182	=head1 EXAMPLES
	183
	184	The sample program:
	185
	186	use Text::ParseWords;
	187	@words = &quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
	188	$i = 0;
	189	foreach (@words) {
	190	print "$i: <$_>\n";
	191	$i++;
	192	}
	193
	194	produces:
	195
	196	0: <this>
	197	1: <is>
198	2: <a test>
199	3: <of quotewords>
200	4: <"for>
201	5: <you>
202
203	demonstrating:
204
205	=over 4
206
207	=item 0
208	a simple word
209
210	=item 1
211	multiple spaces are skipped because of our $delim
212
213	=item 2
214	use of quotes to include a space in a word
215
216	=item 3
217	use of a backslash to include a space in a word
218
219	=item 4
220	use of a backslash to remove the special meaning of a double-quote
221
222	=item 5
223	another simple word (note the lack of effect of the
224	backslashed double-quote)
225
226	=back
227
228	Replacing C<&quotewords('\s+', 0, q{this is...})>
229	with C<&shellwords(q{this is...})>
230	is a simpler way to accomplish the same thing.
231
232	=head1 AUTHORS
233
234	Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
235	author unknown). Much of the code for &parse_line() (including the
236	primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
237
238	Examples section another documentation provided by John Heidemann
239	<johnh@ISI.EDU>
240
241	Bug reports, patches, and nagging provided by lots of folks-- thanks
242	everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
243	for assuring me that a &nested_quotewords() would be useful, and to
244	Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
245	error-checking (sort of-- you had to be there).
246
247	=cut