Cleanup the main regex in Text::ParseWords and make the
Marcus Holland-Moritz [Fri, 25 Jun 2004 20:27:05 +0000 (20:27 +0000)]
parse_line() routine faster. Add a Unicode test case.

p4raw-id: //depot/perl@22997

lib/Text/ParseWords.pm
lib/Text/ParseWords.t

index fbc0ee0..cca28bf 100644 (file)
@@ -1,7 +1,7 @@
 package Text::ParseWords;
 
 use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE);
-$VERSION = "3.22";
+$VERSION = "3.23";
 
 require 5.000;
 
@@ -53,32 +53,27 @@ sub parse_line {
        use re 'taint'; # if it's tainted, leave it as such
 
     my($delimiter, $keep, $line) = @_;
-    my($quote, $quoted, $unquoted, $delim, $word, @pieces);
+    my($word, @pieces);
 
     while (length($line)) {
-
-       ($quote, $quoted, undef, $unquoted, $delim, undef) =
-           $line =~ m/^(["'])                 # a $quote
-                        ((?:\\[\000-\377]|(?!\1)[^\\])*)  # and $quoted text
-                        \1                    # followed by the same quote
-                        ([\000-\377]*)        # and the rest
-                      |                       # --OR--
-                       ^((?:\\[\000-\377]|[^\\"'])*?)     # an $unquoted text
-                     (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))  
-                                               # plus EOL, delimiter, or quote
-                      ([\000-\377]*)          # the rest
-                     /x;                      # extended layout
-       return() unless( $quote || length($unquoted) || length($delim));
-
-       $line = $+;
+       $line =~ s/^(["'])                      # a $quote
+                   ((?:\\.|(?!\1)[^\\])*)      # and $quoted text
+                   \1                          # followed by the same quote
+                  |                            # --OR--
+                  ^((?:\\.|[^\\"'])*?)         # an $unquoted text
+                   (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))  
+                                               # plus EOL, delimiter, or quote
+                 //xs;                         # extended layout
+       my($quote, $quoted, $unquoted, $delim) = ($1, $2, $3, $4);
+       return() unless( defined($quote) || length($unquoted) || length($delim));
 
         if ($keep) {
            $quoted = "$quote$quoted$quote";
        }
         else {
-           $unquoted =~ s/\\([\000-\377])/$1/g;
+           $unquoted =~ s/\\(.)/$1/sg;
            if (defined $quote) {
-               $quoted =~ s/\\([\000-\377])/$1/g if ($quote eq '"');
+               $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
                $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
             }
        }
index ef0e562..c776e66 100755 (executable)
@@ -8,7 +8,7 @@ BEGIN {
 use warnings;
 use Text::ParseWords;
 
-print "1..20\n";
+print "1..21\n";
 
 @words = shellwords(qq(foo "bar quiz" zoo));
 print "not " if $words[0] ne 'foo';
@@ -119,3 +119,9 @@ print "ok 19\n";
 $result = join('|', parse_line("\t", 0, $string));
 print "not " unless $result eq "field1|field2\nstill field2|field3";
 print "ok 20\n";
+
+# unicode
+$string = qq{"field1"\x{1234}"field2\\\x{1234}still field2"\x{1234}"field3"};
+$result = join('|', parse_line("\x{1234}", 0, $string));
+print "not " unless $result eq "field1|field2\x{1234}still field2|field3";
+print "ok 21\n";