From: Yves Orton Date: Fri, 19 Jan 2007 02:14:06 +0000 (+0100) Subject: fix unicode split /\s+/ X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=8727f688bf9bab57862da9dd9073020b13c82940;p=p5sagit%2Fp5-mst-13.2.git fix unicode split /\s+/ Message-ID: <9b18b3110701181714r4f3bc9ebq9ba462eba8338734@mail.gmail.com> p4raw-id: //depot/perl@29880 --- diff --git a/pp.c b/pp.c index 4523584..4b021c0 100644 --- a/pp.c +++ b/pp.c @@ -4606,12 +4606,29 @@ PP(pp_split) if (!limit) limit = maxiters + 2; if (pm->op_pmflags & PMf_WHITE) { + if (do_utf8 && !PL_utf8_space) { + /* force PL_utf8_space to be loaded */ + bool ok; + ENTER; + ok = is_utf8_space((const U8*)" "); + assert(ok); + LEAVE; + } while (--limit) { m = s; - while (m < strend && - !((pm->op_pmflags & PMf_LOCALE) - ? isSPACE_LC(*m) : isSPACE(*m))) - ++m; + /* this one uses 'm' and is a negative test */ + if (do_utf8) { + STRLEN uskip; + while (m < strend && + !( *m == ' ' || swash_fetch(PL_utf8_space,(U8*)m, do_utf8) )) + m += UTF8SKIP(m); + } else if (pm->op_pmflags & PMf_LOCALE) { + while (m < strend && !isSPACE_LC(*m)) + ++m; + } else { + while (m < strend && !isSPACE(*m)) + ++m; + } if (m >= strend) break; @@ -4623,10 +4640,18 @@ PP(pp_split) XPUSHs(dstr); s = m + 1; - while (s < strend && - ((pm->op_pmflags & PMf_LOCALE) - ? isSPACE_LC(*s) : isSPACE(*s))) - ++s; + /* this one uses 's' and is a positive test */ + if (do_utf8) { + while (s < strend && + ( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8) )) + s += UTF8SKIP(s); + } else if (pm->op_pmflags & PMf_LOCALE) { + while (s < strend && isSPACE_LC(*s)) + ++s; + } else { + while (s < strend && isSPACE(*s)) + ++s; + } } } else if (rx->extflags & RXf_START_ONLY) { diff --git a/t/op/split.t b/t/op/split.t index d2deff3..f5d0c41 100755 --- a/t/op/split.t +++ b/t/op/split.t @@ -6,7 +6,7 @@ BEGIN { require './test.pl'; } -plan tests => 55; +plan tests => 80; $FS = ':'; @@ -297,4 +297,38 @@ ok(@ary == 3 && $x = \$a[2]; is (ref $x, 'SCALAR', '#28938 - garbage after extend'); } - +{ + # check the special casing of split /\s/ and unicode + use charnames qw(:full); + # below test data is extracted from + # PropList-5.0.0.txt + # Date: 2006-06-07, 23:22:52 GMT [MD] + # + # Unicode Character Database + # Copyright (c) 1991-2006 Unicode, Inc. + # For terms of use, see http://www.unicode.org/terms_of_use.html + # For documentation, see UCD.html + my @spaces=( + 0x0009..0x000A, # Cc [5] .. + 0x000C..0x000D, # EXCLUDING \v aka ctl-000B aka vert-tab + 0x0020, # Zs SPACE + 0x0085, # Cc + 0x00A0, # Zs NO-BREAK SPACE + 0x1680, # Zs OGHAM SPACE MARK + 0x180E, # Zs MONGOLIAN VOWEL SEPARATOR + 0x2000..0x200A, # Zs [11] EN QUAD..HAIR SPACE + 0x2028, # Zl LINE SEPARATOR + 0x2029, # Zp PARAGRAPH SEPARATOR + 0x202F, # Zs NARROW NO-BREAK SPACE + 0x205F, # Zs MEDIUM MATHEMATICAL SPACE + 0x3000 # Zs IDEOGRAPHIC SPACE + ); + #diag "Have @{[0+@spaces]} to test\n"; + foreach my $cp (@spaces) { + my $space = chr($cp); + my $str="A:$space:B\x{FFFF}"; + chop $str; + my @res=split(/\s+/,$str); + is(0+@res,2) or do { diag sprintf "Char failed: 0x%x",$cp } + } +}