From: SADAHIRO Tomoyuki Date: Sat, 20 Jan 2007 00:52:42 +0000 (+0900) Subject: Re: [PATCH] fix unicode split /\s+/ X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=613f191e047db38572099edee06ae7ede4343b5b;p=p5sagit%2Fp5-mst-13.2.git Re: [PATCH] fix unicode split /\s+/ Message-Id: <20070120005232.D9CC.BQW10602@nifty.com> Date: Sat, 20 Jan 2007 00:52:42 +0900 p4raw-id: //depot/perl@29887 --- diff --git a/pp.c b/pp.c index 4b021c0..977c2b7 100644 --- a/pp.c +++ b/pp.c @@ -4590,7 +4590,11 @@ PP(pp_split) base = SP - PL_stack_base; orig = s; if (pm->op_pmflags & PMf_SKIPWHITE) { - if (pm->op_pmflags & PMf_LOCALE) { + if (do_utf8) { + while (*s == ' ' || is_utf8_space((U8*)s)) + s += UTF8SKIP(s); + } + else if (pm->op_pmflags & PMf_LOCALE) { while (isSPACE_LC(*s)) s++; } @@ -4606,22 +4610,18 @@ PP(pp_split) if (!limit) limit = maxiters + 2; if (pm->op_pmflags & PMf_WHITE) { - if (do_utf8 && !PL_utf8_space) { - /* force PL_utf8_space to be loaded */ - bool ok; - ENTER; - ok = is_utf8_space((const U8*)" "); - assert(ok); - LEAVE; - } while (--limit) { m = s; /* this one uses 'm' and is a negative test */ if (do_utf8) { - STRLEN uskip; - while (m < strend && - !( *m == ' ' || swash_fetch(PL_utf8_space,(U8*)m, do_utf8) )) - m += UTF8SKIP(m); + while (m < strend && !( *m == ' ' || is_utf8_space((U8*)m) )) { + const int t = UTF8SKIP(m); + /* is_utf8_space returns FALSE for malform utf8 */ + if (strend - m < t) + m = strend; + else + m += t; + } } else if (pm->op_pmflags & PMf_LOCALE) { while (m < strend && !isSPACE_LC(*m)) ++m; @@ -4639,11 +4639,15 @@ PP(pp_split) (void)SvUTF8_on(dstr); XPUSHs(dstr); - s = m + 1; + /* skip the whitespace found last */ + if (do_utf8) + s = m + UTF8SKIP(m); + else + s = m + 1; + /* this one uses 's' and is a positive test */ if (do_utf8) { - while (s < strend && - ( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8) )) + while (s < strend && ( *s == ' ' || is_utf8_space((U8*)s) )) s += UTF8SKIP(s); } else if (pm->op_pmflags & PMf_LOCALE) { while (s < strend && isSPACE_LC(*s)) diff --git a/t/op/split.t b/t/op/split.t index f5d0c41..b6d7570 100755 --- a/t/op/split.t +++ b/t/op/split.t @@ -6,7 +6,7 @@ BEGIN { require './test.pl'; } -plan tests => 80; +plan tests => 130; $FS = ':'; @@ -309,11 +309,15 @@ ok(@ary == 3 && # For terms of use, see http://www.unicode.org/terms_of_use.html # For documentation, see UCD.html my @spaces=( - 0x0009..0x000A, # Cc [5] .. - 0x000C..0x000D, # EXCLUDING \v aka ctl-000B aka vert-tab - 0x0020, # Zs SPACE - 0x0085, # Cc - 0x00A0, # Zs NO-BREAK SPACE + ord("\t"), # Cc + ord("\n"), # Cc + # not PerlSpace # Cc + ord("\f"), # Cc + ord("\r"), # Cc + ord(" "), # Zs SPACE + ord("\N{NEL}"), # Cc + ord("\N{NO-BREAK SPACE}"), + # Zs NO-BREAK SPACE 0x1680, # Zs OGHAM SPACE MARK 0x180E, # Zs MONGOLIAN VOWEL SEPARATOR 0x2000..0x200A, # Zs [11] EN QUAD..HAIR SPACE @@ -325,10 +329,21 @@ ok(@ary == 3 && ); #diag "Have @{[0+@spaces]} to test\n"; foreach my $cp (@spaces) { + my $msg = sprintf "Space: U+%04x", $cp; my $space = chr($cp); - my $str="A:$space:B\x{FFFF}"; + my $str="A:$space:B\x{FFFD}"; chop $str; + my @res=split(/\s+/,$str); - is(0+@res,2) or do { diag sprintf "Char failed: 0x%x",$cp } + ok(@res == 2 && join('-',@res) eq "A:-:B", "$msg - /\\s+/"); + + my $s2 = "$space$space:A:$space$space:B\x{FFFD}"; + chop $s2; + + my @r2 = split(' ',$s2); + ok(@r2 == 2 && join('-', @r2) eq ":A:-:B", "$msg - ' '"); + + my @r3 = split(/\s+/, $s2); + ok(@r3 == 3 && join('-', @r3) eq "-:A:-:B", "$msg - /\\s+/ No.2"); } }