From: Yves Orton Date: Sat, 17 Oct 2009 20:47:20 +0000 (+0200) Subject: somewhat fix failing regex tests. but break lots of other stuff at the same time X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d1eb31775715b0fcd7f36308da961c0698205d9f;p=p5sagit%2Fp5-mst-13.2.git somewhat fix failing regex tests. but break lots of other stuff at the same time --- diff --git a/embed.fnc b/embed.fnc index 0fd0a41..9626427 100644 --- a/embed.fnc +++ b/embed.fnc @@ -489,8 +489,11 @@ ApR |bool |is_utf8_idcont |NN const U8 *p ApR |bool |is_utf8_alpha |NN const U8 *p ApR |bool |is_utf8_ascii |NN const U8 *p ApR |bool |is_utf8_space |NN const U8 *p +ApR |bool |is_utf8_perl_space |NN const U8 *p +ApR |bool |is_utf8_perl_word |NN const U8 *p ApR |bool |is_utf8_cntrl |NN const U8 *p ApR |bool |is_utf8_digit |NN const U8 *p +ApR |bool |is_utf8_posix_digit |NN const U8 *p ApR |bool |is_utf8_graph |NN const U8 *p ApR |bool |is_utf8_upper |NN const U8 *p ApR |bool |is_utf8_lower |NN const U8 *p diff --git a/embed.h b/embed.h index 66c3194..057c986 100644 --- a/embed.h +++ b/embed.h @@ -378,8 +378,11 @@ #define is_utf8_alpha Perl_is_utf8_alpha #define is_utf8_ascii Perl_is_utf8_ascii #define is_utf8_space Perl_is_utf8_space +#define is_utf8_perl_space Perl_is_utf8_perl_space +#define is_utf8_perl_word Perl_is_utf8_perl_word #define is_utf8_cntrl Perl_is_utf8_cntrl #define is_utf8_digit Perl_is_utf8_digit +#define is_utf8_posix_digit Perl_is_utf8_posix_digit #define is_utf8_graph Perl_is_utf8_graph #define is_utf8_upper Perl_is_utf8_upper #define is_utf8_lower Perl_is_utf8_lower @@ -2746,8 +2749,11 @@ #define is_utf8_alpha(a) Perl_is_utf8_alpha(aTHX_ a) #define is_utf8_ascii(a) Perl_is_utf8_ascii(aTHX_ a) #define is_utf8_space(a) Perl_is_utf8_space(aTHX_ a) +#define is_utf8_perl_space(a) Perl_is_utf8_perl_space(aTHX_ a) +#define is_utf8_perl_word(a) Perl_is_utf8_perl_word(aTHX_ a) #define is_utf8_cntrl(a) Perl_is_utf8_cntrl(aTHX_ a) #define is_utf8_digit(a) Perl_is_utf8_digit(aTHX_ a) +#define is_utf8_posix_digit(a) Perl_is_utf8_posix_digit(aTHX_ a) #define is_utf8_graph(a) Perl_is_utf8_graph(aTHX_ a) #define is_utf8_upper(a) Perl_is_utf8_upper(aTHX_ a) #define is_utf8_lower(a) Perl_is_utf8_lower(aTHX_ a) diff --git a/embedvar.h b/embedvar.h index 2a9866f..9d6d4c3 100644 --- a/embedvar.h +++ b/embedvar.h @@ -333,6 +333,9 @@ #define PL_utf8_idstart (vTHX->Iutf8_idstart) #define PL_utf8_lower (vTHX->Iutf8_lower) #define PL_utf8_mark (vTHX->Iutf8_mark) +#define PL_utf8_perl_space (vTHX->Iutf8_perl_space) +#define PL_utf8_perl_word (vTHX->Iutf8_perl_word) +#define PL_utf8_posix_digit (vTHX->Iutf8_posix_digit) #define PL_utf8_print (vTHX->Iutf8_print) #define PL_utf8_punct (vTHX->Iutf8_punct) #define PL_utf8_space (vTHX->Iutf8_space) @@ -646,6 +649,9 @@ #define PL_Iutf8_idstart PL_utf8_idstart #define PL_Iutf8_lower PL_utf8_lower #define PL_Iutf8_mark PL_utf8_mark +#define PL_Iutf8_perl_space PL_utf8_perl_space +#define PL_Iutf8_perl_word PL_utf8_perl_word +#define PL_Iutf8_posix_digit PL_utf8_posix_digit #define PL_Iutf8_print PL_utf8_print #define PL_Iutf8_punct PL_utf8_punct #define PL_Iutf8_space PL_utf8_space diff --git a/global.sym b/global.sym index 7205e22..b554d88 100644 --- a/global.sym +++ b/global.sym @@ -229,8 +229,11 @@ Perl_is_utf8_idcont Perl_is_utf8_alpha Perl_is_utf8_ascii Perl_is_utf8_space +Perl_is_utf8_perl_space +Perl_is_utf8_perl_word Perl_is_utf8_cntrl Perl_is_utf8_digit +Perl_is_utf8_posix_digit Perl_is_utf8_graph Perl_is_utf8_upper Perl_is_utf8_lower diff --git a/intrpvar.h b/intrpvar.h index 02d6515..10cd6b7 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -516,6 +516,9 @@ PERLVAR(Iutf8_alnum, SV *) PERLVAR(Iutf8_ascii, SV *) PERLVAR(Iutf8_alpha, SV *) PERLVAR(Iutf8_space, SV *) +PERLVAR(Iutf8_perl_space, SV *) +PERLVAR(Iutf8_perl_word, SV *) +PERLVAR(Iutf8_posix_digit, SV *) PERLVAR(Iutf8_cntrl, SV *) PERLVAR(Iutf8_graph, SV *) PERLVAR(Iutf8_digit, SV *) diff --git a/lib/vars.pm b/lib/vars.pm index a0151b8..cff63d6 100644 --- a/lib/vars.pm +++ b/lib/vars.pm @@ -13,7 +13,7 @@ sub import { my ($sym, $ch); foreach (@imports) { if (($ch, $sym) = /^([\$\@\%\*\&])(.+)/) { - if ($sym =~ /\W/) { + if ($sym =~ /\P{IsWord}/) { # time for a more-detailed check-up if ($sym =~ /^\w+[[{].*[]}]$/) { require Carp; diff --git a/perlapi.h b/perlapi.h index 1d65db5..d819bc8 100644 --- a/perlapi.h +++ b/perlapi.h @@ -702,6 +702,12 @@ END_EXTERN_C #define PL_utf8_lower (*Perl_Iutf8_lower_ptr(aTHX)) #undef PL_utf8_mark #define PL_utf8_mark (*Perl_Iutf8_mark_ptr(aTHX)) +#undef PL_utf8_perl_space +#define PL_utf8_perl_space (*Perl_Iutf8_perl_space_ptr(aTHX)) +#undef PL_utf8_perl_word +#define PL_utf8_perl_word (*Perl_Iutf8_perl_word_ptr(aTHX)) +#undef PL_utf8_posix_digit +#define PL_utf8_posix_digit (*Perl_Iutf8_posix_digit_ptr(aTHX)) #undef PL_utf8_print #define PL_utf8_print (*Perl_Iutf8_print_ptr(aTHX)) #undef PL_utf8_punct diff --git a/proto.h b/proto.h index 7d47e9b..b81d749 100644 --- a/proto.h +++ b/proto.h @@ -1335,6 +1335,18 @@ PERL_CALLCONV bool Perl_is_utf8_space(pTHX_ const U8 *p) #define PERL_ARGS_ASSERT_IS_UTF8_SPACE \ assert(p) +PERL_CALLCONV bool Perl_is_utf8_perl_space(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_perl_word(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD \ + assert(p) + PERL_CALLCONV bool Perl_is_utf8_cntrl(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); @@ -1347,6 +1359,12 @@ PERL_CALLCONV bool Perl_is_utf8_digit(pTHX_ const U8 *p) #define PERL_ARGS_ASSERT_IS_UTF8_DIGIT \ assert(p) +PERL_CALLCONV bool Perl_is_utf8_posix_digit(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT \ + assert(p) + PERL_CALLCONV bool Perl_is_utf8_graph(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); diff --git a/regcomp.h b/regcomp.h index 198961c..a1aba15 100644 --- a/regcomp.h +++ b/regcomp.h @@ -197,7 +197,7 @@ struct regnode_2 { #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ -#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 40 (8*5) named classes */ +#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ /* also used by trie */ struct regnode_charclass { diff --git a/regexec.c b/regexec.c index efa44a5..e59b501 100644 --- a/regexec.c +++ b/regexec.c @@ -126,6 +126,36 @@ #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86") +/* + We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test + so that it is possible to override the option here without having to + rebuild the entire core. as we are required to do if we change regcomp.h + which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. +*/ +#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS +#define BROKEN_UNICODE_CHARCLASS_MAPPINGS +#endif + +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS +#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM() +#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE() +#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT() +#define RE_utf8_perl_word PL_utf8_alnum +#define RE_utf8_perl_space PL_utf8_space +#define RE_utf8_posix_digit PL_utf8_digit +#define perl_word alnum +#define perl_space space +#define posix_digit digit +#else +#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a") +#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ") +#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0") +#define RE_utf8_perl_word PL_utf8_perl_word +#define RE_utf8_perl_space PL_utf8_perl_space +#define RE_utf8_posix_digit PL_utf8_posix_digit +#endif + + #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ case NAMEL: \ PL_reg_flags |= RF_tainted; \ @@ -189,6 +219,9 @@ break + + + /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ /* for use after a quantifier and before an EXACT-like node -- japhy */ @@ -1491,8 +1524,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case ALNUM: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_ALNUM(), - swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8), + LOAD_UTF8_CHARCLASS_PERL_WORD(), + swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8), isALNUM(*s) ); case ALNUML: @@ -1502,8 +1535,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); case NALNUM: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_ALNUM(), - !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8), + LOAD_UTF8_CHARCLASS_PERL_WORD(), + !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8), !isALNUM(*s) ); case NALNUML: @@ -1513,8 +1546,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); case SPACE: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_SPACE(), - *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8), + LOAD_UTF8_CHARCLASS_PERL_SPACE(), + *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8), isSPACE(*s) ); case SPACEL: @@ -1524,8 +1557,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); case NSPACE: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_SPACE(), - !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)), + LOAD_UTF8_CHARCLASS_PERL_SPACE(), + !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)), !isSPACE(*s) ); case NSPACEL: @@ -1535,8 +1568,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); case DIGIT: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_DIGIT(), - swash_fetch(PL_utf8_digit,(U8*)s, do_utf8), + LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), + swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8), isDIGIT(*s) ); case DIGITL: @@ -1546,8 +1579,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); case NDIGIT: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_DIGIT(), - !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8), + LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), + !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8), !isDIGIT(*s) ); case NDIGITL: @@ -3484,14 +3517,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) sayNO; break; /* Special char classes - The defines start on line 129 or so */ - CCC_TRY_AFF( ALNUM, ALNUML, alnum, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); - CCC_TRY_NEG(NALNUM, NALNUML, alnum, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); + CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); + CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); - CCC_TRY_AFF( SPACE, SPACEL, space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); - CCC_TRY_NEG(NSPACE, NSPACEL, space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); + CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); + CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); - CCC_TRY_AFF( DIGIT, DIGITL, digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); - CCC_TRY_NEG(NDIGIT, NDIGITL, digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); + CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); + CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); case CLUMP: if (locinput >= PL_regeol) diff --git a/t/op/lc.t b/t/op/lc.t index 5f4c6b4..6b7625b 100644 --- a/t/op/lc.t +++ b/t/op/lc.t @@ -141,13 +141,13 @@ $b = "\x{3a3}FOO.BAR"; # \x{3a3} == GREEK CAPITAL LETTER SIGMA. ($c = $b) =~ s/(\w+)/lc($1)/ge; is($c , $a, "Using s///e to change case."); -($c = $a) =~ s/(\w+)/uc($1)/ge; +($c = $a) =~ s/(\p{IsWord}+)/uc($1)/ge; is($c , $b, "Using s///e to change case."); -($c = $b) =~ s/(\w+)/lcfirst($1)/ge; +($c = $b) =~ s/(\p{IsWord}+)/lcfirst($1)/ge; is($c , "\x{3c3}FOO.bAR", "Using s///e to change case."); -($c = $a) =~ s/(\w+)/ucfirst($1)/ge; +($c = $a) =~ s/(\p{IsWord}+)/ucfirst($1)/ge; is($c , "\x{3a3}foo.Bar", "Using s///e to change case."); # #18931: perl5.8.0 bug in \U..\E processing diff --git a/utf8.c b/utf8.c index 3de02ed..6907b7e 100644 --- a/utf8.c +++ b/utf8.c @@ -1375,6 +1375,26 @@ Perl_is_utf8_space(pTHX_ const U8 *p) } bool +Perl_is_utf8_perl_space(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE; + + return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace"); +} + +bool +Perl_is_utf8_perl_word(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD; + + return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord"); +} + +bool Perl_is_utf8_digit(pTHX_ const U8 *p) { dVAR; @@ -1385,6 +1405,16 @@ Perl_is_utf8_digit(pTHX_ const U8 *p) } bool +Perl_is_utf8_posix_digit(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT; + + return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit"); +} + +bool Perl_is_utf8_upper(pTHX_ const U8 *p) { dVAR; @@ -1451,7 +1481,7 @@ Perl_is_utf8_xdigit(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_XDIGIT; - return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit"); + return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit"); } bool