X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=e06152820c304553b6d15e8d00750513a0d9f2c2;hb=70b6a84e977a6211834eeab18dfebf4d45bd1688;hp=7d1c4746333065d3f1b8dc771c98982a6fb233c3;hpb=8fa23287293bbec887dca6d7e006e50e1a445bb2;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index 7d1c474..e061528 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2,7 +2,9 @@ */ /* - * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee + * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee + * + * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] */ /* This file contains functions for compiling a regular expression. See @@ -6615,20 +6617,30 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); UV cp; - char string; len = (STRLEN)(endbrace - name - 2); cp = grok_hex(name + 2, &len, &fl, NULL); if ( len != (STRLEN)(endbrace - name - 2) ) { cp = 0xFFFD; } - if (cp > 0xff) - RExC_utf8 = 1; if ( valuep ) { + if (cp > 0xff) RExC_utf8 = 1; *valuep = cp; return NULL; } - string = (char)cp; - sv_str= newSVpvn(&string, 1); + + /* Need to convert to utf8 if either: won't fit into a byte, or the re + * is going to be in utf8 and the representation changes under utf8. */ + if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) { + U8 string[UTF8_MAXBYTES+1]; + U8 *tmps; + RExC_utf8 = 1; + tmps = uvuni_to_utf8(string, cp); + sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE); + } else { /* Otherwise, no need for utf8, can skip that step */ + char string; + string = (char)cp; + sv_str= newSVpvn(&string, 1); + } } else { /* fetch the charnames handler for this scope */ HV * const table = GvHV(PL_hintgv); @@ -6807,7 +6819,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) Set_Node_Cur_Length(ret); /* MJD */ RExC_parse--; nextchar(pRExC_state); - } else { + } else { /* zero length */ ret = reg_node(pRExC_state,NOTHING); } if (!cached) { @@ -7418,6 +7430,19 @@ tryagain: I32 flags = 0; STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); + + /* An octal above 0xff is interpreted differently + * depending on if the re is in utf8 or not. If it + * is in utf8, the value will be itself, otherwise + * it is interpreted as modulo 0x100. It has been + * decided to discourage the use of octal above the + * single-byte range. For now, warn only when + * it ends up modulo */ + if (SIZE_ONLY && ender >= 0x100 + && ! UTF && ! PL_encoding + && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) { + vWARNdep(p, "Use of octal value above 377 is deprecated"); + } p += numlen; } else { @@ -7792,6 +7817,22 @@ case ANYOF_N##NAME: \ what = WORD; \ break +/* + We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test + so that it is possible to override the option here without having to + rebuild the entire core. as we are required to do if we change regcomp.h + which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. +*/ +#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS +#define BROKEN_UNICODE_CHARCLASS_MAPPINGS +#endif + +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS +#define POSIX_CC_UNI_NAME(CCNAME) CCNAME +#else +#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME +#endif + /* parse a class specification and produce either an ANYOF node that matches the pattern or if the pattern matches a single char only and @@ -8080,18 +8121,24 @@ parseit: * A similar issue a little earlier when switching on value. * --jhi */ switch ((I32)namedclass) { + + case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum")); + case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha")); + case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank")); + case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl")); + case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph")); + case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower")); + case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print")); + case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space")); + case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct")); + case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper")); +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS case _C_C_T_(ALNUM, isALNUM(value), "Word"); - case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum"); - case _C_C_T_(ALPHA, isALPHA(value), "Alpha"); - case _C_C_T_(BLANK, isBLANK(value), "Blank"); - case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl"); - case _C_C_T_(GRAPH, isGRAPH(value), "Graph"); - case _C_C_T_(LOWER, isLOWER(value), "Lower"); - case _C_C_T_(PRINT, isPRINT(value), "Print"); - case _C_C_T_(PSXSPC, isPSXSPC(value), "Space"); - case _C_C_T_(PUNCT, isPUNCT(value), "Punct"); case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); - case _C_C_T_(UPPER, isUPPER(value), "Upper"); +#else + case _C_C_T_(SPACE, isSPACE(value), "PerlSpace"); + case _C_C_T_(ALNUM, isALNUM(value), "PerlWord"); +#endif case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); @@ -8138,7 +8185,7 @@ parseit: ANYOF_BITMAP_SET(ret, value); } yesno = '+'; - what = "Digit"; + what = POSIX_CC_UNI_NAME("Digit"); break; case ANYOF_NDIGIT: if (LOC) @@ -8151,7 +8198,7 @@ parseit: ANYOF_BITMAP_SET(ret, value); } yesno = '!'; - what = "Digit"; + what = POSIX_CC_UNI_NAME("Digit"); break; case ANYOF_MAX: /* this is to handle \p and \P */ @@ -8962,6 +9009,17 @@ Perl_regdump(pTHX_ const regexp *r) /* - regprop - printable representation of opcode */ +#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ +STMT_START { \ + if (do_sep) { \ + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ + if (flags & ANYOF_INVERT) \ + /*make sure the invert info is in each */ \ + sv_catpvs(sv, "^"); \ + do_sep = 0; \ + } \ +} STMT_END + void Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) { @@ -9087,6 +9145,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); + int do_sep = 0; /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ static const char * const anyofs[] = { @@ -9102,8 +9161,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^alpha:]", "[:ascii:]", "[:^ascii:]", - "[:ctrl:]", - "[:^ctrl:]", + "[:cntrl:]", + "[:^cntrl:]", "[:graph:]", "[:^graph:]", "[:lower:]", @@ -9129,6 +9188,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) sv_catpvs(sv, "^"); + + /* output what the standard cp 0-255 bitmap matches */ for (i = 0; i <= 256; i++) { if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { if (rangestart == -1) @@ -9142,15 +9203,23 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpvs(sv, "-"); put_byte(sv, i - 1); } + do_sep = 1; rangestart = -1; } } - + + EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + /* output any special charclass tests (used mostly under use locale) */ if (o->flags & ANYOF_CLASS) for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) - if (ANYOF_CLASS_TEST(o,i)) + if (ANYOF_CLASS_TEST(o,i)) { sv_catpv(sv, anyofs[i]); - + do_sep = 1; + } + + EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + + /* output information about the unicode matching */ if (flags & ANYOF_UNICODE) sv_catpvs(sv, "{unicode}"); else if (flags & ANYOF_UNICODE_ALL) @@ -9163,7 +9232,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (lv) { if (sw) { U8 s[UTF8_MAXBYTES_CASE+1]; - + for (i = 0; i <= 256; i++) { /* just the first 256 */ uvchr_to_utf8(s, i);