*/
/*
- * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
+ * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
+ *
+ * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
*/
/* This file contains functions for compiling a regular expression. See
| PERL_SCAN_DISALLOW_PREFIX
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
UV cp;
- char string;
len = (STRLEN)(endbrace - name - 2);
cp = grok_hex(name + 2, &len, &fl, NULL);
if ( len != (STRLEN)(endbrace - name - 2) ) {
cp = 0xFFFD;
}
- if (cp > 0xff)
- RExC_utf8 = 1;
if ( valuep ) {
+ if (cp > 0xff) RExC_utf8 = 1;
*valuep = cp;
return NULL;
}
- string = (char)cp;
- sv_str= newSVpvn(&string, 1);
+
+ /* Need to convert to utf8 if either: won't fit into a byte, or the re
+ * is going to be in utf8 and the representation changes under utf8. */
+ if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
+ U8 string[UTF8_MAXBYTES+1];
+ U8 *tmps;
+ RExC_utf8 = 1;
+ tmps = uvuni_to_utf8(string, cp);
+ sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
+ } else { /* Otherwise, no need for utf8, can skip that step */
+ char string;
+ string = (char)cp;
+ sv_str= newSVpvn(&string, 1);
+ }
} else {
/* fetch the charnames handler for this scope */
HV * const table = GvHV(PL_hintgv);
Set_Node_Cur_Length(ret); /* MJD */
RExC_parse--;
nextchar(pRExC_state);
- } else {
+ } else { /* zero length */
ret = reg_node(pRExC_state,NOTHING);
}
if (!cached) {
what = WORD; \
break
+/*
+ We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+ so that it is possible to override the option here without having to
+ rebuild the entire core. as we are required to do if we change regcomp.h
+ which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
+#else
+#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
+#endif
+
/*
parse a class specification and produce either an ANYOF node that
matches the pattern or if the pattern matches a single char only and
* A similar issue a little earlier when switching on value.
* --jhi */
switch ((I32)namedclass) {
+
+ case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
+ case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
+ case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
+ case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
+ case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
+ case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
+ case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
+ case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
+ case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
+ case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
case _C_C_T_(ALNUM, isALNUM(value), "Word");
- case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
- case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
- case _C_C_T_(BLANK, isBLANK(value), "Blank");
- case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
- case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
- case _C_C_T_(LOWER, isLOWER(value), "Lower");
- case _C_C_T_(PRINT, isPRINT(value), "Print");
- case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
- case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
- case _C_C_T_(UPPER, isUPPER(value), "Upper");
+#else
+ case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
+ case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+#endif
case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
ANYOF_BITMAP_SET(ret, value);
}
yesno = '+';
- what = "Digit";
+ what = POSIX_CC_UNI_NAME("Digit");
break;
case ANYOF_NDIGIT:
if (LOC)
ANYOF_BITMAP_SET(ret, value);
}
yesno = '!';
- what = "Digit";
+ what = POSIX_CC_UNI_NAME("Digit");
break;
case ANYOF_MAX:
/* this is to handle \p and \P */
/*
- regprop - printable representation of opcode
*/
+#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
+STMT_START { \
+ if (do_sep) { \
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
+ if (flags & ANYOF_INVERT) \
+ /*make sure the invert info is in each */ \
+ sv_catpvs(sv, "^"); \
+ do_sep = 0; \
+ } \
+} STMT_END
+
void
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
{
else if (k == ANYOF) {
int i, rangestart = -1;
const U8 flags = ANYOF_FLAGS(o);
+ int do_sep = 0;
/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
static const char * const anyofs[] = {
"[:^alpha:]",
"[:ascii:]",
"[:^ascii:]",
- "[:ctrl:]",
- "[:^ctrl:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
"[:graph:]",
"[:^graph:]",
"[:lower:]",
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
sv_catpvs(sv, "^");
+
+ /* output what the standard cp 0-255 bitmap matches */
for (i = 0; i <= 256; i++) {
if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
if (rangestart == -1)
sv_catpvs(sv, "-");
put_byte(sv, i - 1);
}
+ do_sep = 1;
rangestart = -1;
}
}
-
+
+ EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+ /* output any special charclass tests (used mostly under use locale) */
if (o->flags & ANYOF_CLASS)
for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
- if (ANYOF_CLASS_TEST(o,i))
+ if (ANYOF_CLASS_TEST(o,i)) {
sv_catpv(sv, anyofs[i]);
-
+ do_sep = 1;
+ }
+
+ EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+
+ /* output information about the unicode matching */
if (flags & ANYOF_UNICODE)
sv_catpvs(sv, "{unicode}");
else if (flags & ANYOF_UNICODE_ALL)
if (lv) {
if (sw) {
U8 s[UTF8_MAXBYTES_CASE+1];
-
+
for (i = 0; i <= 256; i++) { /* just the first 256 */
uvchr_to_utf8(s, i);