Polymorphic regexps.
Jarkko Hietaniemi [Sun, 17 Dec 2000 05:31:37 +0000 (05:31 +0000)]
Fixes at least the bugs 20001028.003 (both of them...) and
20001108.001.  The bugs 20001114.001 and 20001205.014 seem
also to be fixed by now, probably already before this patch.

p4raw-id: //depot/perl@8143

15 files changed:
embed.h
embed.pl
mg.c
objXSUB.h
pp_ctl.c
pp_hot.c
proto.h
regcomp.c
regcomp.h
regcomp.sym
regexec.c
regnodes.h
sv.c
t/op/utf8decode.t
t/pragma/utf8.t

diff --git a/embed.h b/embed.h
index 64c1eaf..3b54154 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define ref                    Perl_ref
 #define refkids                        Perl_refkids
 #define regdump                        Perl_regdump
+#define regclass_swash         Perl_regclass_swash
 #define pregexec               Perl_pregexec
 #define pregfree               Perl_pregfree
 #define pregcomp               Perl_pregcomp
 #define regbranch              S_regbranch
 #define reguni                 S_reguni
 #define regclass               S_regclass
-#define regclassutf8           S_regclassutf8
 #define regcurly               S_regcurly
 #define reg_node               S_reg_node
 #define regpiece               S_regpiece
 #define regrepeat_hard         S_regrepeat_hard
 #define regtry                 S_regtry
 #define reginclass             S_reginclass
-#define reginclassutf8         S_reginclassutf8
 #define regcppush              S_regcppush
 #define regcppop               S_regcppop
 #define regcp_set_to           S_regcp_set_to
 #define ref(a,b)               Perl_ref(aTHX_ a,b)
 #define refkids(a,b)           Perl_refkids(aTHX_ a,b)
 #define regdump(a)             Perl_regdump(aTHX_ a)
+#define regclass_swash(a,b,c)  Perl_regclass_swash(aTHX_ a,b,c)
 #define pregexec(a,b,c,d,e,f,g)        Perl_pregexec(aTHX_ a,b,c,d,e,f,g)
 #define pregfree(a)            Perl_pregfree(aTHX_ a)
 #define pregcomp(a,b,c)                Perl_pregcomp(aTHX_ a,b,c)
 #define regbranch(a,b,c)       S_regbranch(aTHX_ a,b,c)
 #define reguni(a,b,c,d)                S_reguni(aTHX_ a,b,c,d)
 #define regclass(a)            S_regclass(aTHX_ a)
-#define regclassutf8(a)                S_regclassutf8(aTHX_ a)
 #define regcurly(a)            S_regcurly(aTHX_ a)
 #define reg_node(a,b)          S_reg_node(aTHX_ a,b)
 #define regpiece(a,b)          S_regpiece(aTHX_ a,b)
 #define regrepeat(a,b)         S_regrepeat(aTHX_ a,b)
 #define regrepeat_hard(a,b,c)  S_regrepeat_hard(aTHX_ a,b,c)
 #define regtry(a,b)            S_regtry(aTHX_ a,b)
-#define reginclass(a,b)                S_reginclass(aTHX_ a,b)
-#define reginclassutf8(a,b)    S_reginclassutf8(aTHX_ a,b)
+#define reginclass(a,b,c)      S_reginclass(aTHX_ a,b,c)
 #define regcppush(a)           S_regcppush(aTHX_ a)
 #define regcppop()             S_regcppop(aTHX)
 #define regcp_set_to(a)                S_regcp_set_to(aTHX_ a)
 #define refkids                        Perl_refkids
 #define Perl_regdump           CPerlObj::Perl_regdump
 #define regdump                        Perl_regdump
+#define Perl_regclass_swash    CPerlObj::Perl_regclass_swash
+#define regclass_swash         Perl_regclass_swash
 #define Perl_pregexec          CPerlObj::Perl_pregexec
 #define pregexec               Perl_pregexec
 #define Perl_pregfree          CPerlObj::Perl_pregfree
 #define reguni                 S_reguni
 #define S_regclass             CPerlObj::S_regclass
 #define regclass               S_regclass
-#define S_regclassutf8         CPerlObj::S_regclassutf8
-#define regclassutf8           S_regclassutf8
 #define S_regcurly             CPerlObj::S_regcurly
 #define regcurly               S_regcurly
 #define S_reg_node             CPerlObj::S_reg_node
 #define regtry                 S_regtry
 #define S_reginclass           CPerlObj::S_reginclass
 #define reginclass             S_reginclass
-#define S_reginclassutf8       CPerlObj::S_reginclassutf8
-#define reginclassutf8         S_reginclassutf8
 #define S_regcppush            CPerlObj::S_regcppush
 #define regcppush              S_regcppush
 #define S_regcppop             CPerlObj::S_regcppop
index 9e2bd9c..32f3ddc 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -1873,6 +1873,7 @@ Ap        |void   |push_scope
 p      |OP*    |ref            |OP* o|I32 type
 p      |OP*    |refkids        |OP* o|I32 type
 Ap     |void   |regdump        |regexp* r
+Ap     |SV*    |regclass_swash |struct regnode *n|bool doinit|SV **initsvp
 Ap     |I32    |pregexec       |regexp* prog|char* stringarg \
                                |char* strend|char* strbeg|I32 minend \
                                |SV* screamer|U32 nosave
@@ -2366,7 +2367,6 @@ s |regnode*|regatom       |struct RExC_state_t*|I32 *
 s      |regnode*|regbranch     |struct RExC_state_t*|I32 *|I32
 s      |void   |reguni         |struct RExC_state_t*|UV|char *|STRLEN*
 s      |regnode*|regclass      |struct RExC_state_t*
-s      |regnode*|regclassutf8  |struct RExC_state_t*
 s      |I32    |regcurly       |char *
 s      |regnode*|reg_node      |struct RExC_state_t*|U8
 s      |regnode*|regpiece      |struct RExC_state_t*|I32 *
@@ -2401,8 +2401,7 @@ s |I32    |regmatch       |regnode *prog
 s      |I32    |regrepeat      |regnode *p|I32 max
 s      |I32    |regrepeat_hard |regnode *p|I32 max|I32 *lp
 s      |I32    |regtry         |regexp *prog|char *startpos
-s      |bool   |reginclass     |regnode *p|I32 c
-s      |bool   |reginclassutf8 |regnode *f|U8* p
+s      |bool   |reginclass     |regnode *n|U8 *p|bool do_utf8sv_is_utf8
 s      |CHECKPOINT|regcppush   |I32 parenfloor
 s      |char*|regcppop
 s      |char*|regcp_set_to     |I32 ss
diff --git a/mg.c b/mg.c
index f97c6ce..a61d167 100644 (file)
--- a/mg.c
+++ b/mg.c
@@ -391,7 +391,7 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg)
     case '5': case '6': case '7': case '8': case '9': case '&':
        if (PL_curpm && (rx = PL_curpm->op_pmregexp)) {
 
-           paren = atoi(mg->mg_ptr);
+           paren = atoi(mg->mg_ptr); /* $& is in [0] */
          getparen:
            if (paren <= rx->nparens &&
                (s1 = rx->startp[paren]) != -1 &&
@@ -399,17 +399,15 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg)
            {
                i = t1 - s1;
              getlen:
-               if (i > 0 && (PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) {
-                   char *s = rx->subbeg + s1;
+               if (i > 0 && DO_UTF8(PL_reg_sv)) {
+                   char *s    = rx->subbeg + s1;
                    char *send = rx->subbeg + t1;
-                   i = 0;
-                   while (s < send) {
-                       s += UTF8SKIP(s);
-                       i++;
-                   }
+
+                   i = Perl_utf8_length((U8*)s, (U8*)send);
                }
-               if (i >= 0)
-                   return i;
+               if (i < 0)
+                   Perl_croak(aTHX_ "panic: magic_len: %d", i);
+               return i;
            }
        }
        return 0;
@@ -604,7 +602,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
             * Pre-threads, this was paren = atoi(GvENAME((GV*)mg->mg_obj));
             * XXX Does the new way break anything?
             */
-           paren = atoi(mg->mg_ptr);
+           paren = atoi(mg->mg_ptr); /* $& is in [0] */
          getparen:
            if (paren <= rx->nparens &&
                (s1 = rx->startp[paren]) != -1 &&
@@ -623,7 +621,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
                        PL_tainted = FALSE;
                    }
                    sv_setpvn(sv, s, i);
-                   if ((PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE)
+                   if (DO_UTF8(PL_reg_sv))
                        SvUTF8_on(sv);
                    else
                        SvUTF8_off(sv);
index 43537d3..60c6e90 100644 (file)
--- a/objXSUB.h
+++ b/objXSUB.h
 #define Perl_regdump           pPerl->Perl_regdump
 #undef  regdump
 #define regdump                        Perl_regdump
+#undef  Perl_regclass_swash
+#define Perl_regclass_swash    pPerl->Perl_regclass_swash
+#undef  regclass_swash
+#define regclass_swash         Perl_regclass_swash
 #undef  Perl_pregexec
 #define Perl_pregexec          pPerl->Perl_pregexec
 #undef  pregexec
index d079e4a..aff5815 100644 (file)
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -157,7 +157,7 @@ PP(pp_substcont)
     register char *m = cx->sb_m;
     char *orig = cx->sb_orig;
     register REGEXP *rx = cx->sb_rx;
-
+    
     rxres_restore(&cx->sb_rxres, rx);
 
     if (cx->sb_iters++) {
@@ -176,8 +176,8 @@ PP(pp_substcont)
                                      : (REXEC_COPY_STR|REXEC_IGNOREPOS|REXEC_NOT_FIRST))))
        {
            SV *targ = cx->sb_targ;
-           sv_catpvn(dstr, s, cx->sb_strend - s);
 
+           sv_catpvn(dstr, s, cx->sb_strend - s);
            cx->sb_rxtainted |= RX_MATCH_TAINTED(rx);
 
            (void)SvOOK_off(targ);
@@ -189,9 +189,11 @@ PP(pp_substcont)
            sv_free(dstr);
 
            TAINT_IF(cx->sb_rxtainted & 1);
+           if (pm->op_pmdynflags & PMdf_UTF8)
+               SvUTF8_on(targ); /* could also copy SvUTF8(dstr)? */
            PUSHs(sv_2mortal(newSViv((I32)cx->sb_iters - 1)));
 
-           (void)SvPOK_only(targ);
+           (void)SvPOK_only_UTF8(targ);
            TAINT_IF(cx->sb_rxtainted);
            SvSETMAGIC(targ);
            SvTAINT(targ);
@@ -209,7 +211,8 @@ PP(pp_substcont)
        cx->sb_strend = s + (cx->sb_strend - m);
     }
     cx->sb_m = m = rx->startp[0] + orig;
-    sv_catpvn(dstr, s, m-s);
+    if (m > s)
+       sv_catpvn(dstr, s, m-s);
     cx->sb_s = rx->endp[0] + orig;
     { /* Update the pos() information. */
        SV *sv = cx->sb_targ;
index 6a5b96f..2904d9f 100644 (file)
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -1179,6 +1179,7 @@ PP(pp_match)
        TARG = DEFSV;
        EXTEND(SP,1);
     }
+    PL_reg_sv = TARG;
     PUTBACK;                           /* EVAL blocks need stack_sp. */
     s = SvPV(TARG, len);
     strend = s + len;
@@ -1268,27 +1269,25 @@ play_it_again:
        RX_MATCH_TAINTED_on(rx);
     TAINT_IF(RX_MATCH_TAINTED(rx));
     if (gimme == G_ARRAY) {
-       I32 iters, i, len;
+       I32 nparens, i, len;
 
-       iters = rx->nparens;
-       if (global && !iters)
+       nparens = rx->nparens;
+       if (global && !nparens)
            i = 1;
        else
            i = 0;
        SPAGAIN;                        /* EVAL blocks could move the stack. */
-       EXTEND(SP, iters + i);
-       EXTEND_MORTAL(iters + i);
-       for (i = !i; i <= iters; i++) {
+       EXTEND(SP, nparens + i);
+       EXTEND_MORTAL(nparens + i);
+       for (i = !i; i <= nparens; i++) {
            PUSHs(sv_newmortal());
            /*SUPPRESS 560*/
            if ((rx->startp[i] != -1) && rx->endp[i] != -1 ) {
                len = rx->endp[i] - rx->startp[i];
                s = rx->startp[i] + truebase;
                sv_setpvn(*SP, s, len);
-               if ((pm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) {
+               if (DO_UTF8(TARG))
                    SvUTF8_on(*SP);
-                   sv_utf8_downgrade(*SP, TRUE);
-               }
            }
        }
        if (global) {
@@ -1298,7 +1297,7 @@ play_it_again:
            r_flags |= REXEC_IGNOREPOS | REXEC_NOT_FIRST;
            goto play_it_again;
        }
-       else if (!iters)
+       else if (!nparens)
            XPUSHs(&PL_sv_yes);
        LEAVE_SCOPE(oldsave);
        RETURN;
@@ -1831,6 +1830,7 @@ PP(pp_subst)
        TARG = DEFSV;
        EXTEND(SP,1);
     }
+    PL_reg_sv = TARG;
     if (SvFAKE(TARG) && SvREADONLY(TARG))
        sv_force_normal(TARG);
     if (SvREADONLY(TARG)
@@ -1847,7 +1847,7 @@ PP(pp_subst)
     if (PL_tainted)
        rxtainted |= 2;
     TAINT_NOT;
-
+    
   force_it:
     if (!pm || !s)
        DIE(aTHX_ "panic: do_subst");
@@ -2004,6 +2004,8 @@ PP(pp_subst)
        rxtainted |= RX_MATCH_TAINTED(rx);
        dstr = NEWSV(25, len);
        sv_setpvn(dstr, m, s-m);
+       if (DO_UTF8(TARG))
+           SvUTF8_on(dstr);
        PL_curpm = pm;
        if (!c) {
            register PERL_CONTEXT *cx;
@@ -2030,7 +2032,8 @@ PP(pp_subst)
                sv_catpvn(dstr, c, clen);
            if (once)
                break;
-       } while (CALLREGEXEC(aTHX_ rx, s, strend, orig, s == m, TARG, NULL, r_flags));
+       } while (CALLREGEXEC(aTHX_ rx, s, strend, orig, s == m,
+                            TARG, NULL, r_flags));
        sv_catpvn(dstr, s, strend - s);
 
        (void)SvOOK_off(TARG);
diff --git a/proto.h b/proto.h
index 4fc260e..1bcb5cd 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -616,6 +616,7 @@ PERL_CALLCONV void  Perl_push_scope(pTHX);
 PERL_CALLCONV OP*      Perl_ref(pTHX_ OP* o, I32 type);
 PERL_CALLCONV OP*      Perl_refkids(pTHX_ OP* o, I32 type);
 PERL_CALLCONV void     Perl_regdump(pTHX_ regexp* r);
+PERL_CALLCONV SV*      Perl_regclass_swash(pTHX_ struct regnode *n, bool doinit, SV **initsvp);
 PERL_CALLCONV I32      Perl_pregexec(pTHX_ regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, U32 nosave);
 PERL_CALLCONV void     Perl_pregfree(pTHX_ struct regexp* r);
 PERL_CALLCONV regexp*  Perl_pregcomp(pTHX_ char* exp, char* xend, PMOP* pm);
@@ -1111,7 +1112,6 @@ STATIC regnode*   S_regatom(pTHX_ struct RExC_state_t*, I32 *);
 STATIC regnode*        S_regbranch(pTHX_ struct RExC_state_t*, I32 *, I32);
 STATIC void    S_reguni(pTHX_ struct RExC_state_t*, UV, char *, STRLEN*);
 STATIC regnode*        S_regclass(pTHX_ struct RExC_state_t*);
-STATIC regnode*        S_regclassutf8(pTHX_ struct RExC_state_t*);
 STATIC I32     S_regcurly(pTHX_ char *);
 STATIC regnode*        S_reg_node(pTHX_ struct RExC_state_t*, U8);
 STATIC regnode*        S_regpiece(pTHX_ struct RExC_state_t*, I32 *);
@@ -1141,8 +1141,7 @@ STATIC I32        S_regmatch(pTHX_ regnode *prog);
 STATIC I32     S_regrepeat(pTHX_ regnode *p, I32 max);
 STATIC I32     S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp);
 STATIC I32     S_regtry(pTHX_ regexp *prog, char *startpos);
-STATIC bool    S_reginclass(pTHX_ regnode *p, I32 c);
-STATIC bool    S_reginclassutf8(pTHX_ regnode *f, U8* p);
+STATIC bool    S_reginclass(pTHX_ regnode *n, U8 *p, bool do_utf8sv_is_utf8);
 STATIC CHECKPOINT      S_regcppush(pTHX_ I32 parenfloor);
 STATIC char*   S_regcppop(pTHX);
 STATIC char*   S_regcp_set_to(pTHX_ I32 ss);
index aae2ced..69a9f91 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -118,7 +118,7 @@ typedef struct RExC_state_t {
     char       *end;                   /* End of input for compile */
     char       *parse;                 /* Input-scan pointer. */
     I32                whilem_seen;            /* number of WHILEM in this expr */
-    regnode    *emit;                  /* Code-emit pointer; &regdummy = don't */
+    regnode    *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
     I32                naughty;                /* How bad is this pattern? */
     I32                sawback;                /* Did we see \1, ...? */
     U32                seen;
@@ -234,8 +234,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 #define LOC (RExC_flags16 & PMf_LOCALE)
 #define FOLD (RExC_flags16 & PMf_FOLD)
 
-#define OOB_CHAR8              1234
-#define OOB_UTF8               123456
+#define OOB_UNICODE            12345678
 #define OOB_NAMEDCLASS         -1
 
 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
@@ -1196,7 +1195,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                break;
            }
        }
-       else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) {
+       else if (strchr((char*)PL_simple,OP(scan))) {
            int value;
 
            if (flags & SCF_DO_SUBSTR) {
@@ -1210,20 +1209,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                /* Some of the logic below assumes that switching
                   locale on will only add false positives. */
                switch (PL_regkind[(U8)OP(scan)]) {
-               case ANYUTF8:
                case SANY:
-               case SANYUTF8:
-               case ALNUMUTF8:
-               case ANYOFUTF8:
-               case ALNUMLUTF8:
-               case NALNUMUTF8:
-               case NALNUMLUTF8:
-               case SPACEUTF8:
-               case NSPACEUTF8:
-               case SPACELUTF8:
-               case NSPACELUTF8:
-               case DIGITUTF8:
-               case NDIGITUTF8:
                default:
                  do_default:
                    /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
@@ -1750,7 +1736,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
            /* turn .* into ^.* with an implied $*=1 */
            int type = OP(NEXTOPER(first));
 
-           if (type == REG_ANY || type == ANYUTF8)
+           if (type == REG_ANY)
                type = ROPT_ANCH_MBOL;
            else
                type = ROPT_ANCH_SBOL;
@@ -1850,8 +1836,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
            longest_fixed_length = 0;
        }
        if (r->regstclass 
-           && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8
-               || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY))
+           && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
            r->regstclass = NULL;
        if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
            && !(data.start_class->flags & ANYOF_EOS)
@@ -1866,6 +1851,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
                       struct regnode_charclass_class);
            r->regstclass = (regnode*)RExC_rx->data->data[n];
            r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
+           PL_regdata = r->data; /* for regprop() */
            DEBUG_r((sv = sv_newmortal(),
                     regprop(sv, (regnode*)data.start_class),
                     PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
@@ -1933,7 +1919,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
        r->reganch |= ROPT_EVAL_SEEN;
     Newz(1002, r->startp, RExC_npar, I32);
     Newz(1002, r->endp, RExC_npar, I32);
-    PL_regdata = r->data; /* for regprop() ANYOFUTF8 */
+    PL_regdata = r->data; /* for regprop() */
     DEBUG_r(regdump(r));
     return(r);
 }
@@ -2556,26 +2542,17 @@ tryagain:
        break;
     case '.':
        nextchar(pRExC_state);
-       if (UTF) {
-           if (RExC_flags16 & PMf_SINGLELINE)
-               ret = reg_node(pRExC_state, SANYUTF8);
-           else
-               ret = reg_node(pRExC_state, ANYUTF8);
-           *flagp |= HASWIDTH;
-       }
-       else {
-           if (RExC_flags16 & PMf_SINGLELINE)
-               ret = reg_node(pRExC_state, SANY);
-           else
-               ret = reg_node(pRExC_state, REG_ANY);
-           *flagp |= HASWIDTH|SIMPLE;
-       }
+       if (RExC_flags16 & PMf_SINGLELINE)
+           ret = reg_node(pRExC_state, SANY);
+       else
+           ret = reg_node(pRExC_state, REG_ANY);
+       *flagp |= HASWIDTH|SIMPLE;
        RExC_naughty++;
        break;
     case '[':
     {
        char *oregcomp_parse = ++RExC_parse;
-       ret = (UTF ? regclassutf8(pRExC_state) : regclass(pRExC_state));
+       ret = regclass(pRExC_state);
        if (*RExC_parse != ']') {
            RExC_parse = oregcomp_parse;
            vFAIL("Unmatched [");
@@ -2659,20 +2636,14 @@ tryagain:
                is_utf8_mark((U8*)"~");         /* preload table */
            break;
        case 'w':
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? ALNUMLUTF8 : ALNUMUTF8)
-                   : (LOC ? ALNUML     : ALNUM));
+           ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
            *flagp |= HASWIDTH|SIMPLE;
            nextchar(pRExC_state);
            if (UTF && !PL_utf8_alnum)
                is_utf8_alnum((U8*)"a");        /* preload table */
            break;
        case 'W':
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? NALNUMLUTF8 : NALNUMUTF8)
-                   : (LOC ? NALNUML     : NALNUM));
+           ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
            *flagp |= HASWIDTH|SIMPLE;
            nextchar(pRExC_state);
            if (UTF && !PL_utf8_alnum)
@@ -2681,10 +2652,7 @@ tryagain:
        case 'b':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? BOUNDLUTF8 : BOUNDUTF8)
-                   : (LOC ? BOUNDL     : BOUND));
+           ret = reg_node(pRExC_state, LOC ? BOUNDL     : BOUND);
            *flagp |= SIMPLE;
            nextchar(pRExC_state);
            if (UTF && !PL_utf8_alnum)
@@ -2693,44 +2661,35 @@ tryagain:
        case 'B':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8)
-                   : (LOC ? NBOUNDL     : NBOUND));
+           ret = reg_node(pRExC_state, LOC ? NBOUNDL     : NBOUND);
            *flagp |= SIMPLE;
            nextchar(pRExC_state);
            if (UTF && !PL_utf8_alnum)
                is_utf8_alnum((U8*)"a");        /* preload table */
            break;
        case 's':
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? SPACELUTF8 : SPACEUTF8)
-                   : (LOC ? SPACEL     : SPACE));
+           ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
            *flagp |= HASWIDTH|SIMPLE;
            nextchar(pRExC_state);
            if (UTF && !PL_utf8_space)
                is_utf8_space((U8*)" ");        /* preload table */
            break;
        case 'S':
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? NSPACELUTF8 : NSPACEUTF8)
-                   : (LOC ? NSPACEL     : NSPACE));
+           ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
            *flagp |= HASWIDTH|SIMPLE;
            nextchar(pRExC_state);
            if (UTF && !PL_utf8_space)
                is_utf8_space((U8*)" ");        /* preload table */
            break;
        case 'd':
-           ret = reg_node(pRExC_state, UTF ? DIGITUTF8 : DIGIT);
+           ret = reg_node(pRExC_state, DIGIT);
            *flagp |= HASWIDTH|SIMPLE;
            nextchar(pRExC_state);
            if (UTF && !PL_utf8_digit)
                is_utf8_digit((U8*)"1");        /* preload table */
            break;
        case 'D':
-           ret = reg_node(pRExC_state, UTF ? NDIGITUTF8 : NDIGIT);
+           ret = reg_node(pRExC_state, NDIGIT);
            *flagp |= HASWIDTH|SIMPLE;
            nextchar(pRExC_state);
            if (UTF && !PL_utf8_digit)
@@ -2754,7 +2713,7 @@ tryagain:
                    RExC_end = RExC_parse + 2;
                RExC_parse--;
 
-               ret = regclassutf8(pRExC_state);
+               ret = regclass(pRExC_state);
 
                RExC_end = oldregxend;
                RExC_parse--;
@@ -3194,58 +3153,108 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 STATIC regnode *
 S_regclass(pTHX_ RExC_state_t *pRExC_state)
 {
-    register U32 value;
-    register I32 lastvalue = OOB_CHAR8;
-    register I32 range = 0;
+    register UV value;
+    register IV lastvalue = OOB_UNICODE;
+    register IV range = 0;
     register regnode *ret;
     STRLEN numlen;
-    I32 namedclass;
+    IV namedclass;
     char *rangebegin;
     bool need_class = 0;
+    SV *listsv;
+    register char *e;
+    UV n;
+
+    ret = reganode(pRExC_state, ANYOF, 0);
+
+    if (!SIZE_ONLY)
+       ANYOF_FLAGS(ret) = 0;
+
+    if (*RExC_parse == '^') {  /* Complement of range. */
+       RExC_naughty++;
+       RExC_parse++;
+       if (!SIZE_ONLY)
+           ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+    }
 
-    ret = reg_node(pRExC_state, ANYOF);
     if (SIZE_ONLY)
        RExC_size += ANYOF_SKIP;
     else {
-       ret->flags = 0;
-       ANYOF_BITMAP_ZERO(ret);
        RExC_emit += ANYOF_SKIP;
        if (FOLD)
            ANYOF_FLAGS(ret) |= ANYOF_FOLD;
        if (LOC)
            ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
-    }
-    if (*RExC_parse == '^') {  /* Complement of range. */
-       RExC_naughty++;
-       RExC_parse++;
-       if (!SIZE_ONLY)
-           ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+       ANYOF_BITMAP_ZERO(ret);
+       listsv = newSVpvn("# comment\n", 10);
     }
 
     if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
        checkposixcc(pRExC_state);
 
     if (*RExC_parse == ']' || *RExC_parse == '-')
-       goto skipcond;          /* allow 1st char to be ] or - */
+       goto charclassloop;             /* allow 1st char to be ] or - */
+
     while (RExC_parse < RExC_end && *RExC_parse != ']') {
-       skipcond:
-       namedclass = OOB_NAMEDCLASS;
+
+    charclassloop:
+
+       namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
+
        if (!range)
            rangebegin = RExC_parse;
-       value = UCHARAT(RExC_parse++);
+       if (UTF) {
+           value = utf8_to_uv((U8*)RExC_parse,
+                              RExC_end - RExC_parse,
+                              &numlen, 0);
+           RExC_parse += numlen;
+       }
+       else
+           value = UCHARAT(RExC_parse++);
        if (value == '[')
            namedclass = regpposixcc(pRExC_state, value);
        else if (value == '\\') {
-           value = UCHARAT(RExC_parse++);
+           if (UTF) {
+               value = utf8_to_uv((U8*)RExC_parse,
+                                  RExC_end - RExC_parse,
+                                  &numlen, 0);
+               RExC_parse += numlen;
+           }
+           else
+               value = UCHARAT(RExC_parse++);
            /* Some compilers cannot handle switching on 64-bit integer
-            * values, therefore the 'value' cannot be an UV. --jhi */
-           switch (value) {
+            * values, therefore value cannot be an UV.  Yes, this will
+            * be a problem later if we want switch on Unicode.  --jhi */
+           switch ((I32)value) {
            case 'w':   namedclass = ANYOF_ALNUM;       break;
            case 'W':   namedclass = ANYOF_NALNUM;      break;
            case 's':   namedclass = ANYOF_SPACE;       break;
            case 'S':   namedclass = ANYOF_NSPACE;      break;
            case 'd':   namedclass = ANYOF_DIGIT;       break;
            case 'D':   namedclass = ANYOF_NDIGIT;      break;
+           case 'p':
+           case 'P':
+               if (*RExC_parse == '{') {
+                   e = strchr(RExC_parse++, '}');
+                    if (!e)
+                        vFAIL("Missing right brace on \\p{}");
+                   n = e - RExC_parse;
+               }
+               else {
+                   e = RExC_parse;
+                   n = 1;
+               }
+               if (!SIZE_ONLY) {
+                   if (value == 'p')
+                       Perl_sv_catpvf(aTHX_ listsv,
+                                      "+utf8::%.*s\n", (int)n, RExC_parse);
+                   else
+                       Perl_sv_catpvf(aTHX_ listsv,
+                                      "!utf8::%.*s\n", (int)n, RExC_parse);
+               }
+               RExC_parse = e + 1;
+               ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+               continue;
            case 'n':   value = '\n';                   break;
            case 'r':   value = '\r';                   break;
            case 't':   value = '\t';                   break;
@@ -3259,9 +3268,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
            case 'a':   value = '\057';                 break;
 #endif
            case 'x':
-               numlen = 0;             /* disallow underscores */
-               value = (UV)scan_hex(RExC_parse, 2, &numlen);
-               RExC_parse += numlen;
+               if (*RExC_parse == '{') {
+                   e = strchr(RExC_parse++, '}');
+                    if (!e) 
+                        vFAIL("Missing right brace on \\x{}");
+                   numlen = 1;         /* allow underscores */
+                   value = (UV)scan_hex(RExC_parse,
+                                        e - RExC_parse,
+                                        &numlen);
+                   RExC_parse = e + 1;
+               }
+               else {
+                   numlen = 0;         /* disallow underscores */
+                   value = (UV)scan_hex(RExC_parse, 2, &numlen);
+                   RExC_parse += numlen;
+               }
                break;
            case 'c':
                value = UCHARAT(RExC_parse++);
@@ -3275,16 +3296,22 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                break;
            default:
                if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
-
-                   vWARN2(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value);
+                   vWARN2(RExC_parse,
+                          "Unrecognized escape \\%c in character class passed through",
+                          (int)value);
                break;
            }
-       }
-       if (namedclass > OOB_NAMEDCLASS) {
-           if (!need_class && !SIZE_ONLY)
+       } /* end of \blah */
+
+       if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
+
+           if (!SIZE_ONLY && !need_class)
                ANYOF_CLASS_ZERO(ret);
+
            need_class = 1;
-           if (range) { /* a-\d, a-[:digit:] */
+
+           /* a bad range like a-\d, a-[:digit:] ? */
+           if (range) {
                if (!SIZE_ONLY) {
                    if (ckWARN(WARN_REGEXP))
                        vWARN4(RExC_parse,
@@ -3292,11 +3319,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                               RExC_parse - rangebegin,
                               RExC_parse - rangebegin,
                               rangebegin);
-                   ANYOF_BITMAP_SET(ret, lastvalue);
-                   ANYOF_BITMAP_SET(ret, '-');
+                   if (lastvalue < 256) {
+                       ANYOF_BITMAP_SET(ret, lastvalue);
+                       ANYOF_BITMAP_SET(ret, '-');
+                   }
+                   else {
+                       ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+                       Perl_sv_catpvf(aTHX_ listsv,
+                                      /* 0x002D is Unicode for '-' */
+                                      "%04"UVxf"\n002D\n", (UV)lastvalue);
+                   }
                }
-               range = 0; /* this is not a true range */
+
+               range = 0; /* this was not a true range */
            }
+
            if (!SIZE_ONLY) {
                switch (namedclass) {
                case ANYOF_ALNUM:
@@ -3307,6 +3344,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isALNUM(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
                    break;
                case ANYOF_NALNUM:
                    if (LOC)
@@ -3316,42 +3354,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isALNUM(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
                    break;
-               case ANYOF_SPACE:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_SPACE);
-                   else {
-                       for (value = 0; value < 256; value++)
-                           if (isSPACE(value))
-                               ANYOF_BITMAP_SET(ret, value);
-                   }
-                   break;
-               case ANYOF_NSPACE:
+               case ANYOF_ALNUMC:
                    if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+                       ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
                    else {
                        for (value = 0; value < 256; value++)
-                           if (!isSPACE(value))
+                           if (isALNUMC(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
-                   break;
-               case ANYOF_DIGIT:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
-                   else {
-                       for (value = '0'; value <= '9'; value++)
-                           ANYOF_BITMAP_SET(ret, value);
-                   }
-                   break;
-               case ANYOF_NDIGIT:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
-                   else {
-                       for (value = 0; value < '0'; value++)
-                           ANYOF_BITMAP_SET(ret, value);
-                       for (value = '9' + 1; value < 256; value++)
-                           ANYOF_BITMAP_SET(ret, value);
-                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
                    break;
                case ANYOF_NALNUMC:
                    if (LOC)
@@ -3361,15 +3374,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isALNUMC(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
-                   break;
-               case ANYOF_ALNUMC:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
-                   else {
-                       for (value = 0; value < 256; value++)
-                           if (isALNUMC(value))
-                               ANYOF_BITMAP_SET(ret, value);
-                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
                    break;
                case ANYOF_ALPHA:
                    if (LOC)
@@ -3379,6 +3384,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isALPHA(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
                    break;
                case ANYOF_NALPHA:
                    if (LOC)
@@ -3388,6 +3394,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isALPHA(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
                    break;
                case ANYOF_ASCII:
                    if (LOC)
@@ -3402,6 +3409,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                                ANYOF_BITMAP_SET(ret, value);
 #endif /* EBCDIC */
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
                    break;
                case ANYOF_NASCII:
                    if (LOC)
@@ -3416,6 +3424,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                                ANYOF_BITMAP_SET(ret, value);
 #endif /* EBCDIC */
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
                    break;
                case ANYOF_BLANK:
                    if (LOC)
@@ -3425,6 +3434,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isBLANK(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
                    break;
                case ANYOF_NBLANK:
                    if (LOC)
@@ -3434,6 +3444,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isBLANK(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
                    break;
                case ANYOF_CNTRL:
                    if (LOC)
@@ -3443,7 +3454,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isCNTRL(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
-                   lastvalue = OOB_CHAR8;
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
                    break;
                case ANYOF_NCNTRL:
                    if (LOC)
@@ -3453,6 +3464,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isCNTRL(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
+                   break;
+               case ANYOF_DIGIT:
+                   if (LOC)
+                       ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
+                   else {
+                       /* consecutive digits assumed */
+                       for (value = '0'; value <= '9'; value++)
+                           ANYOF_BITMAP_SET(ret, value);
+                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
+                   break;
+               case ANYOF_NDIGIT:
+                   if (LOC)
+                       ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
+                   else {
+                       /* consecutive digits assumed */
+                       for (value = 0; value < '0'; value++)
+                           ANYOF_BITMAP_SET(ret, value);
+                       for (value = '9' + 1; value < 256; value++)
+                           ANYOF_BITMAP_SET(ret, value);
+                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
                    break;
                case ANYOF_GRAPH:
                    if (LOC)
@@ -3462,6 +3496,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isGRAPH(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
                    break;
                case ANYOF_NGRAPH:
                    if (LOC)
@@ -3471,6 +3506,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isGRAPH(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
                    break;
                case ANYOF_LOWER:
                    if (LOC)
@@ -3480,6 +3516,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isLOWER(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
                    break;
                case ANYOF_NLOWER:
                    if (LOC)
@@ -3489,6 +3526,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isLOWER(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
                    break;
                case ANYOF_PRINT:
                    if (LOC)
@@ -3498,6 +3536,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isPRINT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
                    break;
                case ANYOF_NPRINT:
                    if (LOC)
@@ -3507,6 +3546,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isPRINT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
                    break;
                case ANYOF_PSXSPC:
                    if (LOC)
@@ -3516,6 +3556,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isPSXSPC(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
                    break;
                case ANYOF_NPSXSPC:
                    if (LOC)
@@ -3525,6 +3566,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isPSXSPC(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
                    break;
                case ANYOF_PUNCT:
                    if (LOC)
@@ -3534,6 +3576,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isPUNCT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
                    break;
                case ANYOF_NPUNCT:
                    if (LOC)
@@ -3543,6 +3586,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isPUNCT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
+                   break;
+               case ANYOF_SPACE:
+                   if (LOC)
+                       ANYOF_CLASS_SET(ret, ANYOF_SPACE);
+                   else {
+                       for (value = 0; value < 256; value++)
+                           if (isSPACE(value))
+                               ANYOF_BITMAP_SET(ret, value);
+                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
+                   break;
+               case ANYOF_NSPACE:
+                   if (LOC)
+                       ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+                   else {
+                       for (value = 0; value < 256; value++)
+                           if (!isSPACE(value))
+                               ANYOF_BITMAP_SET(ret, value);
+                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
                    break;
                case ANYOF_UPPER:
                    if (LOC)
@@ -3552,6 +3616,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isUPPER(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
                    break;
                case ANYOF_NUPPER:
                    if (LOC)
@@ -3561,6 +3626,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isUPPER(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
                    break;
                case ANYOF_XDIGIT:
                    if (LOC)
@@ -3570,6 +3636,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (isXDIGIT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
                    break;
                case ANYOF_NXDIGIT:
                    if (LOC)
@@ -3579,6 +3646,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                            if (!isXDIGIT(value))
                                ANYOF_BITMAP_SET(ret, value);
                    }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
                    break;
                default:
                    vFAIL("Invalid [::] class");
@@ -3588,7 +3656,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
                continue;
            }
-       }
+       } /* end of namedclass \blah */
+
        if (range) {
            if (lastvalue > value) /* b-a */ {
                Simple_vFAIL4("Invalid [] range \"%*.*s\"",
@@ -3596,14 +3665,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                              RExC_parse - rangebegin,
                              rangebegin);
            }
-           range = 0;
+           range = 0; /* not a true range */
        }
        else {
-           lastvalue = value;
+           lastvalue = value; /* save the beginning of the range */
            if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
                RExC_parse[1] != ']') {
                RExC_parse++;
-               if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
+
+               /* a bad range like \w-, [:word:]- ? */
+               if (namedclass > OOB_NAMEDCLASS) {
                    if (ckWARN(WARN_REGEXP))
                        vWARN4(RExC_parse,
                               "False [] range \"%*.*s\"",
@@ -3613,325 +3684,89 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                    if (!SIZE_ONLY)
                        ANYOF_BITMAP_SET(ret, '-');
                } else
-                   range = 1;
-               continue;       /* do it next time */
+                   range = 1;  /* yeah, it's a range! */
+               continue;       /* but do it the next time */
            }
        }
+
        /* now is the next time */
        if (!SIZE_ONLY) {
+           if (lastvalue < 256 && value < 256) {
 #ifndef ASCIIish /* EBCDIC, for example. */
-           if ((isLOWER(lastvalue) && isLOWER(value)) ||
-               (isUPPER(lastvalue) && isUPPER(value)))
-           {
-               I32 i;
-               if (isLOWER(lastvalue)) {
-                   for (i = lastvalue; i <= value; i++)
-                       if (isLOWER(i))
-                           ANYOF_BITMAP_SET(ret, i);
-               } else {
-                   for (i = lastvalue; i <= value; i++)
-                       if (isUPPER(i))
-                           ANYOF_BITMAP_SET(ret, i);
+               if ((isLOWER(lastvalue) && isLOWER(value)) ||
+                   (isUPPER(lastvalue) && isUPPER(value)))
+               {
+                   IV i;
+                   if (isLOWER(lastvalue)) {
+                       for (i = lastvalue; i <= value; i++)
+                           if (isLOWER(i))
+                               ANYOF_BITMAP_SET(ret, i);
+                   } else {
+                       for (i = lastvalue; i <= value; i++)
+                           if (isUPPER(i))
+                               ANYOF_BITMAP_SET(ret, i);
+                   }
                }
-           }
-           else
+               else
 #endif
-               for ( ; lastvalue <= value; lastvalue++)
-                   ANYOF_BITMAP_SET(ret, lastvalue);
+                   for ( ; lastvalue <= value; lastvalue++)
+                       ANYOF_BITMAP_SET(ret, lastvalue);
+           } else {
+               ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+               if (lastvalue < value)
+                   Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+                                  (UV)lastvalue, (UV)value);
+               else
+                   Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                  (UV)value);
+           }
         }
-       range = 0;
+
+       range = 0; /* this range (if it was one) is done now */
     }
+
     if (need_class) {
        if (SIZE_ONLY)
            RExC_size += ANYOF_CLASS_ADD_SKIP;
        else
            RExC_emit += ANYOF_CLASS_ADD_SKIP;
     }
+
     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
     if (!SIZE_ONLY &&
-       (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) {
+       (ANYOF_FLAGS(ret) &
+        /* If the only flag is folding (plus possibly inversion). */
+        (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
        for (value = 0; value < 256; ++value) {
            if (ANYOF_BITMAP_TEST(ret, value)) {
-               I32 cf = PL_fold[value];
-               ANYOF_BITMAP_SET(ret, cf);
+               IV fold = PL_fold[value];
+
+               if (fold != value)
+                   ANYOF_BITMAP_SET(ret, fold);
            }
        }
        ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
     }
+
     /* optimize inverted simple patterns (e.g. [^a-z]) */
-    if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+    if (!SIZE_ONLY &&
+       /* If the only flag is inversion. */
+       (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
        for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
            ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
        ANYOF_FLAGS(ret) = 0;
     }
-    return ret;
-}
 
-STATIC regnode *
-S_regclassutf8(pTHX_ RExC_state_t *pRExC_state)
-{
-    register char *e;
-    register U32 value;
-    register U32 lastvalue = OOB_UTF8;
-    register I32 range = 0;
-    register regnode *ret;
-    STRLEN numlen;
-    I32 n;
-    SV *listsv;
-    U8 flags = 0;
-    I32 namedclass;
-    char *rangebegin;
-
-    if (*RExC_parse == '^') {  /* Complement of range. */
-       RExC_naughty++;
-       RExC_parse++;
-       if (!SIZE_ONLY)
-           flags |= ANYOF_INVERT;
-    }
-    if (!SIZE_ONLY) {
-       if (FOLD)
-           flags |= ANYOF_FOLD;
-       if (LOC)
-           flags |= ANYOF_LOCALE;
-       listsv = newSVpvn("# comment\n", 10);
-    }
-
-    if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
-       checkposixcc(pRExC_state);
-
-    if (*RExC_parse == ']' || *RExC_parse == '-')
-       goto skipcond;          /* allow 1st char to be ] or - */
-
-    while (RExC_parse < RExC_end && *RExC_parse != ']') {
-       skipcond:
-       namedclass = OOB_NAMEDCLASS;
-       if (!range)
-           rangebegin = RExC_parse;
-       value = utf8_to_uv((U8*)RExC_parse,
-                              RExC_end - RExC_parse,
-                              &numlen, 0);
-       RExC_parse += numlen;
-       if (value == '[')
-           namedclass = regpposixcc(pRExC_state, value);
-       else if (value == '\\') {
-           value = (U32)utf8_to_uv((U8*)RExC_parse,
-                                       RExC_end - RExC_parse,
-                                       &numlen, 0);
-           RExC_parse += numlen;
-           /* Some compilers cannot handle switching on 64-bit integer
-            * values, therefore value cannot be an UV.  Yes, this will
-            * be a problem later if we want switch on Unicode.  --jhi */
-           switch (value) {
-           case 'w':           namedclass = ANYOF_ALNUM;               break;
-           case 'W':           namedclass = ANYOF_NALNUM;              break;
-           case 's':           namedclass = ANYOF_SPACE;               break;
-           case 'S':           namedclass = ANYOF_NSPACE;              break;
-           case 'd':           namedclass = ANYOF_DIGIT;               break;
-           case 'D':           namedclass = ANYOF_NDIGIT;              break;
-           case 'p':
-           case 'P':
-               if (*RExC_parse == '{') {
-                   e = strchr(RExC_parse++, '}');
-                    if (!e)
-                        vFAIL("Missing right brace on \\p{}");
-                   n = e - RExC_parse;
-               }
-               else {
-                   e = RExC_parse;
-                   n = 1;
-               }
-               if (!SIZE_ONLY) {
-                   if (value == 'p')
-                       Perl_sv_catpvf(aTHX_ listsv,
-                                      "+utf8::%.*s\n", (int)n, RExC_parse);
-                   else
-                       Perl_sv_catpvf(aTHX_ listsv,
-                                      "!utf8::%.*s\n", (int)n, RExC_parse);
-               }
-               RExC_parse = e + 1;
-               lastvalue = OOB_UTF8;
-               continue;
-           case 'n':           value = '\n';           break;
-           case 'r':           value = '\r';           break;
-           case 't':           value = '\t';           break;
-           case 'f':           value = '\f';           break;
-           case 'b':           value = '\b';           break;
-#ifdef ASCIIish
-           case 'e':           value = '\033';         break;
-           case 'a':           value = '\007';         break;
-#else
-           case 'e':           value = '\047';         break;
-           case 'a':           value = '\057';         break;
-#endif
-           case 'x':
-               if (*RExC_parse == '{') {
-                   e = strchr(RExC_parse++, '}');
-                    if (!e) 
-                        vFAIL("Missing right brace on \\x{}");
-                   numlen = 1;         /* allow underscores */
-                   value = (UV)scan_hex(RExC_parse,
-                                    e - RExC_parse,
-                                    &numlen);
-                   RExC_parse = e + 1;
-               }
-               else {
-                   numlen = 0;         /* disallow underscores */
-                   value = (UV)scan_hex(RExC_parse, 2, &numlen);
-                   RExC_parse += numlen;
-               }
-               break;
-           case 'c':
-               value = UCHARAT(RExC_parse++);
-               value = toCTRL(value);
-               break;
-           case '0': case '1': case '2': case '3': case '4':
-           case '5': case '6': case '7': case '8': case '9':
-               numlen = 0;             /* disallow underscores */
-               value = (UV)scan_oct(--RExC_parse, 3, &numlen);
-               RExC_parse += numlen;
-               break;
-           default:
-               if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
-                   vWARN2(RExC_parse,
-                          "Unrecognized escape \\%c in character class passed through",
-                          (int)value);
-               break;
-           }
-       }
-       if (namedclass > OOB_NAMEDCLASS) {
-           if (range) { /* a-\d, a-[:digit:] */
-               if (!SIZE_ONLY) {
-                   if (ckWARN(WARN_REGEXP))
-                       vWARN4(RExC_parse,
-                              "False [] range \"%*.*s\"",
-                              RExC_parse - rangebegin,
-                              RExC_parse - rangebegin,
-                              rangebegin);
-                   Perl_sv_catpvf(aTHX_ listsv,
-                                  /* 0x002D is Unicode for '-' */
-                                  "%04"UVxf"\n002D\n", (UV)lastvalue);
-               }
-               range = 0;
-           }
-           if (!SIZE_ONLY) {
-               switch (namedclass) {
-               case ANYOF_ALNUM:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    break;
-               case ANYOF_NALNUM:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");    break;
-               case ANYOF_ALNUMC:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");   break;
-               case ANYOF_NALNUMC:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");   break;
-               case ANYOF_ALPHA:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");   break;
-               case ANYOF_NALPHA:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");   break;
-               case ANYOF_ASCII:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");   break;
-               case ANYOF_NASCII:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");   break;
-               case ANYOF_CNTRL:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");   break;
-               case ANYOF_NCNTRL:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");   break;
-               case ANYOF_GRAPH:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");   break;
-               case ANYOF_NGRAPH:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");   break;
-               case ANYOF_DIGIT:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");   break;
-               case ANYOF_NDIGIT:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");   break;
-               case ANYOF_LOWER:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");   break;
-               case ANYOF_NLOWER:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");   break;
-               case ANYOF_PRINT:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");   break;
-               case ANYOF_NPRINT:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");   break;
-               case ANYOF_PUNCT:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");   break;
-               case ANYOF_NPUNCT:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");   break;
-               case ANYOF_SPACE:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break;
-               case ANYOF_NSPACE:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break;
-               case ANYOF_BLANK:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");   break;
-               case ANYOF_NBLANK:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");   break;
-               case ANYOF_PSXSPC:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");   break;
-               case ANYOF_NPSXSPC:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");   break;
-               case ANYOF_UPPER:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");   break;
-               case ANYOF_NUPPER:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");   break;
-               case ANYOF_XDIGIT:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");  break;
-               case ANYOF_NXDIGIT:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");  break;
-               }
-               continue;
-           }
-       }
-        if (range) {
-           if (lastvalue > value) { /* b-a */
-               Simple_vFAIL4("Invalid [] range \"%*.*s\"",
-                             RExC_parse - rangebegin,
-                             RExC_parse - rangebegin,
-                             rangebegin);
-           }
-           range = 0;
-       }
-       else {
-           lastvalue = value;
-           if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
-               RExC_parse[1] != ']') {
-               RExC_parse++;
-               if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
-                   if (ckWARN(WARN_REGEXP))
-                       vWARN4(RExC_parse,
-                              "False [] range \"%*.*s\"",
-                              RExC_parse - rangebegin,
-                              RExC_parse - rangebegin,
-                              rangebegin);
-                   if (!SIZE_ONLY)
-                       Perl_sv_catpvf(aTHX_ listsv,
-                                      /* 0x002D is Unicode for '-' */
-                                      "002D\n");
-               } else
-                   range = 1;
-               continue;       /* do it next time */
-           }
-       }
-       /* now is the next time */
-       if (!SIZE_ONLY)
-           Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
-                          (UV)lastvalue, (UV)value);
-       range = 0;
-    }
-
-    ret = reganode(pRExC_state, ANYOFUTF8, 0);
-
-    if (!SIZE_ONLY) {
-       SV *rv = swash_init("utf8", "", listsv, 1, 0);
-#ifdef DEBUGGING
+    if (!SIZE_ONLY) { 
        AV *av = newAV();
-       av_push(av, rv);
-       av_push(av, listsv);
-       rv = newRV_inc((SV*)av);
-#else
-       SvREFCNT_dec(listsv);
-#endif
+       SV *rv;
+
+       av_store(av, 0, listsv);
+       av_store(av, 1, NULL);
+       rv = newRV_noinc((SV*)av);
        n = add_data(pRExC_state, 1, "s");
        RExC_rx->data->data[n] = (void*)rv;
-       ARG1_SET(ret, flags);
-       ARG2_SET(ret, n);
+       ARG_SET(ret, n);
     }
 
     return ret;
@@ -4269,7 +4104,7 @@ Perl_regdump(pTHX_ regexp *r)
 STATIC void
 S_put_byte(pTHX_ SV *sv, int c)
 {
-    if (isCNTRL(c) || c == 127 || c == 255)
+    if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
        Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
     else if (c == '-' || c == ']' || c == '\\' || c == '^')
        Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
@@ -4311,8 +4146,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
        Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
     else if (k == ANYOF) {
        int i, rangestart = -1;
-       bool anyofutf8 = OP(o) == ANYOFUTF8;
-       U8 flags = anyofutf8 ? ARG1(o) : o->flags;
+       U8 flags = ANYOF_FLAGS(o);
        const char * const anyofs[] = { /* Should be syncronized with
                                         * ANYOF_ #xdefines in regcomp.h */
            "\\w",
@@ -4354,78 +4188,93 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
        if (flags & ANYOF_INVERT)
            sv_catpv(sv, "^");
-       if (OP(o) == ANYOF) {
-           for (i = 0; i <= 256; i++) {
-               if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
-                   if (rangestart == -1)
-                       rangestart = i;
-               } else if (rangestart != -1) {
-                   if (i <= rangestart + 3)
-                       for (; rangestart < i; rangestart++)
-                           put_byte(sv, rangestart);
-                   else {
+       for (i = 0; i <= 256; i++) {
+           if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
+               if (rangestart == -1)
+                   rangestart = i;
+           } else if (rangestart != -1) {
+               if (i <= rangestart + 3)
+                   for (; rangestart < i; rangestart++)
                        put_byte(sv, rangestart);
-                       sv_catpv(sv, "-");
-                       put_byte(sv, i - 1);
-                   }
-                   rangestart = -1;
+               else {
+                   put_byte(sv, rangestart);
+                   sv_catpv(sv, "-");
+                   put_byte(sv, i - 1);
                }
+               rangestart = -1;
            }
-           if (o->flags & ANYOF_CLASS)
-               for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
-                   if (ANYOF_CLASS_TEST(o,i))
-                       sv_catpv(sv, anyofs[i]);
        }
-       else {
-           SV *rv = (SV*)PL_regdata->data[ARG2(o)];
-           AV *av = (AV*)SvRV((SV*)rv);
-           SV *sw = *av_fetch(av, 0, FALSE);
-           SV *lv = *av_fetch(av, 1, FALSE);
-           UV i;
-           U8 s[UTF8_MAXLEN+1];
-           for (i = 0; i <= 256; i++) { /* just the first 256 */
-               U8 *e = uv_to_utf8(s, i);
-               if (i < 256 && swash_fetch(sw, s)) {
-                   if (rangestart == -1)
-                       rangestart = i;
-               } else if (rangestart != -1) {
-                   U8 *p;
-
-                   if (i <= rangestart + 3)
-                       for (; rangestart < i; rangestart++) {
-                           for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
-                               put_byte(sv, *p);
+
+       if (o->flags & ANYOF_CLASS)
+           for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
+               if (ANYOF_CLASS_TEST(o,i))
+                   sv_catpv(sv, anyofs[i]);
+
+       if (flags & ANYOF_UNICODE)
+           sv_catpv(sv, "{unicode}");
+
+       {
+           SV *lv;
+           SV *sw = regclass_swash(o, FALSE, &lv);
+           
+           if (lv) {
+               if (sw) {
+                   UV i;
+                   U8 s[UTF8_MAXLEN+1];
+                   
+                   for (i = 0; i <= 256; i++) { /* just the first 256 */
+                       U8 *e = uv_to_utf8(s, i);
+                       
+                       if (i < 256 && swash_fetch(sw, s)) {
+                           if (rangestart == -1)
+                               rangestart = i;
+                       } else if (rangestart != -1) {
+                           U8 *p;
+                           
+                           if (i <= rangestart + 3)
+                               for (; rangestart < i; rangestart++) {
+                                   for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+                                       put_byte(sv, *p);
+                               }
+                           else {
+                               for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+                                   put_byte(sv, *p);
+                               sv_catpv(sv, "-");
+                                   for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
+                                       put_byte(sv, *p);
+                               }
+                               rangestart = -1;
+                           }
                        }
-                   else {
-                       for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
-                           put_byte(sv, *p);
-                       sv_catpv(sv, "-");
-                       for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
-                           put_byte(sv, *p);
-                   }
-                   rangestart = -1;
+                       
+                   sv_catpv(sv, "..."); /* et cetera */
                }
-           }
-           sv_catpv(sv, "...");
-           {
-               char *s = savepv(SvPVX(lv));
-
-               while(*s && *s != '\n') s++;
-               if (*s == '\n') {
-                   char *t = ++s;
 
-                   while (*s) {
-                       if (*s == '\n')
-                           *s = ' ';
-                       s++;
+               {
+                   char *s = savepv(SvPVX(lv));
+                   char *origs = s;
+                   
+                   while(*s && *s != '\n') s++;
+                   
+                   if (*s == '\n') {
+                       char *t = ++s;
+                       
+                       while (*s) {
+                           if (*s == '\n')
+                               *s = ' ';
+                           s++;
+                       }
+                       if (s[-1] == ' ')
+                           s[-1] = 0;
+                       
+                       sv_catpv(sv, t);
                    }
-                   if (s[-1] == ' ')
-                       s[-1] = 0;
-
-                   sv_catpv(sv, t);
+                   
+                   Safefree(origs);
                }
            }
        }
+
        Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
     }
     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
@@ -4486,16 +4335,6 @@ Perl_pregfree(pTHX_ struct regexp *r)
        while (--n >= 0) {
            switch (r->data->what[n]) {
            case 's':
-#ifdef DEBUGGING
-               {
-                   SV *rv = (SV*)r->data->data[n];
-                   AV *av = (AV*)SvRV((SV*)rv);
-                   SV *sw = *av_fetch(av, 0, FALSE);
-                   SV *lv = *av_fetch(av, 1, FALSE);
-                   SvREFCNT_dec(sw);
-                   SvREFCNT_dec(lv);
-               }
-#endif
                SvREFCNT_dec((SV*)r->data->data[n]);
                break;
            case 'f':
@@ -4657,4 +4496,3 @@ clear_re(pTHXo_ void *r)
 {
     ReREFCNT_dec((regexp *)r);
 }
-
index 284cf2f..c8094e1 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -88,12 +88,13 @@ struct regnode_2 {
 };
 
 #define ANYOF_BITMAP_SIZE      32      /* 256 b/(8 b/B) */
-#define ANYOF_CLASSBITMAP_SIZE  4
+#define ANYOF_CLASSBITMAP_SIZE  4      /* up to 32 (8*4) named classes */
 
 struct regnode_charclass {
     U8 flags;
     U8  type;
     U16 next_off;
+    U32 arg1;
     char bitmap[ANYOF_BITMAP_SIZE];
 };
 
@@ -101,6 +102,7 @@ struct regnode_charclass_class {
     U8 flags;
     U8  type;
     U16 next_off;
+    U32 arg1;
     char bitmap[ANYOF_BITMAP_SIZE];
     char classflags[ANYOF_CLASSBITMAP_SIZE];
 };
@@ -180,13 +182,21 @@ struct regnode_charclass_class {
 
 /* Flags for node->flags of ANYOF */
 
-#define ANYOF_CLASS    0x08
-#define ANYOF_INVERT   0x04
-#define ANYOF_FOLD     0x02
-#define ANYOF_LOCALE   0x01
+#define ANYOF_CLASS            0x08
+#define ANYOF_INVERT           0x04
+#define ANYOF_FOLD             0x02
+#define ANYOF_LOCALE           0x01
 
 /* Used for regstclass only */
-#define ANYOF_EOS      0x10            /* Can match an empty string too */
+#define ANYOF_EOS              0x10            /* Can match an empty string too */
+
+/* There is a character or a range past 0xff */
+#define ANYOF_UNICODE          0x20
+
+/* Are there any runtime flags on in this node? */
+#define ANYOF_RUNTIME(s)       (ANYOF_FLAGS(s) & 0x0f)
+
+#define ANYOF_FLAGS_ALL                0xff
 
 /* Character classes for node->classflags of ANYOF */
 /* Should be synchronized with a table in regprop() */
@@ -220,7 +230,7 @@ struct regnode_charclass_class {
 #define ANYOF_NXDIGIT  25
 #define ANYOF_PSXSPC   26      /* POSIX space: \s plus the vertical tab */
 #define ANYOF_NPSXSPC  27
-#define ANYOF_BLANK    28      /* GNU extension: space and tab */
+#define ANYOF_BLANK    28      /* GNU extension: space and tab: non-vertical space */
 #define ANYOF_NBLANK   29
 
 #define ANYOF_MAX      32
@@ -238,7 +248,6 @@ struct regnode_charclass_class {
 #define ANYOF_CLASS_SIZE       (sizeof(struct regnode_charclass_class))
 
 #define ANYOF_FLAGS(p)         ((p)->flags)
-#define ANYOF_FLAGS_ALL                0xff
 
 #define ANYOF_BIT(c)           (1 << ((c) & 7))
 
@@ -300,12 +309,14 @@ EXTCONST U8 PL_varies[] = {
 EXTCONST U8 PL_simple[];
 #else
 EXTCONST U8 PL_simple[] = {
-    REG_ANY, ANYUTF8, SANY, SANYUTF8, ANYOF, ANYOFUTF8,
-    ALNUM, ALNUMUTF8, ALNUML, ALNUMLUTF8,
-    NALNUM, NALNUMUTF8, NALNUML, NALNUMLUTF8,
-    SPACE, SPACEUTF8, SPACEL, SPACELUTF8,
-    NSPACE, NSPACEUTF8, NSPACEL, NSPACELUTF8,
-    DIGIT, DIGITUTF8, NDIGIT, NDIGITUTF8, 0
+    REG_ANY,   SANY,
+    ANYOF,
+    ALNUM,     ALNUML,
+    NALNUM,    NALNUML,
+    SPACE,     SPACEL,
+    NSPACE,    NSPACEL,
+    DIGIT,     NDIGIT,
+    0
 };
 #endif
 
index bb5f8f8..59284f4 100644 (file)
@@ -16,46 +16,27 @@ EOL         EOL,    no      Match "" at end of line.
 MEOL           EOL,    no      Same, assuming multiline.
 SEOL           EOL,    no      Same, assuming singleline.
 BOUND          BOUND,  no      Match "" at any word boundary
-BOUNDUTF8      BOUND,  no      Match "" at any word boundary
 BOUNDL         BOUND,  no      Match "" at any word boundary
-BOUNDLUTF8     BOUND,  no      Match "" at any word boundary
 NBOUND         NBOUND, no      Match "" at any word non-boundary
-NBOUNDUTF8     NBOUND, no      Match "" at any word non-boundary
 NBOUNDL                NBOUND, no      Match "" at any word non-boundary
-NBOUNDLUTF8    NBOUND, no      Match "" at any word non-boundary
 GPOS           GPOS,   no      Matches where last m//g left off.
 
 # [Special] alternatives
 REG_ANY                REG_ANY,    no  Match any one character (except newline).
-ANYUTF8                REG_ANY,    no  Match any one Unicode character (except newline).
 SANY           REG_ANY,    no  Match any one character.
-SANYUTF8       REG_ANY,    no  Match any one Unicode character.
 ANYOF          ANYOF,  sv      Match character in (or not in) this class.
-ANYOFUTF8      ANYOF,  sv 1    Match character in (or not in) this class.
 ALNUM          ALNUM,  no      Match any alphanumeric character
-ALNUMUTF8      ALNUM,  no      Match any alphanumeric character in utf8
 ALNUML         ALNUM,  no      Match any alphanumeric char in locale
-ALNUMLUTF8     ALNUM,  no      Match any alphanumeric char in locale+utf8
 NALNUM         NALNUM, no      Match any non-alphanumeric character
-NALNUMUTF8     NALNUM, no      Match any non-alphanumeric character in utf8
 NALNUML                NALNUM, no      Match any non-alphanumeric char in locale
-NALNUMLUTF8    NALNUM, no      Match any non-alphanumeric char in locale+utf8
 SPACE          SPACE,  no      Match any whitespace character
-SPACEUTF8      SPACE,  no      Match any whitespace character in utf8
 SPACEL         SPACE,  no      Match any whitespace char in locale
-SPACELUTF8     SPACE,  no      Match any whitespace char in locale+utf8
 NSPACE         NSPACE, no      Match any non-whitespace character
-NSPACEUTF8     NSPACE, no      Match any non-whitespace character in utf8
 NSPACEL                NSPACE, no      Match any non-whitespace char in locale
-NSPACELUTF8    NSPACE, no      Match any non-whitespace char in locale+utf8
 DIGIT          DIGIT,  no      Match any numeric character
-DIGITUTF8      DIGIT,  no      Match any numeric character in utf8
 DIGITL         DIGIT,  no      Match any numeric character in locale
-DIGITLUTF8     DIGIT,  no      Match any numeric character in locale+utf8
 NDIGIT         NDIGIT, no      Match any non-numeric character
-NDIGITUTF8     NDIGIT, no      Match any non-numeric character in utf8
 NDIGITL                NDIGIT, no      Match any non-numeric character in locale
-NDIGITLUTF8    NDIGIT, no      Match any non-numeric character in locale+utf8
 CLUMP          CLUMP,  no      Match any combining character sequence
 
 # BRANCH       The set of branches constituting a single choice are hooked
index 5e821ba..ac91bea 100644 (file)
--- a/regexec.c
+++ b/regexec.c
  * Forwards.
  */
 
-#define REGINCLASS(p,c)  (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
-#ifdef DEBUGGING
-#   define REGINCLASSUTF8(f,p)  (ARG1(f) ? reginclassutf8(f,p) : swash_fetch(*av_fetch((AV*)SvRV((SV*)PL_regdata->data[ARG2(f)]),0,FALSE),p))
-#else
-#   define REGINCLASSUTF8(f,p)  (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
-#endif
-
 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 
@@ -738,7 +731,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
 
        t = s;
        if (prog->reganch & ROPT_UTF8) {        
-           PL_regdata = prog->data;    /* Used by REGINCLASS UTF logic */
+           PL_regdata = prog->data;
            PL_bostr = startpos;
        }
         s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
@@ -840,25 +833,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
        unsigned int c2;
        char *e;
        register I32 tmp = 1;   /* Scratch variable? */
+       register bool do_utf8 = DO_UTF8(PL_reg_sv);
 
        /* We know what class it must start with. */
        switch (OP(c)) {
-       case ANYOFUTF8:
-           while (s < strend) {
-               if (REGINCLASSUTF8(c, (U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
-                   else
-                       tmp = doevery;
-               }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
-           }
-           break;
        case ANYOF:
            while (s < strend) {
-               if (REGINCLASS(c, *(U8*)s)) {
+               if (reginclass(c, (U8*)s, do_utf8)) {
                    if (tmp && (norun || regtry(prog, s)))
                        goto got_it;
                    else
@@ -866,7 +847,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                }
                else
                    tmp = 1;
-               s++;
+               s += do_utf8 ? UTF8SKIP(s) : 1;
            }
            break;
        case EXACTF:
@@ -912,42 +893,40 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            PL_reg_flags |= RF_tainted;
            /* FALL THROUGH */
        case BOUND:
-           tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
-           tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
-           while (s < strend) {
-               if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
-                   tmp = !tmp;
-                   if ((norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               if (s == startpos)
+                   tmp = '\n';
+               else {
+                   U8 *r = reghop((U8*)s, -1);
+                   
+                   tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
+               }
+               tmp = ((OP(c) == BOUND ?
+                       isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
+               while (s < strend) {
+                   if (tmp == !(OP(c) == BOUND ?
+                                swash_fetch(PL_utf8_alnum, (U8*)s) :
+                                isALNUM_LC_utf8((U8*)s)))
+                   {
+                       tmp = !tmp;
+                       if ((norun || regtry(prog, s)))
+                           goto got_it;
+                   }
+                   s += UTF8SKIP(s);
                }
-               s++;
            }
-           if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
-               goto got_it;
-           break;
-       case BOUNDLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case BOUNDUTF8:
-           if (s == startpos)
-               tmp = '\n';
            else {
-               U8 *r = reghop((U8*)s, -1);
-
-               tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
-           }
-           tmp = ((OP(c) == BOUNDUTF8 ?
-                   isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
-           while (s < strend) {
-               if (tmp == !(OP(c) == BOUNDUTF8 ?
-                            swash_fetch(PL_utf8_alnum, (U8*)s) :
-                            isALNUM_LC_utf8((U8*)s)))
-               {
-                   tmp = !tmp;
-                   if ((norun || regtry(prog, s)))
-                       goto got_it;
+               tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
+               tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+               while (s < strend) {
+                   if (tmp ==
+                       !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
+                       tmp = !tmp;
+                       if ((norun || regtry(prog, s)))
+                           goto got_it;
+                   }
+                   s++;
                }
-               s += UTF8SKIP(s);
            }
            if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
                goto got_it;
@@ -956,365 +935,382 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            PL_reg_flags |= RF_tainted;
            /* FALL THROUGH */
        case NBOUND:
-           tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
-           tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
-           while (s < strend) {
-               if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
-                   tmp = !tmp;
-               else if ((norun || regtry(prog, s)))
-                   goto got_it;
-               s++;
+           if (do_utf8) {
+               if (s == startpos)
+                   tmp = '\n';
+               else {
+                   U8 *r = reghop((U8*)s, -1);
+                   
+                   tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
+               }
+               tmp = ((OP(c) == NBOUND ?
+                       isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
+               while (s < strend) {
+                   if (tmp == !(OP(c) == NBOUND ?
+                                swash_fetch(PL_utf8_alnum, (U8*)s) :
+                                isALNUM_LC_utf8((U8*)s)))
+                       tmp = !tmp;
+                   else if ((norun || regtry(prog, s)))
+                       goto got_it;
+                   s += UTF8SKIP(s);
+               }
            }
-           if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
-               goto got_it;
-           break;
-       case NBOUNDLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NBOUNDUTF8:
-           if (s == startpos)
-               tmp = '\n';
            else {
-               U8 *r = reghop((U8*)s, -1);
-
-               tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
-           }
-           tmp = ((OP(c) == NBOUNDUTF8 ?
-                   isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
-           while (s < strend) {
-               if (tmp == !(OP(c) == NBOUNDUTF8 ?
-                            swash_fetch(PL_utf8_alnum, (U8*)s) :
-                            isALNUM_LC_utf8((U8*)s)))
-                   tmp = !tmp;
-               else if ((norun || regtry(prog, s)))
-                   goto got_it;
-               s += UTF8SKIP(s);
+               tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
+               tmp = ((OP(c) == NBOUND ?
+                       isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+               while (s < strend) {
+                   if (tmp ==
+                       !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
+                       tmp = !tmp;
+                   else if ((norun || regtry(prog, s)))
+                       goto got_it;
+                   s++;
+               }
            }
            if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
                goto got_it;
            break;
        case ALNUM:
-           while (s < strend) {
-               if (isALNUM(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case ALNUMUTF8:
-           while (s < strend) {
-               if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isALNUM(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case ALNUML:
            PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (isALNUM_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (isALNUM_LC_utf8((U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case ALNUMLUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (isALNUM_LC_utf8((U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isALNUM_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case NALNUM:
-           while (s < strend) {
-               if (!isALNUM(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case NALNUMUTF8:
-           while (s < strend) {
-               if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isALNUM(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case NALNUML:
            PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!isALNUM_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!isALNUM_LC_utf8((U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case NALNUMLUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!isALNUM_LC_utf8((U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isALNUM_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case SPACE:
-           while (s < strend) {
-               if (isSPACE(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case SPACEUTF8:
-           while (s < strend) {
-               if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isSPACE(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case SPACEL:
            PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (isSPACE_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case SPACELUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isSPACE_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case NSPACE:
-           while (s < strend) {
-               if (!isSPACE(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case NSPACEUTF8:
-           while (s < strend) {
-               if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isSPACE(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case NSPACEL:
            PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!isSPACE_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case NSPACELUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isSPACE_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case DIGIT:
-           while (s < strend) {
-               if (isDIGIT(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (swash_fetch(PL_utf8_digit,(U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case DIGITUTF8:
-           while (s < strend) {
-               if (swash_fetch(PL_utf8_digit,(U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isDIGIT(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case DIGITL:
            PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (isDIGIT_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (isDIGIT_LC_utf8((U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case DIGITLUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (isDIGIT_LC_utf8((U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isDIGIT_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case NDIGIT:
-           while (s < strend) {
-               if (!isDIGIT(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case NDIGITUTF8:
-           while (s < strend) {
-               if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isDIGIT(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        case NDIGITL:
            PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!isDIGIT_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!isDIGIT_LC_utf8((U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                }
-               else
-                   tmp = 1;
-               s++;
            }
-           break;
-       case NDIGITLUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!isDIGIT_LC_utf8((U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isDIGIT_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                    else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
            }
            break;
        default:
@@ -1606,6 +1602,11 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
        if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
            /* don't bother with what can't match */
            strend = HOPc(strend, -(minlen - 1));
+       DEBUG_r({
+           SV *prop = sv_newmortal();
+           regprop(prop, c);
+           PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
+       });
        if (find_byclass(prog, c, s, strend, startpos, 0))
            goto got_it;
        DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
@@ -1619,7 +1620,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                last = screaminstr(sv, prog->float_substr, s - strbeg,
                                   end_shift, &scream_pos, 1); /* last one */
                if (!last)
-                   last = scream_olds; /* Only one occurence. */
+                   last = scream_olds; /* Only one occurrence. */
            }
            else {
                STRLEN len;
@@ -1891,6 +1892,7 @@ S_regmatch(pTHX_ regnode *prog)
     int minmod = 0, sw = 0, logical = 0;
     I32 unwind = 0;
     I32 firstcp = PL_savestack_ix;
+    register bool do_utf8 = DO_UTF8(PL_reg_sv);
 
 #ifdef DEBUGGING
     PL_regindent++;
@@ -2009,8 +2011,8 @@ S_regmatch(pTHX_ regnode *prog)
            if (PL_regeol != locinput)
                sayNO;
            break;
-       case SANYUTF8:
-           if (nextchr & 0x80) {
+       case SANY:
+           if (DO_UTF8(PL_reg_sv)) {
                locinput += PL_utf8skip[nextchr];
                if (locinput > PL_regeol)
                    sayNO;
@@ -2021,13 +2023,8 @@ S_regmatch(pTHX_ regnode *prog)
                sayNO;
            nextchr = UCHARAT(++locinput);
            break;
-       case SANY:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case ANYUTF8:
-           if (nextchr & 0x80) {
+       case REG_ANY:
+           if (DO_UTF8(PL_reg_sv)) {
                locinput += PL_utf8skip[nextchr];
                if (locinput > PL_regeol)
                    sayNO;
@@ -2038,11 +2035,6 @@ S_regmatch(pTHX_ regnode *prog)
                sayNO;
            nextchr = UCHARAT(++locinput);
            break;
-       case REG_ANY:
-           if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
        case EXACT:
            s = STRING(scan);
            ln = STR_LEN(scan);
@@ -2099,22 +2091,24 @@ S_regmatch(pTHX_ regnode *prog)
            locinput += ln;
            nextchr = UCHARAT(locinput);
            break;
-       case ANYOFUTF8:
-           if (!REGINCLASSUTF8(scan, (U8*)locinput))
-               sayNO;
-           if (locinput >= PL_regeol)
-               sayNO;
-           locinput += PL_utf8skip[nextchr];
-           nextchr = UCHARAT(locinput);
-           break;
        case ANYOF:
-           if (nextchr < 0)
+           if (do_utf8) {
+               if (!reginclass(scan, (U8*)locinput, do_utf8))
+                   sayNO;
+               if (locinput >= PL_regeol)
+                   sayNO;
+               locinput += PL_utf8skip[nextchr];
                nextchr = UCHARAT(locinput);
-           if (!REGINCLASS(scan, nextchr))
-               sayNO;
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           nextchr = UCHARAT(++locinput);
+           }
+           else {
+               if (nextchr < 0)
+                   nextchr = UCHARAT(locinput);
+               if (!reginclass(scan, (U8*)locinput, do_utf8))
+                   sayNO;
+               if (!nextchr && locinput >= PL_regeol)
+                   sayNO;
+               nextchr = UCHARAT(++locinput);
+           }
            break;
        case ALNUML:
            PL_reg_flags |= RF_tainted;
@@ -2122,19 +2116,8 @@ S_regmatch(pTHX_ regnode *prog)
        case ALNUM:
            if (!nextchr)
                sayNO;
-           if (!(OP(scan) == ALNUM
-                 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case ALNUMLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case ALNUMUTF8:
-           if (!nextchr)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (!(OP(scan) == ALNUMUTF8
+           if (do_utf8) {
+               if (!(OP(scan) == ALNUM
                      ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
                      : isALNUM_LC_utf8((U8*)locinput)))
                {
@@ -2144,7 +2127,7 @@ S_regmatch(pTHX_ regnode *prog)
                nextchr = UCHARAT(locinput);
                break;
            }
-           if (!(OP(scan) == ALNUMUTF8
+           if (!(OP(scan) == ALNUM
                  ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
                sayNO;
            nextchr = UCHARAT(++locinput);
@@ -2155,19 +2138,8 @@ S_regmatch(pTHX_ regnode *prog)
        case NALNUM:
            if (!nextchr && locinput >= PL_regeol)
                sayNO;
-           if (OP(scan) == NALNUM
-               ? isALNUM(nextchr) : isALNUM_LC(nextchr))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case NALNUMLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NALNUMUTF8:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (OP(scan) == NALNUMUTF8
+           if (do_utf8) {
+               if (OP(scan) == NALNUM
                    ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
                    : isALNUM_LC_utf8((U8*)locinput))
                {
@@ -2177,7 +2149,7 @@ S_regmatch(pTHX_ regnode *prog)
                nextchr = UCHARAT(locinput);
                break;
            }
-           if (OP(scan) == NALNUMUTF8
+           if (OP(scan) == NALNUM
                ? isALNUM(nextchr) : isALNUM_LC(nextchr))
                sayNO;
            nextchr = UCHARAT(++locinput);
@@ -2189,42 +2161,38 @@ S_regmatch(pTHX_ regnode *prog)
        case BOUND:
        case NBOUND:
            /* was last char in word? */
-           ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
-           if (OP(scan) == BOUND || OP(scan) == NBOUND) {
-               ln = isALNUM(ln);
-               n = isALNUM(nextchr);
-           }
-           else {
-               ln = isALNUM_LC(ln);
-               n = isALNUM_LC(nextchr);
-           }
-           if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
-               sayNO;
-           break;
-       case BOUNDLUTF8:
-       case NBOUNDLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case BOUNDUTF8:
-       case NBOUNDUTF8:
-           /* was last char in word? */
-           if (locinput == PL_regbol)
-               ln = PL_regprev;
-           else {
-               U8 *r = reghop((U8*)locinput, -1);
-
-               ln = utf8_to_uv(r, s - (char*)r, 0, 0);
-           }
-           if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
-               ln = isALNUM_uni(ln);
-               n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
+           if (do_utf8) {
+               if (locinput == PL_regbol)
+                   ln = PL_regprev;
+               else {
+                   U8 *r = reghop((U8*)locinput, -1);
+                   
+                   ln = utf8_to_uv(r, s - (char*)r, 0, 0);
+               }
+               if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+                   ln = isALNUM_uni(ln);
+                   n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
+               }
+               else {
+                   ln = isALNUM_LC_uni(ln);
+                   n = isALNUM_LC_utf8((U8*)locinput);
+               }
            }
            else {
-               ln = isALNUM_LC_uni(ln);
-               n = isALNUM_LC_utf8((U8*)locinput);
+               ln = (locinput != PL_regbol) ?
+                   UCHARAT(locinput - 1) : PL_regprev;
+               if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+                   ln = isALNUM(ln);
+                   n = isALNUM(nextchr);
+               }
+               else {
+                   ln = isALNUM_LC(ln);
+                   n = isALNUM_LC(nextchr);
+               }
            }
-           if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
-               sayNO;
+           if (((!ln) == (!n)) == (OP(scan) == BOUND ||
+                                   OP(scan) == BOUNDL))
+                   sayNO;
            break;
        case SPACEL:
            PL_reg_flags |= RF_tainted;
@@ -2232,32 +2200,29 @@ S_regmatch(pTHX_ regnode *prog)
        case SPACE:
            if (!nextchr)
                sayNO;
-           if (!(OP(scan) == SPACE
-                 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case SPACELUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case SPACEUTF8:
-           if (!nextchr)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (!(OP(scan) == SPACEUTF8
-                     ? swash_fetch(PL_utf8_space, (U8*)locinput)
-                     : isSPACE_LC_utf8((U8*)locinput)))
-               {
-                   sayNO;
+           if (DO_UTF8(PL_reg_sv)) {
+               if (nextchr & 0x80) {
+                   if (!(OP(scan) == SPACE
+                         ? swash_fetch(PL_utf8_space, (U8*)locinput)
+                         : isSPACE_LC_utf8((U8*)locinput)))
+                   {
+                       sayNO;
+                   }
+                   locinput += PL_utf8skip[nextchr];
+                   nextchr = UCHARAT(locinput);
+                   break;
                }
-               locinput += PL_utf8skip[nextchr];
-               nextchr = UCHARAT(locinput);
-               break;
+               if (!(OP(scan) == SPACE
+                     ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
+                   sayNO;
+               nextchr = UCHARAT(++locinput);
+           }
+           else {
+               if (!(OP(scan) == SPACE
+                     ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
+                   sayNO;
+               nextchr = UCHARAT(++locinput);
            }
-           if (!(OP(scan) == SPACEUTF8
-                 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
            break;
        case NSPACEL:
            PL_reg_flags |= RF_tainted;
@@ -2265,19 +2230,8 @@ S_regmatch(pTHX_ regnode *prog)
        case NSPACE:
            if (!nextchr && locinput >= PL_regeol)
                sayNO;
-           if (OP(scan) == NSPACE
-               ? isSPACE(nextchr) : isSPACE_LC(nextchr))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case NSPACELUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NSPACEUTF8:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (OP(scan) == NSPACEUTF8
+           if (DO_UTF8(PL_reg_sv)) {
+               if (OP(scan) == NSPACE
                    ? swash_fetch(PL_utf8_space, (U8*)locinput)
                    : isSPACE_LC_utf8((U8*)locinput))
                {
@@ -2287,7 +2241,7 @@ S_regmatch(pTHX_ regnode *prog)
                nextchr = UCHARAT(locinput);
                break;
            }
-           if (OP(scan) == NSPACEUTF8
+           if (OP(scan) == NSPACE
                ? isSPACE(nextchr) : isSPACE_LC(nextchr))
                sayNO;
            nextchr = UCHARAT(++locinput);
@@ -2298,19 +2252,8 @@ S_regmatch(pTHX_ regnode *prog)
        case DIGIT:
            if (!nextchr)
                sayNO;
-           if (!(OP(scan) == DIGIT
-                 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case DIGITLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case DIGITUTF8:
-           if (!nextchr)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (!(OP(scan) == DIGITUTF8
+           if (DO_UTF8(PL_reg_sv)) {
+               if (!(OP(scan) == DIGIT
                      ? swash_fetch(PL_utf8_digit, (U8*)locinput)
                      : isDIGIT_LC_utf8((U8*)locinput)))
                {
@@ -2320,7 +2263,7 @@ S_regmatch(pTHX_ regnode *prog)
                nextchr = UCHARAT(locinput);
                break;
            }
-           if (!(OP(scan) == DIGITUTF8
+           if (!(OP(scan) == DIGIT
                  ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
                sayNO;
            nextchr = UCHARAT(++locinput);
@@ -2331,19 +2274,8 @@ S_regmatch(pTHX_ regnode *prog)
        case NDIGIT:
            if (!nextchr && locinput >= PL_regeol)
                sayNO;
-           if (OP(scan) == NDIGIT
-               ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case NDIGITLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NDIGITUTF8:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (OP(scan) == NDIGITUTF8
+           if (DO_UTF8(PL_reg_sv)) {
+               if (OP(scan) == NDIGIT
                    ? swash_fetch(PL_utf8_digit, (U8*)locinput)
                    : isDIGIT_LC_utf8((U8*)locinput))
                {
@@ -2353,7 +2285,7 @@ S_regmatch(pTHX_ regnode *prog)
                nextchr = UCHARAT(locinput);
                break;
            }
-           if (OP(scan) == NDIGITUTF8
+           if (OP(scan) == NDIGIT
                ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
                sayNO;
            nextchr = UCHARAT(++locinput);
@@ -3461,30 +3393,33 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
     register I32 c;
     register char *loceol = PL_regeol;
     register I32 hardcount = 0;
+    register bool do_utf8 = DO_UTF8(PL_reg_sv);
 
     scan = PL_reginput;
     if (max != REG_INFTY && max < loceol - scan)
       loceol = scan + max;
     switch (OP(p)) {
     case REG_ANY:
-       while (scan < loceol && *scan != '\n')
-           scan++;
-       break;
-    case SANY:
-       scan = loceol;
-       break;
-    case ANYUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && *scan != '\n') {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && *scan != '\n') {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && *scan != '\n')
+               scan++;
        }
        break;
-    case SANYUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+    case SANY:
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           scan = loceol;
        }
        break;
     case EXACT:                /* length of string is 1 */
@@ -3505,135 +3440,144 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
               (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
            scan++;
        break;
-    case ANYOFUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
-       }
-       break;
     case ANYOF:
-       while (scan < loceol && REGINCLASS(p, *scan))
-           scan++;
+       if (do_utf8) {
+           loceol = PL_regeol;
+           while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
+               scan++;
+       }
        break;
     case ALNUM:
-       while (scan < loceol && isALNUM(*scan))
-           scan++;
-       break;
-    case ALNUMUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isALNUM(*scan))
+               scan++;
        }
        break;
     case ALNUML:
        PL_reg_flags |= RF_tainted;
-       while (scan < loceol && isALNUM_LC(*scan))
-           scan++;
-       break;
-    case ALNUMLUTF8:
-       PL_reg_flags |= RF_tainted;
-       loceol = PL_regeol;
-       while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isALNUM_LC(*scan))
+               scan++;
        }
        break;
-       break;
     case NALNUM:
-       while (scan < loceol && !isALNUM(*scan))
-           scan++;
-       break;
-    case NALNUMUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isALNUM(*scan))
+               scan++;
        }
        break;
     case NALNUML:
        PL_reg_flags |= RF_tainted;
-       while (scan < loceol && !isALNUM_LC(*scan))
-           scan++;
-       break;
-    case NALNUMLUTF8:
-       PL_reg_flags |= RF_tainted;
-       loceol = PL_regeol;
-       while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isALNUM_LC(*scan))
+               scan++;
        }
        break;
     case SPACE:
-       while (scan < loceol && isSPACE(*scan))
-           scan++;
-       break;
-    case SPACEUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol &&
+                  (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isSPACE(*scan))
+               scan++;
        }
        break;
     case SPACEL:
        PL_reg_flags |= RF_tainted;
-       while (scan < loceol && isSPACE_LC(*scan))
-           scan++;
-       break;
-    case SPACELUTF8:
-       PL_reg_flags |= RF_tainted;
-       loceol = PL_regeol;
-       while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol &&
+                  (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isSPACE_LC(*scan))
+               scan++;
        }
        break;
     case NSPACE:
-       while (scan < loceol && !isSPACE(*scan))
-           scan++;
-       break;
-    case NSPACEUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol &&
+                  !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isSPACE(*scan))
+               scan++;
+           break;
        }
-       break;
     case NSPACEL:
        PL_reg_flags |= RF_tainted;
-       while (scan < loceol && !isSPACE_LC(*scan))
-           scan++;
-       break;
-    case NSPACELUTF8:
-       PL_reg_flags |= RF_tainted;
-       loceol = PL_regeol;
-       while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol &&
+                  !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isSPACE_LC(*scan))
+               scan++;
        }
        break;
     case DIGIT:
-       while (scan < loceol && isDIGIT(*scan))
-           scan++;
-       break;
-    case DIGITUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isDIGIT(*scan))
+               scan++;
        }
        break;
-       break;
     case NDIGIT:
-       while (scan < loceol && !isDIGIT(*scan))
-           scan++;
-       break;
-    case NDIGITUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isDIGIT(*scan))
+               scan++;
        }
        break;
     default:           /* Called on something of 0 width. */
@@ -3712,102 +3656,139 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
 }
 
 /*
+- regclass_swash - prepare the utf8 swash
+*/
+
+SV *
+Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
+{
+    SV *sw = NULL;
+    SV *si = NULL;
+
+    if (PL_regdata && PL_regdata->count) {
+       U32 n = ARG(node);
+
+       if (PL_regdata->what[n] == 's') {
+           SV *rv = (SV*)PL_regdata->data[n];
+           AV *av = (AV*)SvRV((SV*)rv);
+           SV **a;
+           
+           si = *av_fetch(av, 0, FALSE);
+           a  =  av_fetch(av, 1, FALSE);
+           
+           if (a)
+               sw = *a;
+           else if (si && doinit) {
+               sw = swash_init("utf8", "", si, 1, 0);
+               (void)av_store(av, 1, sw);
+           }
+       }
+    }
+       
+    if (initsvp)
+       *initsvp = si;
+
+    return sw;
+}
+
+/*
  - reginclass - determine if a character falls into a character class
  */
 
 STATIC bool
-S_reginclass(pTHX_ register regnode *p, register I32 c)
+S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
 {
-    char flags = ANYOF_FLAGS(p);
+    char flags = ANYOF_FLAGS(n);
     bool match = FALSE;
 
-    c &= 0xFF;
-    if (ANYOF_BITMAP_TEST(p, c))
-       match = TRUE;
-    else if (flags & ANYOF_FOLD) {
-       I32 cf;
-       if (flags & ANYOF_LOCALE) {
-           PL_reg_flags |= RF_tainted;
-           cf = PL_fold_locale[c];
+    if (do_utf8 || (flags & ANYOF_UNICODE)) {
+       if (do_utf8 && !ANYOF_RUNTIME(n)) {
+           STRLEN len;
+           UV c = utf8_to_uv_simple(p, &len);
+
+           if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
+               match = TRUE;
        }
-       else
-           cf = PL_fold[c];
-       if (ANYOF_BITMAP_TEST(p, cf))
-           match = TRUE;
-    }
 
-    if (!match && (flags & ANYOF_CLASS)) {
-       PL_reg_flags |= RF_tainted;
-       if (
-           (ANYOF_CLASS_TEST(p, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
-           (ANYOF_CLASS_TEST(p, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_ASCII)   &&  isASCII(c))     ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NASCII)  && !isASCII(c))     ||
-           (ANYOF_CLASS_TEST(p, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
-           (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
-           (ANYOF_CLASS_TEST(p, ANYOF_BLANK)   &&  isBLANK(c))     ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NBLANK)  && !isBLANK(c))
-           ) /* How's that for a conditional? */
-       {
-           match = TRUE;
+       if (!match) {
+           SV *sw = regclass_swash(n, TRUE, 0);
+       
+           if (sw) {
+               if (swash_fetch(sw, p))
+                   match = TRUE;
+               else if (flags & ANYOF_FOLD) {
+                   U8 tmpbuf[UTF8_MAXLEN+1];
+                   
+                   if (flags & ANYOF_LOCALE) {
+                       PL_reg_flags |= RF_tainted;
+                       uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
+                   }
+               else
+                   uv_to_utf8(tmpbuf, toLOWER_utf8(p));
+                   if (swash_fetch(sw, tmpbuf))
+                       match = TRUE;
+               }
+           }
        }
     }
+    else {
+       U8 c = *p;
 
-    return (flags & ANYOF_INVERT) ? !match : match;
-}
-
-STATIC bool
-S_reginclassutf8(pTHX_ regnode *f, U8 *p)
-{                                           
-    char flags = ARG1(f);
-    bool match = FALSE;
-#ifdef DEBUGGING
-    SV *rv = (SV*)PL_regdata->data[ARG2(f)];
-    AV *av = (AV*)SvRV((SV*)rv);
-    SV *sw = *av_fetch(av, 0, FALSE);
-    SV *lv = *av_fetch(av, 1, FALSE);
-#else
-    SV *sw = (SV*)PL_regdata->data[ARG2(f)];
-#endif
+       if (ANYOF_BITMAP_TEST(n, c))
+           match = TRUE;
+       else if (flags & ANYOF_FOLD) {
+           I32 f;
 
-    if (swash_fetch(sw, p))
-       match = TRUE;
-    else if (flags & ANYOF_FOLD) {
-       U8 tmpbuf[UTF8_MAXLEN+1];
-       if (flags & ANYOF_LOCALE) {
+           if (flags & ANYOF_LOCALE) {
+               PL_reg_flags |= RF_tainted;
+               f = PL_fold_locale[c];
+           }
+           else
+               f = PL_fold[c];
+           if (f != c && ANYOF_BITMAP_TEST(n, f))
+               match = TRUE;
+       }
+       
+       if (!match && (flags & ANYOF_CLASS)) {
            PL_reg_flags |= RF_tainted;
-           uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
+           if (
+               (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
+               (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
+               (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
+               (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
+               (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
+               ) /* How's that for a conditional? */
+           {
+               match = TRUE;
+           }
        }
-       else
-           uv_to_utf8(tmpbuf, toLOWER_utf8(p));
-       if (swash_fetch(sw, tmpbuf))
-           match = TRUE;
     }
 
-    /* UTF8 combined with ANYOF_CLASS is ill-defined. */
-
     return (flags & ANYOF_INVERT) ? !match : match;
 }
 
@@ -3815,17 +3796,20 @@ STATIC U8 *
 S_reghop(pTHX_ U8 *s, I32 off)
 {                               
     if (off >= 0) {
-       while (off-- && s < (U8*)PL_regeol)
+       while (off-- && s < (U8*)PL_regeol) {
+           /* XXX could check well-formedness here */
            s += UTF8SKIP(s);
+       }
     }
     else {
        while (off++) {
            if (s > (U8*)PL_bostr) {
                s--;
-               if (*s & 0x80) {
-                   while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
+               if (UTF8_IS_CONTINUED(*s)) {
+                   while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
                        s--;
-               }               /* XXX could check well-formedness here */
+               }
+               /* XXX could check well-formedness here */
            }
        }
     }
@@ -3836,8 +3820,10 @@ STATIC U8 *
 S_reghopmaybe(pTHX_ U8* s, I32 off)
 {
     if (off >= 0) {
-       while (off-- && s < (U8*)PL_regeol)
+       while (off-- && s < (U8*)PL_regeol) {
+           /* XXX could check well-formedness here */
            s += UTF8SKIP(s);
+       }
        if (off >= 0)
            return 0;
     }
@@ -3845,10 +3831,11 @@ S_reghopmaybe(pTHX_ U8* s, I32 off)
        while (off++) {
            if (s > (U8*)PL_bostr) {
                s--;
-               if (*s & 0x80) {
-                   while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
+               if (UTF8_IS_CONTINUED(*s)) {
+                   while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
                        s--;
-               }               /* XXX could check well-formedness here */
+               }
+               /* XXX could check well-formedness here */
            }
            else
                break;
index 89c78e6..00dc0ec 100644 (file)
 #define        MEOL    7       /*  0x7 Same, assuming multiline. */
 #define        SEOL    8       /*  0x8 Same, assuming singleline. */
 #define        BOUND   9       /*  0x9 Match "" at any word boundary */
-#define        BOUNDUTF8       10      /*  0xa Match "" at any word boundary */
-#define        BOUNDL  11      /*  0xb Match "" at any word boundary */
-#define        BOUNDLUTF8      12      /*  0xc Match "" at any word boundary */
-#define        NBOUND  13      /*  0xd Match "" at any word non-boundary */
-#define        NBOUNDUTF8      14      /*  0xe Match "" at any word non-boundary */
-#define        NBOUNDL 15      /*  0xf Match "" at any word non-boundary */
-#define        NBOUNDLUTF8     16      /* 0x10 Match "" at any word non-boundary */
-#define        GPOS    17      /* 0x11 Matches where last m//g left off. */
-#define        REG_ANY 18      /* 0x12 Match any one character (except newline). */
-#define        ANYUTF8 19      /* 0x13 Match any one Unicode character (except newline). */
-#define        SANY    20      /* 0x14 Match any one character. */
-#define        SANYUTF8        21      /* 0x15 Match any one Unicode character. */
-#define        ANYOF   22      /* 0x16 Match character in (or not in) this class. */
-#define        ANYOFUTF8       23      /* 0x17 Match character in (or not in) this class. */
-#define        ALNUM   24      /* 0x18 Match any alphanumeric character */
-#define        ALNUMUTF8       25      /* 0x19 Match any alphanumeric character in utf8 */
-#define        ALNUML  26      /* 0x1a Match any alphanumeric char in locale */
-#define        ALNUMLUTF8      27      /* 0x1b Match any alphanumeric char in locale+utf8 */
-#define        NALNUM  28      /* 0x1c Match any non-alphanumeric character */
-#define        NALNUMUTF8      29      /* 0x1d Match any non-alphanumeric character in utf8 */
-#define        NALNUML 30      /* 0x1e Match any non-alphanumeric char in locale */
-#define        NALNUMLUTF8     31      /* 0x1f Match any non-alphanumeric char in locale+utf8 */
-#define        SPACE   32      /* 0x20 Match any whitespace character */
-#define        SPACEUTF8       33      /* 0x21 Match any whitespace character in utf8 */
-#define        SPACEL  34      /* 0x22 Match any whitespace char in locale */
-#define        SPACELUTF8      35      /* 0x23 Match any whitespace char in locale+utf8 */
-#define        NSPACE  36      /* 0x24 Match any non-whitespace character */
-#define        NSPACEUTF8      37      /* 0x25 Match any non-whitespace character in utf8 */
-#define        NSPACEL 38      /* 0x26 Match any non-whitespace char in locale */
-#define        NSPACELUTF8     39      /* 0x27 Match any non-whitespace char in locale+utf8 */
-#define        DIGIT   40      /* 0x28 Match any numeric character */
-#define        DIGITUTF8       41      /* 0x29 Match any numeric character in utf8 */
-#define        DIGITL  42      /* 0x2a Match any numeric character in locale */
-#define        DIGITLUTF8      43      /* 0x2b Match any numeric character in locale+utf8 */
-#define        NDIGIT  44      /* 0x2c Match any non-numeric character */
-#define        NDIGITUTF8      45      /* 0x2d Match any non-numeric character in utf8 */
-#define        NDIGITL 46      /* 0x2e Match any non-numeric character in locale */
-#define        NDIGITLUTF8     47      /* 0x2f Match any non-numeric character in locale+utf8 */
-#define        CLUMP   48      /* 0x30 Match any combining character sequence */
-#define        BRANCH  49      /* 0x31 Match this alternative, or the next... */
-#define        BACK    50      /* 0x32 Match "", "next" ptr points backward. */
-#define        EXACT   51      /* 0x33 Match this string (preceded by length). */
-#define        EXACTF  52      /* 0x34 Match this string, folded (prec. by length). */
-#define        EXACTFL 53      /* 0x35 Match this string, folded in locale (w/len). */
-#define        NOTHING 54      /* 0x36 Match empty string. */
-#define        TAIL    55      /* 0x37 Match empty string. Can jump here from outside. */
-#define        STAR    56      /* 0x38 Match this (simple) thing 0 or more times. */
-#define        PLUS    57      /* 0x39 Match this (simple) thing 1 or more times. */
-#define        CURLY   58      /* 0x3a Match this simple thing {n,m} times. */
-#define        CURLYN  59      /* 0x3b Match next-after-this simple thing  */
-#define        CURLYM  60      /* 0x3c Match this medium-complex thing {n,m} times. */
-#define        CURLYX  61      /* 0x3d Match this complex thing {n,m} times. */
-#define        WHILEM  62      /* 0x3e Do curly processing and see if rest matches. */
-#define        OPEN    63      /* 0x3f Mark this point in input as start of #n. */
-#define        CLOSE   64      /* 0x40 Analogous to OPEN. */
-#define        REF     65      /* 0x41 Match some already matched string */
-#define        REFF    66      /* 0x42 Match already matched string, folded */
-#define        REFFL   67      /* 0x43 Match already matched string, folded in loc. */
-#define        IFMATCH 68      /* 0x44 Succeeds if the following matches. */
-#define        UNLESSM 69      /* 0x45 Fails if the following matches. */
-#define        SUSPEND 70      /* 0x46 "Independent" sub-RE. */
-#define        IFTHEN  71      /* 0x47 Switch, should be preceeded by switcher . */
-#define        GROUPP  72      /* 0x48 Whether the group matched. */
-#define        LONGJMP 73      /* 0x49 Jump far away. */
-#define        BRANCHJ 74      /* 0x4a BRANCH with long offset. */
-#define        EVAL    75      /* 0x4b Execute some Perl code. */
-#define        MINMOD  76      /* 0x4c Next operator is not greedy. */
-#define        LOGICAL 77      /* 0x4d Next opcode should set the flag only. */
-#define        RENUM   78      /* 0x4e Group with independently numbered parens. */
-#define        OPTIMIZED       79      /* 0x4f Placeholder for dump. */
+#define        BOUNDL  10      /*  0xa Match "" at any word boundary */
+#define        NBOUND  11      /*  0xb Match "" at any word non-boundary */
+#define        NBOUNDL 12      /*  0xc Match "" at any word non-boundary */
+#define        GPOS    13      /*  0xd Matches where last m//g left off. */
+#define        REG_ANY 14      /*  0xe Match any one character (except newline). */
+#define        SANY    15      /*  0xf Match any one character. */
+#define        ANYOF   16      /* 0x10 Match character in (or not in) this class. */
+#define        ALNUM   17      /* 0x11 Match any alphanumeric character */
+#define        ALNUML  18      /* 0x12 Match any alphanumeric char in locale */
+#define        NALNUM  19      /* 0x13 Match any non-alphanumeric character */
+#define        NALNUML 20      /* 0x14 Match any non-alphanumeric char in locale */
+#define        SPACE   21      /* 0x15 Match any whitespace character */
+#define        SPACEL  22      /* 0x16 Match any whitespace char in locale */
+#define        NSPACE  23      /* 0x17 Match any non-whitespace character */
+#define        NSPACEL 24      /* 0x18 Match any non-whitespace char in locale */
+#define        DIGIT   25      /* 0x19 Match any numeric character */
+#define        DIGITL  26      /* 0x1a Match any numeric character in locale */
+#define        NDIGIT  27      /* 0x1b Match any non-numeric character */
+#define        NDIGITL 28      /* 0x1c Match any non-numeric character in locale */
+#define        CLUMP   29      /* 0x1d Match any combining character sequence */
+#define        BRANCH  30      /* 0x1e Match this alternative, or the next... */
+#define        BACK    31      /* 0x1f Match "", "next" ptr points backward. */
+#define        EXACT   32      /* 0x20 Match this string (preceded by length). */
+#define        EXACTF  33      /* 0x21 Match this string, folded (prec. by length). */
+#define        EXACTFL 34      /* 0x22 Match this string, folded in locale (w/len). */
+#define        NOTHING 35      /* 0x23 Match empty string. */
+#define        TAIL    36      /* 0x24 Match empty string. Can jump here from outside. */
+#define        STAR    37      /* 0x25 Match this (simple) thing 0 or more times. */
+#define        PLUS    38      /* 0x26 Match this (simple) thing 1 or more times. */
+#define        CURLY   39      /* 0x27 Match this simple thing {n,m} times. */
+#define        CURLYN  40      /* 0x28 Match next-after-this simple thing  */
+#define        CURLYM  41      /* 0x29 Match this medium-complex thing {n,m} times. */
+#define        CURLYX  42      /* 0x2a Match this complex thing {n,m} times. */
+#define        WHILEM  43      /* 0x2b Do curly processing and see if rest matches. */
+#define        OPEN    44      /* 0x2c Mark this point in input as start of #n. */
+#define        CLOSE   45      /* 0x2d Analogous to OPEN. */
+#define        REF     46      /* 0x2e Match some already matched string */
+#define        REFF    47      /* 0x2f Match already matched string, folded */
+#define        REFFL   48      /* 0x30 Match already matched string, folded in loc. */
+#define        IFMATCH 49      /* 0x31 Succeeds if the following matches. */
+#define        UNLESSM 50      /* 0x32 Fails if the following matches. */
+#define        SUSPEND 51      /* 0x33 "Independent" sub-RE. */
+#define        IFTHEN  52      /* 0x34 Switch, should be preceeded by switcher . */
+#define        GROUPP  53      /* 0x35 Whether the group matched. */
+#define        LONGJMP 54      /* 0x36 Jump far away. */
+#define        BRANCHJ 55      /* 0x37 BRANCH with long offset. */
+#define        EVAL    56      /* 0x38 Execute some Perl code. */
+#define        MINMOD  57      /* 0x39 Next operator is not greedy. */
+#define        LOGICAL 58      /* 0x3a Next opcode should set the flag only. */
+#define        RENUM   59      /* 0x3b Group with independently numbered parens. */
+#define        OPTIMIZED       60      /* 0x3c Placeholder for dump. */
 
 #ifndef DOINIT
 EXTCONST U8 PL_regkind[];
@@ -98,44 +79,25 @@ EXTCONST U8 PL_regkind[] = {
        EOL,            /* MEOL */
        EOL,            /* SEOL */
        BOUND,          /* BOUND */
-       BOUND,          /* BOUNDUTF8 */
        BOUND,          /* BOUNDL */
-       BOUND,          /* BOUNDLUTF8 */
        NBOUND,         /* NBOUND */
-       NBOUND,         /* NBOUNDUTF8 */
        NBOUND,         /* NBOUNDL */
-       NBOUND,         /* NBOUNDLUTF8 */
        GPOS,           /* GPOS */
        REG_ANY,                /* REG_ANY */
-       REG_ANY,                /* ANYUTF8 */
        REG_ANY,                /* SANY */
-       REG_ANY,                /* SANYUTF8 */
        ANYOF,          /* ANYOF */
-       ANYOF,          /* ANYOFUTF8 */
        ALNUM,          /* ALNUM */
-       ALNUM,          /* ALNUMUTF8 */
        ALNUM,          /* ALNUML */
-       ALNUM,          /* ALNUMLUTF8 */
        NALNUM,         /* NALNUM */
-       NALNUM,         /* NALNUMUTF8 */
        NALNUM,         /* NALNUML */
-       NALNUM,         /* NALNUMLUTF8 */
        SPACE,          /* SPACE */
-       SPACE,          /* SPACEUTF8 */
        SPACE,          /* SPACEL */
-       SPACE,          /* SPACELUTF8 */
        NSPACE,         /* NSPACE */
-       NSPACE,         /* NSPACEUTF8 */
        NSPACE,         /* NSPACEL */
-       NSPACE,         /* NSPACELUTF8 */
        DIGIT,          /* DIGIT */
-       DIGIT,          /* DIGITUTF8 */
        DIGIT,          /* DIGITL */
-       DIGIT,          /* DIGITLUTF8 */
        NDIGIT,         /* NDIGIT */
-       NDIGIT,         /* NDIGITUTF8 */
        NDIGIT,         /* NDIGITL */
-       NDIGIT,         /* NDIGITLUTF8 */
        CLUMP,          /* CLUMP */
        BRANCH,         /* BRANCH */
        BACK,           /* BACK */
@@ -184,44 +146,25 @@ static const U8 regarglen[] = {
        0,              /* MEOL */
        0,              /* SEOL */
        0,              /* BOUND */
-       0,              /* BOUNDUTF8 */
        0,              /* BOUNDL */
-       0,              /* BOUNDLUTF8 */
        0,              /* NBOUND */
-       0,              /* NBOUNDUTF8 */
        0,              /* NBOUNDL */
-       0,              /* NBOUNDLUTF8 */
        0,              /* GPOS */
        0,              /* REG_ANY */
-       0,              /* ANYUTF8 */
        0,              /* SANY */
-       0,              /* SANYUTF8 */
        0,              /* ANYOF */
-       EXTRA_SIZE(struct regnode_1),           /* ANYOFUTF8 */
        0,              /* ALNUM */
-       0,              /* ALNUMUTF8 */
        0,              /* ALNUML */
-       0,              /* ALNUMLUTF8 */
        0,              /* NALNUM */
-       0,              /* NALNUMUTF8 */
        0,              /* NALNUML */
-       0,              /* NALNUMLUTF8 */
        0,              /* SPACE */
-       0,              /* SPACEUTF8 */
        0,              /* SPACEL */
-       0,              /* SPACELUTF8 */
        0,              /* NSPACE */
-       0,              /* NSPACEUTF8 */
        0,              /* NSPACEL */
-       0,              /* NSPACELUTF8 */
        0,              /* DIGIT */
-       0,              /* DIGITUTF8 */
        0,              /* DIGITL */
-       0,              /* DIGITLUTF8 */
        0,              /* NDIGIT */
-       0,              /* NDIGITUTF8 */
        0,              /* NDIGITL */
-       0,              /* NDIGITLUTF8 */
        0,              /* CLUMP */
        0,              /* BRANCH */
        0,              /* BACK */
@@ -267,44 +210,25 @@ static const char reg_off_by_arg[] = {
        0,              /* MEOL */
        0,              /* SEOL */
        0,              /* BOUND */
-       0,              /* BOUNDUTF8 */
        0,              /* BOUNDL */
-       0,              /* BOUNDLUTF8 */
        0,              /* NBOUND */
-       0,              /* NBOUNDUTF8 */
        0,              /* NBOUNDL */
-       0,              /* NBOUNDLUTF8 */
        0,              /* GPOS */
        0,              /* REG_ANY */
-       0,              /* ANYUTF8 */
        0,              /* SANY */
-       0,              /* SANYUTF8 */
        0,              /* ANYOF */
-       0,              /* ANYOFUTF8 */
        0,              /* ALNUM */
-       0,              /* ALNUMUTF8 */
        0,              /* ALNUML */
-       0,              /* ALNUMLUTF8 */
        0,              /* NALNUM */
-       0,              /* NALNUMUTF8 */
        0,              /* NALNUML */
-       0,              /* NALNUMLUTF8 */
        0,              /* SPACE */
-       0,              /* SPACEUTF8 */
        0,              /* SPACEL */
-       0,              /* SPACELUTF8 */
        0,              /* NSPACE */
-       0,              /* NSPACEUTF8 */
        0,              /* NSPACEL */
-       0,              /* NSPACELUTF8 */
        0,              /* DIGIT */
-       0,              /* DIGITUTF8 */
        0,              /* DIGITL */
-       0,              /* DIGITLUTF8 */
        0,              /* NDIGIT */
-       0,              /* NDIGITUTF8 */
        0,              /* NDIGITL */
-       0,              /* NDIGITLUTF8 */
        0,              /* CLUMP */
        0,              /* BRANCH */
        0,              /* BACK */
@@ -351,79 +275,60 @@ static const char * const reg_name[] = {
        "MEOL",         /*  0x7 */
        "SEOL",         /*  0x8 */
        "BOUND",                /*  0x9 */
-       "BOUNDUTF8",            /*  0xa */
-       "BOUNDL",               /*  0xb */
-       "BOUNDLUTF8",           /*  0xc */
-       "NBOUND",               /*  0xd */
-       "NBOUNDUTF8",           /*  0xe */
-       "NBOUNDL",              /*  0xf */
-       "NBOUNDLUTF8",          /* 0x10 */
-       "GPOS",         /* 0x11 */
-       "REG_ANY",              /* 0x12 */
-       "ANYUTF8",              /* 0x13 */
-       "SANY",         /* 0x14 */
-       "SANYUTF8",             /* 0x15 */
-       "ANYOF",                /* 0x16 */
-       "ANYOFUTF8",            /* 0x17 */
-       "ALNUM",                /* 0x18 */
-       "ALNUMUTF8",            /* 0x19 */
-       "ALNUML",               /* 0x1a */
-       "ALNUMLUTF8",           /* 0x1b */
-       "NALNUM",               /* 0x1c */
-       "NALNUMUTF8",           /* 0x1d */
-       "NALNUML",              /* 0x1e */
-       "NALNUMLUTF8",          /* 0x1f */
-       "SPACE",                /* 0x20 */
-       "SPACEUTF8",            /* 0x21 */
-       "SPACEL",               /* 0x22 */
-       "SPACELUTF8",           /* 0x23 */
-       "NSPACE",               /* 0x24 */
-       "NSPACEUTF8",           /* 0x25 */
-       "NSPACEL",              /* 0x26 */
-       "NSPACELUTF8",          /* 0x27 */
-       "DIGIT",                /* 0x28 */
-       "DIGITUTF8",            /* 0x29 */
-       "DIGITL",               /* 0x2a */
-       "DIGITLUTF8",           /* 0x2b */
-       "NDIGIT",               /* 0x2c */
-       "NDIGITUTF8",           /* 0x2d */
-       "NDIGITL",              /* 0x2e */
-       "NDIGITLUTF8",          /* 0x2f */
-       "CLUMP",                /* 0x30 */
-       "BRANCH",               /* 0x31 */
-       "BACK",         /* 0x32 */
-       "EXACT",                /* 0x33 */
-       "EXACTF",               /* 0x34 */
-       "EXACTFL",              /* 0x35 */
-       "NOTHING",              /* 0x36 */
-       "TAIL",         /* 0x37 */
-       "STAR",         /* 0x38 */
-       "PLUS",         /* 0x39 */
-       "CURLY",                /* 0x3a */
-       "CURLYN",               /* 0x3b */
-       "CURLYM",               /* 0x3c */
-       "CURLYX",               /* 0x3d */
-       "WHILEM",               /* 0x3e */
-       "OPEN",         /* 0x3f */
-       "CLOSE",                /* 0x40 */
-       "REF",          /* 0x41 */
-       "REFF",         /* 0x42 */
-       "REFFL",                /* 0x43 */
-       "IFMATCH",              /* 0x44 */
-       "UNLESSM",              /* 0x45 */
-       "SUSPEND",              /* 0x46 */
-       "IFTHEN",               /* 0x47 */
-       "GROUPP",               /* 0x48 */
-       "LONGJMP",              /* 0x49 */
-       "BRANCHJ",              /* 0x4a */
-       "EVAL",         /* 0x4b */
-       "MINMOD",               /* 0x4c */
-       "LOGICAL",              /* 0x4d */
-       "RENUM",                /* 0x4e */
-       "OPTIMIZED",            /* 0x4f */
+       "BOUNDL",               /*  0xa */
+       "NBOUND",               /*  0xb */
+       "NBOUNDL",              /*  0xc */
+       "GPOS",         /*  0xd */
+       "REG_ANY",              /*  0xe */
+       "SANY",         /*  0xf */
+       "ANYOF",                /* 0x10 */
+       "ALNUM",                /* 0x11 */
+       "ALNUML",               /* 0x12 */
+       "NALNUM",               /* 0x13 */
+       "NALNUML",              /* 0x14 */
+       "SPACE",                /* 0x15 */
+       "SPACEL",               /* 0x16 */
+       "NSPACE",               /* 0x17 */
+       "NSPACEL",              /* 0x18 */
+       "DIGIT",                /* 0x19 */
+       "DIGITL",               /* 0x1a */
+       "NDIGIT",               /* 0x1b */
+       "NDIGITL",              /* 0x1c */
+       "CLUMP",                /* 0x1d */
+       "BRANCH",               /* 0x1e */
+       "BACK",         /* 0x1f */
+       "EXACT",                /* 0x20 */
+       "EXACTF",               /* 0x21 */
+       "EXACTFL",              /* 0x22 */
+       "NOTHING",              /* 0x23 */
+       "TAIL",         /* 0x24 */
+       "STAR",         /* 0x25 */
+       "PLUS",         /* 0x26 */
+       "CURLY",                /* 0x27 */
+       "CURLYN",               /* 0x28 */
+       "CURLYM",               /* 0x29 */
+       "CURLYX",               /* 0x2a */
+       "WHILEM",               /* 0x2b */
+       "OPEN",         /* 0x2c */
+       "CLOSE",                /* 0x2d */
+       "REF",          /* 0x2e */
+       "REFF",         /* 0x2f */
+       "REFFL",                /* 0x30 */
+       "IFMATCH",              /* 0x31 */
+       "UNLESSM",              /* 0x32 */
+       "SUSPEND",              /* 0x33 */
+       "IFTHEN",               /* 0x34 */
+       "GROUPP",               /* 0x35 */
+       "LONGJMP",              /* 0x36 */
+       "BRANCHJ",              /* 0x37 */
+       "EVAL",         /* 0x38 */
+       "MINMOD",               /* 0x39 */
+       "LOGICAL",              /* 0x3a */
+       "RENUM",                /* 0x3b */
+       "OPTIMIZED",            /* 0x3c */
 };
 
-static const int reg_num = 80;
+static const int reg_num = 61;
 
 #endif /* DEBUGGING */
 #endif /* REG_COMP_C */
diff --git a/sv.c b/sv.c
index 1dafbf6..1fbf83f 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -4522,11 +4522,9 @@ Perl_sv_len_utf8(pTHX_ register SV *sv)
     if (!sv)
        return 0;
 
-#ifdef NOTYET
     if (SvGMAGICAL(sv))
        return mg_length(sv);
     else
-#endif
     {
        STRLEN len;
        U8 *s = (U8*)SvPV(sv, len);
index ac42b85..cd9d56a 100644 (file)
@@ -5,6 +5,8 @@ BEGIN {
     @INC = '../lib';
 }
 
+no utf8; # this test contains raw 8-bit data on purpose; don't switch to \x{}
+
 print "1..78\n";
 
 my $test = 1;
index 6986720..89416dc 100755 (executable)
@@ -10,7 +10,7 @@ BEGIN {
     }
 }
 
-print "1..90\n";
+print "1..104\n";
 
 my $test = 1;
 
@@ -42,6 +42,7 @@ sub nok_bytes {
 
 {
     use utf8;
+
     $_ = ">\x{263A}<"; 
     s/([\x{80}-\x{10ffff}])/"&#".ord($1).";"/eg; 
     ok $_, '>&#9786;<';
@@ -106,212 +107,191 @@ sub nok_bytes {
 }
 
 {
-    use utf8;
-
-    $_ = "\x{263A}>\x{263A}\x{263A}"; 
-
-    ok length, 4;
-    $test++;                           # 13
-
-    ok length((m/>(.)/)[0]), 1;
-    $test++;                           # 14
-
-    ok length($&), 2;
-    $test++;                           # 15
+    # no use utf8 needed
+    $_ = "\x{263A}\x{263A}x\x{263A}y\x{263A}";
+    
+    ok length($_), 6;                  # 13
+    $test++;
 
-    ok length($'), 1;
-    $test++;                           # 16
+    ($a) = m/x(.)/;
 
-    ok length($`), 1;
-    $test++;                           # 17
+    ok length($a), 1;                  # 14
+    $test++;
 
-    ok length($1), 1;
-    $test++;                           # 18
+    ok length($`), 2;                  # 15
+    $test++;
+    ok length($&), 2;                  # 16
+    $test++;
+    ok length($'), 2;                  # 17
+    $test++;
 
-    ok length($tmp=$&), 2;
-    $test++;                           # 19
+    ok length($1), 1;                  # 18
+    $test++;
 
-    ok length($tmp=$'), 1;
-    $test++;                           # 20
+    ok length($b=$`), 2;               # 19
+    $test++;
 
-    ok length($tmp=$`), 1;
-    $test++;                           # 21
+    ok length($b=$&), 2;               # 20
+    $test++;
 
-    ok length($tmp=$1), 1;
-    $test++;                           # 22
+    ok length($b=$'), 2;               # 21
+    $test++;
 
-    {
-       use bytes;
+    ok length($b=$1), 1;               # 22
+    $test++;
 
-       my $tmp = $&;
-       ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
-       $test++;                                # 23
+    ok $a, "\x{263A}";                 # 23
+    $test++;
 
-       $tmp = $';
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 24
+    ok $`, "\x{263A}\x{263A}";         # 24
+    $test++;
 
-       $tmp = $`;
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 25
+    ok $&, "x\x{263A}";                        # 25
+    $test++;
 
-       $tmp = $1;
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 26
-    }
+    ok $', "y\x{263A}";                        # 26
+    $test++;
 
-    ok_bytes $&, pack("C*", ord(">"), 0342, 0230, 0272);
-    $test++;                           # 27
+    ok $1, "\x{263A}";                 # 27
+    $test++;
 
-    ok_bytes $', pack("C*", 0342, 0230, 0272);
-    $test++;                           # 28
+    ok_bytes $a, "\342\230\272";       # 28
+    $test++;
 
-    ok_bytes $`, pack("C*", 0342, 0230, 0272);
-    $test++;                           # 29
+    ok_bytes $1, "\342\230\272";       # 29
+    $test++;
 
-    ok_bytes $1, pack("C*", 0342, 0230, 0272);
-    $test++;                           # 30
+    ok_bytes $&, "x\342\230\272";      # 30
+    $test++;
 
     {
-       use bytes;
-       no utf8;
-
-       ok length, 10;
-       $test++;                                # 31
+       use utf8; # required
+       $_ = chr(0x263A) . chr(0x263A) . 'x' . chr(0x263A) . 'y' . chr(0x263A);
+    }
 
-       ok length((m/>(.)/)[0]), 1;
-       $test++;                                # 32
+    ok length($_), 6;                  # 31
+    $test++;
 
-       ok length($&), 2;
-       $test++;                                # 33
+    ($a) = m/x(.)/;
 
-       ok length($'), 5;
-       $test++;                                # 34
+    ok length($a), 1;                  # 32
+    $test++;
 
-       ok length($`), 3;
-       $test++;                                # 35
+    ok length($`), 2;                  # 33
+    $test++;
 
-       ok length($1), 1;
-       $test++;                                # 36
+    ok length($&), 2;                  # 34
+    $test++;
 
-       ok $&, pack("C*", ord(">"), 0342);
-       $test++;                                # 37
+    ok length($'), 2;                  # 35
+    $test++;
 
-       ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
-       $test++;                                # 38
+    ok length($1), 1;                  # 36
+    $test++;
 
-       ok $`, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 39
+    ok length($b=$`), 2;               # 37
+    $test++;
 
-       ok $1, pack("C*", 0342);
-       $test++;                                # 40
-    }
+    ok length($b=$&), 2;               # 38
+    $test++;
 
-    {
-       no utf8;
-       $_="\342\230\272>\342\230\272\342\230\272";
-    }
+    ok length($b=$'), 2;               # 39
+    $test++;
 
-    ok length, 10;
-    $test++;                           # 41
+    ok length($b=$1), 1;               # 40
+    $test++;
 
-    ok length((m/>(.)/)[0]), 1;
-    $test++;                           # 42
+    ok $a, "\x{263A}";                 # 41
+    $test++;
 
-    ok length($&), 2;
-    $test++;                           # 43
+    ok $`, "\x{263A}\x{263A}";         # 42
+    $test++;
 
-    ok length($'), 1;
-    $test++;                           # 44
+    ok $&, "x\x{263A}";                        # 43
+    $test++;
 
-    ok length($`), 1;
-    $test++;                           # 45
+    ok $', "y\x{263A}";                        # 44
+    $test++;
 
-    ok length($1), 1;
-    $test++;                           # 46
+    ok $1, "\x{263A}";                 # 45
+    $test++;
 
-    ok length($tmp=$&), 2;
-    $test++;                           # 47
+    ok_bytes $a, "\342\230\272";       # 46
+    $test++;
 
-    ok length($tmp=$'), 1;
-    $test++;                           # 48
+    ok_bytes $1, "\342\230\272";       # 47
+    $test++;
 
-    ok length($tmp=$`), 1;
-    $test++;                           # 49
+    ok_bytes $&, "x\342\230\272";      # 48
+    $test++;
 
-    ok length($tmp=$1), 1;
-    $test++;                           # 50
+    $_ = "\342\230\272\342\230\272x\342\230\272y\342\230\272";
 
-    {
-       use bytes;
+    ok length($_), 14;                 # 49
+    $test++;
 
-        my $tmp = $&;
-       ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
-       $test++;                                # 51
+    ($a) = m/x(.)/;
 
-        $tmp = $';
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 52
+    ok length($a), 1;                  # 50
+    $test++;
 
-        $tmp = $`;
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 53
+    ok length($`), 6;                  # 51
+    $test++;
 
-        $tmp = $1;
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 54
-    }
+    ok length($&), 2;                  # 52
+    $test++;
 
-    {
-       use bytes;
-       no utf8;
+    ok length($'), 6;                  # 53
+    $test++;
 
-       ok length, 10;
-       $test++;                                # 55
+    ok length($1), 1;                  # 54
+    $test++;
 
-       ok length((m/>(.)/)[0]), 1;
-       $test++;                                # 56
+    ok length($b=$`), 6;               # 55
+    $test++;
 
-       ok length($&), 2;
-       $test++;                                # 57
+    ok length($b=$&), 2;               # 56
+    $test++;
 
-       ok length($'), 5;
-       $test++;                                # 58
+    ok length($b=$'), 6;               # 57
+    $test++;
 
-       ok length($`), 3;
-       $test++;                                # 59
+    ok length($b=$1), 1;               # 58
+    $test++;
 
-       ok length($1), 1;
-       $test++;                                # 60
+    ok $a, "\342";                     # 59
+    $test++;
 
-       ok $&, pack("C*", ord(">"), 0342);
-       $test++;                                # 61
+    ok $`, "\342\230\272\342\230\272"; # 60
+    $test++;
 
-       ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
-       $test++;                                # 62
+    ok $&, "x\342";                    # 61
+    $test++;
 
-       ok $`, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 63
+    ok $', "\230\272y\342\230\272";    # 62
+    $test++;
 
-       ok $1, pack("C*", 0342);
-       $test++;                                # 64
-    }
+    ok $1, "\342";                     # 63
+    $test++;
+}
 
+{
+    use utf8;
     ok "\x{ab}" =~ /^\x{ab}$/, 1;
-    $test++;                                   # 65
+    $test++;                           # 64
 }
 
 {
     use utf8;
     ok_bytes chr(0xe2), pack("C*", 0xc3, 0xa2);
-    $test++;                # 66
+    $test++;                # 65
 }
 
 {
     use utf8;
     my @a = map ord, split(//, join("", map chr, (1234, 123, 2345)));
     ok "@a", "1234 123 2345";
-    $test++;                # 67
+    $test++;                # 66
 }
 
 {
@@ -319,7 +299,7 @@ sub nok_bytes {
     my $x = chr(123);
     my @a = map ord, split(/$x/, join("", map chr, (1234, 123, 2345)));
     ok "@a", "1234 2345";
-    $test++;                # 68
+    $test++;                # 67
 }
 
 {
@@ -331,10 +311,10 @@ sub nok_bytes {
     { use utf8;  $b = "\xe4"     } # \xXX must not produce UTF-8
 
     print "not " if $a eq $b;
-    print "ok $test\n"; $test++;
+    print "ok $test\n"; $test++;       # 68
 
     { use utf8; print "not " if $a eq $b; }
-    print "ok $test\n"; $test++;
+    print "ok $test\n"; $test++;       # 69
 }
 
 {
@@ -344,7 +324,7 @@ sub nok_bytes {
     for (@x) {
        s/(\d+)\s*([\w\-]+)/$1 . uc $2/e;
        my($latin) = /^(.+)(?:\s+\d)/;
-       print $latin eq "stra\337e" ? "ok $test\n" :
+       print $latin eq "stra\337e" ? "ok $test\n" :    # 70, 71
            "#latin[$latin]\nnot ok $test\n";
        $test++;
        $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a
@@ -369,7 +349,7 @@ sub nok_bytes {
     }
 
     print "not " unless $r eq " U+B36C U+5A8C U+FF5B U+5079 U+505B";
-    print "ok $test\n";
+    print "ok $test\n";                        # 72
     $test++;
 }
 
@@ -384,27 +364,27 @@ sub nok_bytes {
     print "not "
        unless $a eq "\x20" && $b eq "\x{80}\x{100}\x{80}" && $c eq $a;
     print "ok $test\n";
-    $test++;
+    $test++;                           # 73
 
     my ($a, $b) = split(/\x{100}/, $s);
     print "not " unless $a eq "\x20\x40\x{80}" && $b eq "\x{80}\x40\x20";
     print "ok $test\n";
-    $test++;
+    $test++;                           # 74
 
     my ($a, $b) = split(/\x{80}\x{100}\x{80}/, $s);
     print "not " unless $a eq "\x20\x40" && $b eq "\x40\x20";
     print "ok $test\n";
-    $test++;
+    $test++;                           # 75
 
     my ($a, $b) = split(/\x40\x{80}/, $s);
     print "not " unless $a eq "\x20" && $b eq "\x{100}\x{80}\x40\x20";
     print "ok $test\n";
-    $test++;
+    $test++;                           # 76
 
     my ($a, $b, $c) = split(/[\x40\x{80}]+/, $s);
     print "not " unless $a eq "\x20" && $b eq "\x{100}" && $c eq "\x20";
     print "ok $test\n";
-    $test++;
+    $test++;                           # 77
 }
 
 {
@@ -414,14 +394,14 @@ sub nok_bytes {
 
     my $smiley = "\x{263a}";
 
-    for my $s ("\x{263a}",                     #  1
-              $smiley,                        #  2
+    for my $s ("\x{263a}",                     # 78
+              $smiley,                        # 79
                
-              "" . $smiley,                   #  3
-              "" . "\x{263a}",                #  4
+              "" . $smiley,                   # 80
+              "" . "\x{263a}",                # 81
 
-              $smiley    . "",                #  5
-              "\x{263a}" . "",                #  6
+              $smiley    . "",                # 82
+              "\x{263a}" . "",                # 83
               ) {
        my $length_chars = length($s);
        my $length_bytes;
@@ -437,14 +417,14 @@ sub nok_bytes {
        $test++;
     }
 
-    for my $s ("\x{263a}" . "\x{263a}",        #  7
-              $smiley    . $smiley,           #  8
+    for my $s ("\x{263a}" . "\x{263a}",        # 84
+              $smiley    . $smiley,           # 85
 
-              "\x{263a}\x{263a}",             #  9
-              "$smiley$smiley",               # 10
+              "\x{263a}\x{263a}",             # 86
+              "$smiley$smiley",               # 87
               
-              "\x{263a}" x 2,                 # 11
-              $smiley    x 2,                 # 12
+              "\x{263a}" x 2,                 # 88
+              $smiley    x 2,                 # 89
               ) {
        my $length_chars = length($s);
        my $length_bytes;
@@ -460,3 +440,106 @@ sub nok_bytes {
        $test++;
     }
 }
+
+{
+    use utf8;
+
+    print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\xd4";
+    print "ok $test\n";
+    $test++;                                   # 90
+
+    print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}";
+    print "ok $test\n";
+    $test++;                                   # 91
+
+    print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}";
+    print "ok $test\n";
+    $test++;                                   # 92
+
+    print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\xd4";
+    print "ok $test\n";
+    $test++;                                   # 93
+
+    print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4";
+    print "ok $test\n";
+    $test++;                                   # 94
+
+    print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}";
+    print "ok $test\n";
+    $test++;                                   # 95
+
+    print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}";
+    print "ok $test\n";
+    $test++;                                   # 96
+
+    print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4";
+    print "ok $test\n";
+    $test++;                                   # 97
+}
+
+{
+    # the first half of 20001028.003
+
+    my $X = chr(1448);
+    my ($Y) = $X =~ /(.*)/;
+    print "not " unless length $Y == 1;
+    print "ok $test\n";
+    $test++;                                   # 98
+}
+
+{
+    # 20001108.001
+
+    use utf8;
+    my $X = "Szab\x{f3},Bal\x{e1}zs";
+    my $Y = $X;
+    $Y =~ s/(B)/$1/ for 0..3;
+    print "not " unless $Y eq $X;
+    print "ok $test\n";
+    $test++;                                   # 99
+}
+
+{
+    # 20001114.001     
+
+    use utf8;
+    use charnames ':full';
+    my $text = "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}";
+    print "not " unless ord($text) == 0xc4;
+    print "ok $test\n";
+    $test++;                                    # 100
+}
+
+{
+    # 20001205.014
+
+    use utf8;
+
+    my $a = "ABC\x{263A}";
+
+    my @b = split( //, $a );
+
+    print "not " unless @b == 4;
+    print "ok $test\n";
+    $test++;                                    # 101
+
+    print "not " unless length($b[3]) == 1;
+    print "ok $test\n";
+    $test++;                                    # 102
+
+    $a =~ s/^A/Z/;
+    print "not " unless length($a) == 4;
+    print "ok $test\n";
+    $test++;                                    # 103
+}
+
+{
+    # the second half of 20001028.003
+
+    use utf8;
+    $X =~ s/^/chr(1488)/e;
+    print "not " unless length $X == 1;
+    print "ok $test\n";
+    $test++;                                   # 104
+}
+