From: Jarkko Hietaniemi Date: Sun, 17 Dec 2000 05:31:37 +0000 (+0000) Subject: Polymorphic regexps. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ffc61ed20e8bcfd3f1fbed23f6b8ef5c02664323;p=p5sagit%2Fp5-mst-13.2.git Polymorphic regexps. Fixes at least the bugs 20001028.003 (both of them...) and 20001108.001. The bugs 20001114.001 and 20001205.014 seem also to be fixed by now, probably already before this patch. p4raw-id: //depot/perl@8143 --- diff --git a/embed.h b/embed.h index 64c1eaf..3b54154 100644 --- a/embed.h +++ b/embed.h @@ -543,6 +543,7 @@ #define ref Perl_ref #define refkids Perl_refkids #define regdump Perl_regdump +#define regclass_swash Perl_regclass_swash #define pregexec Perl_pregexec #define pregfree Perl_pregfree #define pregcomp Perl_pregcomp @@ -995,7 +996,6 @@ #define regbranch S_regbranch #define reguni S_reguni #define regclass S_regclass -#define regclassutf8 S_regclassutf8 #define regcurly S_regcurly #define reg_node S_reg_node #define regpiece S_regpiece @@ -1025,7 +1025,6 @@ #define regrepeat_hard S_regrepeat_hard #define regtry S_regtry #define reginclass S_reginclass -#define reginclassutf8 S_reginclassutf8 #define regcppush S_regcppush #define regcppop S_regcppop #define regcp_set_to S_regcp_set_to @@ -2015,6 +2014,7 @@ #define ref(a,b) Perl_ref(aTHX_ a,b) #define refkids(a,b) Perl_refkids(aTHX_ a,b) #define regdump(a) Perl_regdump(aTHX_ a) +#define regclass_swash(a,b,c) Perl_regclass_swash(aTHX_ a,b,c) #define pregexec(a,b,c,d,e,f,g) Perl_pregexec(aTHX_ a,b,c,d,e,f,g) #define pregfree(a) Perl_pregfree(aTHX_ a) #define pregcomp(a,b,c) Perl_pregcomp(aTHX_ a,b,c) @@ -2459,7 +2459,6 @@ #define regbranch(a,b,c) S_regbranch(aTHX_ a,b,c) #define reguni(a,b,c,d) S_reguni(aTHX_ a,b,c,d) #define regclass(a) S_regclass(aTHX_ a) -#define regclassutf8(a) S_regclassutf8(aTHX_ a) #define regcurly(a) S_regcurly(aTHX_ a) #define reg_node(a,b) S_reg_node(aTHX_ a,b) #define regpiece(a,b) S_regpiece(aTHX_ a,b) @@ -2487,8 
+2486,7 @@ #define regrepeat(a,b) S_regrepeat(aTHX_ a,b) #define regrepeat_hard(a,b,c) S_regrepeat_hard(aTHX_ a,b,c) #define regtry(a,b) S_regtry(aTHX_ a,b) -#define reginclass(a,b) S_reginclass(aTHX_ a,b) -#define reginclassutf8(a,b) S_reginclassutf8(aTHX_ a,b) +#define reginclass(a,b,c) S_reginclass(aTHX_ a,b,c) #define regcppush(a) S_regcppush(aTHX_ a) #define regcppop() S_regcppop(aTHX) #define regcp_set_to(a) S_regcp_set_to(aTHX_ a) @@ -3950,6 +3948,8 @@ #define refkids Perl_refkids #define Perl_regdump CPerlObj::Perl_regdump #define regdump Perl_regdump +#define Perl_regclass_swash CPerlObj::Perl_regclass_swash +#define regclass_swash Perl_regclass_swash #define Perl_pregexec CPerlObj::Perl_pregexec #define pregexec Perl_pregexec #define Perl_pregfree CPerlObj::Perl_pregfree @@ -4787,8 +4787,6 @@ #define reguni S_reguni #define S_regclass CPerlObj::S_regclass #define regclass S_regclass -#define S_regclassutf8 CPerlObj::S_regclassutf8 -#define regclassutf8 S_regclassutf8 #define S_regcurly CPerlObj::S_regcurly #define regcurly S_regcurly #define S_reg_node CPerlObj::S_reg_node @@ -4845,8 +4843,6 @@ #define regtry S_regtry #define S_reginclass CPerlObj::S_reginclass #define reginclass S_reginclass -#define S_reginclassutf8 CPerlObj::S_reginclassutf8 -#define reginclassutf8 S_reginclassutf8 #define S_regcppush CPerlObj::S_regcppush #define regcppush S_regcppush #define S_regcppop CPerlObj::S_regcppop diff --git a/embed.pl b/embed.pl index 9e2bd9c..32f3ddc 100755 --- a/embed.pl +++ b/embed.pl @@ -1873,6 +1873,7 @@ Ap |void |push_scope p |OP* |ref |OP* o|I32 type p |OP* |refkids |OP* o|I32 type Ap |void |regdump |regexp* r +Ap |SV* |regclass_swash |struct regnode *n|bool doinit|SV **initsvp Ap |I32 |pregexec |regexp* prog|char* stringarg \ |char* strend|char* strbeg|I32 minend \ |SV* screamer|U32 nosave @@ -2366,7 +2367,6 @@ s |regnode*|regatom |struct RExC_state_t*|I32 * s |regnode*|regbranch |struct RExC_state_t*|I32 *|I32 s |void |reguni |struct 
RExC_state_t*|UV|char *|STRLEN* s |regnode*|regclass |struct RExC_state_t* -s |regnode*|regclassutf8 |struct RExC_state_t* s |I32 |regcurly |char * s |regnode*|reg_node |struct RExC_state_t*|U8 s |regnode*|regpiece |struct RExC_state_t*|I32 * @@ -2401,8 +2401,7 @@ s |I32 |regmatch |regnode *prog s |I32 |regrepeat |regnode *p|I32 max s |I32 |regrepeat_hard |regnode *p|I32 max|I32 *lp s |I32 |regtry |regexp *prog|char *startpos -s |bool |reginclass |regnode *p|I32 c -s |bool |reginclassutf8 |regnode *f|U8* p +s |bool |reginclass |regnode *n|U8 *p|bool do_utf8sv_is_utf8 s |CHECKPOINT|regcppush |I32 parenfloor s |char*|regcppop s |char*|regcp_set_to |I32 ss diff --git a/mg.c b/mg.c index f97c6ce..a61d167 100644 --- a/mg.c +++ b/mg.c @@ -391,7 +391,7 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg) case '5': case '6': case '7': case '8': case '9': case '&': if (PL_curpm && (rx = PL_curpm->op_pmregexp)) { - paren = atoi(mg->mg_ptr); + paren = atoi(mg->mg_ptr); /* $& is in [0] */ getparen: if (paren <= rx->nparens && (s1 = rx->startp[paren]) != -1 && @@ -399,17 +399,15 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg) { i = t1 - s1; getlen: - if (i > 0 && (PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) { - char *s = rx->subbeg + s1; + if (i > 0 && DO_UTF8(PL_reg_sv)) { + char *s = rx->subbeg + s1; char *send = rx->subbeg + t1; - i = 0; - while (s < send) { - s += UTF8SKIP(s); - i++; - } + + i = Perl_utf8_length((U8*)s, (U8*)send); } - if (i >= 0) - return i; + if (i < 0) + Perl_croak(aTHX_ "panic: magic_len: %d", i); + return i; } } return 0; @@ -604,7 +602,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) * Pre-threads, this was paren = atoi(GvENAME((GV*)mg->mg_obj)); * XXX Does the new way break anything? 
*/ - paren = atoi(mg->mg_ptr); + paren = atoi(mg->mg_ptr); /* $& is in [0] */ getparen: if (paren <= rx->nparens && (s1 = rx->startp[paren]) != -1 && @@ -623,7 +621,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) PL_tainted = FALSE; } sv_setpvn(sv, s, i); - if ((PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) + if (DO_UTF8(PL_reg_sv)) SvUTF8_on(sv); else SvUTF8_off(sv); diff --git a/objXSUB.h b/objXSUB.h index 43537d3..60c6e90 100644 --- a/objXSUB.h +++ b/objXSUB.h @@ -1263,6 +1263,10 @@ #define Perl_regdump pPerl->Perl_regdump #undef regdump #define regdump Perl_regdump +#undef Perl_regclass_swash +#define Perl_regclass_swash pPerl->Perl_regclass_swash +#undef regclass_swash +#define regclass_swash Perl_regclass_swash #undef Perl_pregexec #define Perl_pregexec pPerl->Perl_pregexec #undef pregexec diff --git a/pp_ctl.c b/pp_ctl.c index d079e4a..aff5815 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -157,7 +157,7 @@ PP(pp_substcont) register char *m = cx->sb_m; char *orig = cx->sb_orig; register REGEXP *rx = cx->sb_rx; - + rxres_restore(&cx->sb_rxres, rx); if (cx->sb_iters++) { @@ -176,8 +176,8 @@ PP(pp_substcont) : (REXEC_COPY_STR|REXEC_IGNOREPOS|REXEC_NOT_FIRST)))) { SV *targ = cx->sb_targ; - sv_catpvn(dstr, s, cx->sb_strend - s); + sv_catpvn(dstr, s, cx->sb_strend - s); cx->sb_rxtainted |= RX_MATCH_TAINTED(rx); (void)SvOOK_off(targ); @@ -189,9 +189,11 @@ PP(pp_substcont) sv_free(dstr); TAINT_IF(cx->sb_rxtainted & 1); + if (pm->op_pmdynflags & PMdf_UTF8) + SvUTF8_on(targ); /* could also copy SvUTF8(dstr)? */ PUSHs(sv_2mortal(newSViv((I32)cx->sb_iters - 1))); - (void)SvPOK_only(targ); + (void)SvPOK_only_UTF8(targ); TAINT_IF(cx->sb_rxtainted); SvSETMAGIC(targ); SvTAINT(targ); @@ -209,7 +211,8 @@ PP(pp_substcont) cx->sb_strend = s + (cx->sb_strend - m); } cx->sb_m = m = rx->startp[0] + orig; - sv_catpvn(dstr, s, m-s); + if (m > s) + sv_catpvn(dstr, s, m-s); cx->sb_s = rx->endp[0] + orig; { /* Update the pos() information. 
*/ SV *sv = cx->sb_targ; diff --git a/pp_hot.c b/pp_hot.c index 6a5b96f..2904d9f 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -1179,6 +1179,7 @@ PP(pp_match) TARG = DEFSV; EXTEND(SP,1); } + PL_reg_sv = TARG; PUTBACK; /* EVAL blocks need stack_sp. */ s = SvPV(TARG, len); strend = s + len; @@ -1268,27 +1269,25 @@ play_it_again: RX_MATCH_TAINTED_on(rx); TAINT_IF(RX_MATCH_TAINTED(rx)); if (gimme == G_ARRAY) { - I32 iters, i, len; + I32 nparens, i, len; - iters = rx->nparens; - if (global && !iters) + nparens = rx->nparens; + if (global && !nparens) i = 1; else i = 0; SPAGAIN; /* EVAL blocks could move the stack. */ - EXTEND(SP, iters + i); - EXTEND_MORTAL(iters + i); - for (i = !i; i <= iters; i++) { + EXTEND(SP, nparens + i); + EXTEND_MORTAL(nparens + i); + for (i = !i; i <= nparens; i++) { PUSHs(sv_newmortal()); /*SUPPRESS 560*/ if ((rx->startp[i] != -1) && rx->endp[i] != -1 ) { len = rx->endp[i] - rx->startp[i]; s = rx->startp[i] + truebase; sv_setpvn(*SP, s, len); - if ((pm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) { + if (DO_UTF8(TARG)) SvUTF8_on(*SP); - sv_utf8_downgrade(*SP, TRUE); - } } } if (global) { @@ -1298,7 +1297,7 @@ play_it_again: r_flags |= REXEC_IGNOREPOS | REXEC_NOT_FIRST; goto play_it_again; } - else if (!iters) + else if (!nparens) XPUSHs(&PL_sv_yes); LEAVE_SCOPE(oldsave); RETURN; @@ -1831,6 +1830,7 @@ PP(pp_subst) TARG = DEFSV; EXTEND(SP,1); } + PL_reg_sv = TARG; if (SvFAKE(TARG) && SvREADONLY(TARG)) sv_force_normal(TARG); if (SvREADONLY(TARG) @@ -1847,7 +1847,7 @@ PP(pp_subst) if (PL_tainted) rxtainted |= 2; TAINT_NOT; - + force_it: if (!pm || !s) DIE(aTHX_ "panic: do_subst"); @@ -2004,6 +2004,8 @@ PP(pp_subst) rxtainted |= RX_MATCH_TAINTED(rx); dstr = NEWSV(25, len); sv_setpvn(dstr, m, s-m); + if (DO_UTF8(TARG)) + SvUTF8_on(dstr); PL_curpm = pm; if (!c) { register PERL_CONTEXT *cx; @@ -2030,7 +2032,8 @@ PP(pp_subst) sv_catpvn(dstr, c, clen); if (once) break; - } while (CALLREGEXEC(aTHX_ rx, s, strend, orig, s == m, TARG, NULL, r_flags)); + } while 
(CALLREGEXEC(aTHX_ rx, s, strend, orig, s == m, + TARG, NULL, r_flags)); sv_catpvn(dstr, s, strend - s); (void)SvOOK_off(TARG); diff --git a/proto.h b/proto.h index 4fc260e..1bcb5cd 100644 --- a/proto.h +++ b/proto.h @@ -616,6 +616,7 @@ PERL_CALLCONV void Perl_push_scope(pTHX); PERL_CALLCONV OP* Perl_ref(pTHX_ OP* o, I32 type); PERL_CALLCONV OP* Perl_refkids(pTHX_ OP* o, I32 type); PERL_CALLCONV void Perl_regdump(pTHX_ regexp* r); +PERL_CALLCONV SV* Perl_regclass_swash(pTHX_ struct regnode *n, bool doinit, SV **initsvp); PERL_CALLCONV I32 Perl_pregexec(pTHX_ regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, U32 nosave); PERL_CALLCONV void Perl_pregfree(pTHX_ struct regexp* r); PERL_CALLCONV regexp* Perl_pregcomp(pTHX_ char* exp, char* xend, PMOP* pm); @@ -1111,7 +1112,6 @@ STATIC regnode* S_regatom(pTHX_ struct RExC_state_t*, I32 *); STATIC regnode* S_regbranch(pTHX_ struct RExC_state_t*, I32 *, I32); STATIC void S_reguni(pTHX_ struct RExC_state_t*, UV, char *, STRLEN*); STATIC regnode* S_regclass(pTHX_ struct RExC_state_t*); -STATIC regnode* S_regclassutf8(pTHX_ struct RExC_state_t*); STATIC I32 S_regcurly(pTHX_ char *); STATIC regnode* S_reg_node(pTHX_ struct RExC_state_t*, U8); STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t*, I32 *); @@ -1141,8 +1141,7 @@ STATIC I32 S_regmatch(pTHX_ regnode *prog); STATIC I32 S_regrepeat(pTHX_ regnode *p, I32 max); STATIC I32 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp); STATIC I32 S_regtry(pTHX_ regexp *prog, char *startpos); -STATIC bool S_reginclass(pTHX_ regnode *p, I32 c); -STATIC bool S_reginclassutf8(pTHX_ regnode *f, U8* p); +STATIC bool S_reginclass(pTHX_ regnode *n, U8 *p, bool do_utf8sv_is_utf8); STATIC CHECKPOINT S_regcppush(pTHX_ I32 parenfloor); STATIC char* S_regcppop(pTHX); STATIC char* S_regcp_set_to(pTHX_ I32 ss); diff --git a/regcomp.c b/regcomp.c index aae2ced..69a9f91 100644 --- a/regcomp.c +++ b/regcomp.c @@ -118,7 +118,7 @@ typedef struct RExC_state_t { char 
*end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ I32 whilem_seen; /* number of WHILEM in this expr */ - regnode *emit; /* Code-emit pointer; &regdummy = don't */ + regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; @@ -234,8 +234,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define LOC (RExC_flags16 & PMf_LOCALE) #define FOLD (RExC_flags16 & PMf_FOLD) -#define OOB_CHAR8 1234 -#define OOB_UTF8 123456 +#define OOB_UNICODE 12345678 #define OOB_NAMEDCLASS -1 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) @@ -1196,7 +1195,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg break; } } - else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) { + else if (strchr((char*)PL_simple,OP(scan))) { int value; if (flags & SCF_DO_SUBSTR) { @@ -1210,20 +1209,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* Some of the logic below assumes that switching locale on will only add false positives. 
*/ switch (PL_regkind[(U8)OP(scan)]) { - case ANYUTF8: case SANY: - case SANYUTF8: - case ALNUMUTF8: - case ANYOFUTF8: - case ALNUMLUTF8: - case NALNUMUTF8: - case NALNUMLUTF8: - case SPACEUTF8: - case NSPACEUTF8: - case SPACELUTF8: - case NSPACELUTF8: - case DIGITUTF8: - case NDIGITUTF8: default: do_default: /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ @@ -1750,7 +1736,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* turn .* into ^.* with an implied $*=1 */ int type = OP(NEXTOPER(first)); - if (type == REG_ANY || type == ANYUTF8) + if (type == REG_ANY) type = ROPT_ANCH_MBOL; else type = ROPT_ANCH_SBOL; @@ -1850,8 +1836,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) longest_fixed_length = 0; } if (r->regstclass - && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8 - || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY)) + && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY)) r->regstclass = NULL; if ((!r->anchored_substr || r->anchored_offset) && stclass_flag && !(data.start_class->flags & ANYOF_EOS) @@ -1866,6 +1851,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) struct regnode_charclass_class); r->regstclass = (regnode*)RExC_rx->data->data[n]; r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). 
*/ + PL_regdata = r->data; /* for regprop() */ DEBUG_r((sv = sv_newmortal(), regprop(sv, (regnode*)data.start_class), PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", @@ -1933,7 +1919,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->reganch |= ROPT_EVAL_SEEN; Newz(1002, r->startp, RExC_npar, I32); Newz(1002, r->endp, RExC_npar, I32); - PL_regdata = r->data; /* for regprop() ANYOFUTF8 */ + PL_regdata = r->data; /* for regprop() */ DEBUG_r(regdump(r)); return(r); } @@ -2556,26 +2542,17 @@ tryagain: break; case '.': nextchar(pRExC_state); - if (UTF) { - if (RExC_flags16 & PMf_SINGLELINE) - ret = reg_node(pRExC_state, SANYUTF8); - else - ret = reg_node(pRExC_state, ANYUTF8); - *flagp |= HASWIDTH; - } - else { - if (RExC_flags16 & PMf_SINGLELINE) - ret = reg_node(pRExC_state, SANY); - else - ret = reg_node(pRExC_state, REG_ANY); - *flagp |= HASWIDTH|SIMPLE; - } + if (RExC_flags16 & PMf_SINGLELINE) + ret = reg_node(pRExC_state, SANY); + else + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; break; case '[': { char *oregcomp_parse = ++RExC_parse; - ret = (UTF ? regclassutf8(pRExC_state) : regclass(pRExC_state)); + ret = regclass(pRExC_state); if (*RExC_parse != ']') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ["); @@ -2659,20 +2636,14 @@ tryagain: is_utf8_mark((U8*)"~"); /* preload table */ break; case 'w': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? ALNUMLUTF8 : ALNUMUTF8) - : (LOC ? ALNUML : ALNUM)); + ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) is_utf8_alnum((U8*)"a"); /* preload table */ break; case 'W': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? NALNUMLUTF8 : NALNUMUTF8) - : (LOC ? NALNUML : NALNUM)); + ret = reg_node(pRExC_state, LOC ? 
NALNUML : NALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) @@ -2681,10 +2652,7 @@ tryagain: case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, - UTF - ? (LOC ? BOUNDLUTF8 : BOUNDUTF8) - : (LOC ? BOUNDL : BOUND)); + ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND); *flagp |= SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) @@ -2693,44 +2661,35 @@ tryagain: case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, - UTF - ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8) - : (LOC ? NBOUNDL : NBOUND)); + ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND); *flagp |= SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) is_utf8_alnum((U8*)"a"); /* preload table */ break; case 's': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? SPACELUTF8 : SPACEUTF8) - : (LOC ? SPACEL : SPACE)); + ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_space) is_utf8_space((U8*)" "); /* preload table */ break; case 'S': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? NSPACELUTF8 : NSPACEUTF8) - : (LOC ? NSPACEL : NSPACE)); + ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_space) is_utf8_space((U8*)" "); /* preload table */ break; case 'd': - ret = reg_node(pRExC_state, UTF ? DIGITUTF8 : DIGIT); + ret = reg_node(pRExC_state, DIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_digit) is_utf8_digit((U8*)"1"); /* preload table */ break; case 'D': - ret = reg_node(pRExC_state, UTF ? 
NDIGITUTF8 : NDIGIT); + ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_digit) @@ -2754,7 +2713,7 @@ tryagain: RExC_end = RExC_parse + 2; RExC_parse--; - ret = regclassutf8(pRExC_state); + ret = regclass(pRExC_state); RExC_end = oldregxend; RExC_parse--; @@ -3194,58 +3153,108 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state) { - register U32 value; - register I32 lastvalue = OOB_CHAR8; - register I32 range = 0; + register UV value; + register IV lastvalue = OOB_UNICODE; + register IV range = 0; register regnode *ret; STRLEN numlen; - I32 namedclass; + IV namedclass; char *rangebegin; bool need_class = 0; + SV *listsv; + register char *e; + UV n; + + ret = reganode(pRExC_state, ANYOF, 0); + + if (!SIZE_ONLY) + ANYOF_FLAGS(ret) = 0; + + if (*RExC_parse == '^') { /* Complement of range. */ + RExC_naughty++; + RExC_parse++; + if (!SIZE_ONLY) + ANYOF_FLAGS(ret) |= ANYOF_INVERT; + } - ret = reg_node(pRExC_state, ANYOF); if (SIZE_ONLY) RExC_size += ANYOF_SKIP; else { - ret->flags = 0; - ANYOF_BITMAP_ZERO(ret); RExC_emit += ANYOF_SKIP; if (FOLD) ANYOF_FLAGS(ret) |= ANYOF_FOLD; if (LOC) ANYOF_FLAGS(ret) |= ANYOF_LOCALE; - } - if (*RExC_parse == '^') { /* Complement of range. 
*/ - RExC_naughty++; - RExC_parse++; - if (!SIZE_ONLY) - ANYOF_FLAGS(ret) |= ANYOF_INVERT; + ANYOF_BITMAP_ZERO(ret); + listsv = newSVpvn("# comment\n", 10); } if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) checkposixcc(pRExC_state); if (*RExC_parse == ']' || *RExC_parse == '-') - goto skipcond; /* allow 1st char to be ] or - */ + goto charclassloop; /* allow 1st char to be ] or - */ + while (RExC_parse < RExC_end && *RExC_parse != ']') { - skipcond: - namedclass = OOB_NAMEDCLASS; + + charclassloop: + + namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ + if (!range) rangebegin = RExC_parse; - value = UCHARAT(RExC_parse++); + if (UTF) { + value = utf8_to_uv((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, 0); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); if (value == '[') namedclass = regpposixcc(pRExC_state, value); else if (value == '\\') { - value = UCHARAT(RExC_parse++); + if (UTF) { + value = utf8_to_uv((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, 0); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); /* Some compilers cannot handle switching on 64-bit integer - * values, therefore the 'value' cannot be an UV. --jhi */ - switch (value) { + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. 
--jhi */ + switch ((I32)value) { case 'w': namedclass = ANYOF_ALNUM; break; case 'W': namedclass = ANYOF_NALNUM; break; case 's': namedclass = ANYOF_SPACE; break; case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'p': + case 'P': + if (*RExC_parse == '{') { + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL("Missing right brace on \\p{}"); + n = e - RExC_parse; + } + else { + e = RExC_parse; + n = 1; + } + if (!SIZE_ONLY) { + if (value == 'p') + Perl_sv_catpvf(aTHX_ listsv, + "+utf8::%.*s\n", (int)n, RExC_parse); + else + Perl_sv_catpvf(aTHX_ listsv, + "!utf8::%.*s\n", (int)n, RExC_parse); + } + RExC_parse = e + 1; + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + continue; case 'n': value = '\n'; break; case 'r': value = '\r'; break; case 't': value = '\t'; break; @@ -3259,9 +3268,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) case 'a': value = '\057'; break; #endif case 'x': - numlen = 0; /* disallow underscores */ - value = (UV)scan_hex(RExC_parse, 2, &numlen); - RExC_parse += numlen; + if (*RExC_parse == '{') { + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL("Missing right brace on \\x{}"); + numlen = 1; /* allow underscores */ + value = (UV)scan_hex(RExC_parse, + e - RExC_parse, + &numlen); + RExC_parse = e + 1; + } + else { + numlen = 0; /* disallow underscores */ + value = (UV)scan_hex(RExC_parse, 2, &numlen); + RExC_parse += numlen; + } break; case 'c': value = UCHARAT(RExC_parse++); @@ -3275,16 +3296,22 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) break; default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) - - vWARN2(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value); + vWARN2(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); break; } - } - if (namedclass > OOB_NAMEDCLASS) { - if (!need_class && !SIZE_ONLY) + } /* end of \blah */ + + if (namedclass > OOB_NAMEDCLASS) { /* this is a 
named class \blah */ + + if (!SIZE_ONLY && !need_class) ANYOF_CLASS_ZERO(ret); + need_class = 1; - if (range) { /* a-\d, a-[:digit:] */ + + /* a bad range like a-\d, a-[:digit:] ? */ + if (range) { if (!SIZE_ONLY) { if (ckWARN(WARN_REGEXP)) vWARN4(RExC_parse, @@ -3292,11 +3319,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) RExC_parse - rangebegin, RExC_parse - rangebegin, rangebegin); - ANYOF_BITMAP_SET(ret, lastvalue); - ANYOF_BITMAP_SET(ret, '-'); + if (lastvalue < 256) { + ANYOF_BITMAP_SET(ret, lastvalue); + ANYOF_BITMAP_SET(ret, '-'); + } + else { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + Perl_sv_catpvf(aTHX_ listsv, + /* 0x002D is Unicode for '-' */ + "%04"UVxf"\n002D\n", (UV)lastvalue); + } } - range = 0; /* this is not a true range */ + + range = 0; /* this was not a true range */ } + if (!SIZE_ONLY) { switch (namedclass) { case ANYOF_ALNUM: @@ -3307,6 +3344,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; case ANYOF_NALNUM: if (LOC) @@ -3316,42 +3354,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; - case ANYOF_SPACE: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_SPACE); - else { - for (value = 0; value < 256; value++) - if (isSPACE(value)) - ANYOF_BITMAP_SET(ret, value); - } - break; - case ANYOF_NSPACE: + case ANYOF_ALNUMC: if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NSPACE); + ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); else { for (value = 0; value < 256; value++) - if (!isSPACE(value)) + if (isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - break; - case ANYOF_DIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_DIGIT); - else { - for (value = '0'; value <= '9'; value++) - ANYOF_BITMAP_SET(ret, value); - } - break; - case ANYOF_NDIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); - else { - for (value = 0; value < '0'; value++) - ANYOF_BITMAP_SET(ret, 
value); - for (value = '9' + 1; value < 256; value++) - ANYOF_BITMAP_SET(ret, value); - } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; case ANYOF_NALNUMC: if (LOC) @@ -3361,15 +3374,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - break; - case ANYOF_ALNUMC: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); - else { - for (value = 0; value < 256; value++) - if (isALNUMC(value)) - ANYOF_BITMAP_SET(ret, value); - } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; case ANYOF_ALPHA: if (LOC) @@ -3379,6 +3384,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; case ANYOF_NALPHA: if (LOC) @@ -3388,6 +3394,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; case ANYOF_ASCII: if (LOC) @@ -3402,6 +3409,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_BITMAP_SET(ret, value); #endif /* EBCDIC */ } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; case ANYOF_NASCII: if (LOC) @@ -3416,6 +3424,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_BITMAP_SET(ret, value); #endif /* EBCDIC */ } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; case ANYOF_BLANK: if (LOC) @@ -3425,6 +3434,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; case ANYOF_NBLANK: if (LOC) @@ -3434,6 +3444,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; case ANYOF_CNTRL: if (LOC) @@ -3443,7 +3454,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } - lastvalue = OOB_CHAR8; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); 
break; case ANYOF_NCNTRL: if (LOC) @@ -3453,6 +3464,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); + break; + case ANYOF_DIGIT: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_DIGIT); + else { + /* consecutive digits assumed */ + for (value = '0'; value <= '9'; value++) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); + break; + case ANYOF_NDIGIT: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); + else { + /* consecutive digits assumed */ + for (value = 0; value < '0'; value++) + ANYOF_BITMAP_SET(ret, value); + for (value = '9' + 1; value < 256; value++) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; case ANYOF_GRAPH: if (LOC) @@ -3462,6 +3496,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; case ANYOF_NGRAPH: if (LOC) @@ -3471,6 +3506,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; case ANYOF_LOWER: if (LOC) @@ -3480,6 +3516,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; case ANYOF_NLOWER: if (LOC) @@ -3489,6 +3526,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; case ANYOF_PRINT: if (LOC) @@ -3498,6 +3536,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; case ANYOF_NPRINT: if (LOC) @@ -3507,6 +3546,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, 
"!utf8::IsPrint\n"); break; case ANYOF_PSXSPC: if (LOC) @@ -3516,6 +3556,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; case ANYOF_NPSXSPC: if (LOC) @@ -3525,6 +3566,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_PUNCT: if (LOC) @@ -3534,6 +3576,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; case ANYOF_NPUNCT: if (LOC) @@ -3543,6 +3586,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); + break; + case ANYOF_SPACE: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_SPACE); + else { + for (value = 0; value < 256; value++) + if (isSPACE(value)) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n"); + break; + case ANYOF_NSPACE: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NSPACE); + else { + for (value = 0; value < 256; value++) + if (!isSPACE(value)) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n"); break; case ANYOF_UPPER: if (LOC) @@ -3552,6 +3616,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; case ANYOF_NUPPER: if (LOC) @@ -3561,6 +3626,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; case ANYOF_XDIGIT: if (LOC) @@ -3570,6 +3636,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; case ANYOF_NXDIGIT: if (LOC) @@ -3579,6 
+3646,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; default: vFAIL("Invalid [::] class"); @@ -3588,7 +3656,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_FLAGS(ret) |= ANYOF_CLASS; continue; } - } + } /* end of namedclass \blah */ + if (range) { if (lastvalue > value) /* b-a */ { Simple_vFAIL4("Invalid [] range \"%*.*s\"", @@ -3596,14 +3665,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) RExC_parse - rangebegin, rangebegin); } - range = 0; + range = 0; /* not a true range */ } else { - lastvalue = value; + lastvalue = value; /* save the beginning of the range */ if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && RExC_parse[1] != ']') { RExC_parse++; - if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ + + /* a bad range like \w-, [:word:]- ? */ + if (namedclass > OOB_NAMEDCLASS) { if (ckWARN(WARN_REGEXP)) vWARN4(RExC_parse, "False [] range \"%*.*s\"", @@ -3613,325 +3684,89 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!SIZE_ONLY) ANYOF_BITMAP_SET(ret, '-'); } else - range = 1; - continue; /* do it next time */ + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ } } + /* now is the next time */ if (!SIZE_ONLY) { + if (lastvalue < 256 && value < 256) { #ifndef ASCIIish /* EBCDIC, for example. 
*/ - if ((isLOWER(lastvalue) && isLOWER(value)) || - (isUPPER(lastvalue) && isUPPER(value))) - { - I32 i; - if (isLOWER(lastvalue)) { - for (i = lastvalue; i <= value; i++) - if (isLOWER(i)) - ANYOF_BITMAP_SET(ret, i); - } else { - for (i = lastvalue; i <= value; i++) - if (isUPPER(i)) - ANYOF_BITMAP_SET(ret, i); + if ((isLOWER(lastvalue) && isLOWER(value)) || + (isUPPER(lastvalue) && isUPPER(value))) + { + IV i; + if (isLOWER(lastvalue)) { + for (i = lastvalue; i <= value; i++) + if (isLOWER(i)) + ANYOF_BITMAP_SET(ret, i); + } else { + for (i = lastvalue; i <= value; i++) + if (isUPPER(i)) + ANYOF_BITMAP_SET(ret, i); + } } - } - else + else #endif - for ( ; lastvalue <= value; lastvalue++) - ANYOF_BITMAP_SET(ret, lastvalue); + for ( ; lastvalue <= value; lastvalue++) + ANYOF_BITMAP_SET(ret, lastvalue); + } else { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + if (lastvalue < value) + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", + (UV)lastvalue, (UV)value); + else + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", + (UV)value); + } } - range = 0; + + range = 0; /* this range (if it was one) is done now */ } + if (need_class) { if (SIZE_ONLY) RExC_size += ANYOF_CLASS_ADD_SKIP; else RExC_emit += ANYOF_CLASS_ADD_SKIP; } + /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ if (!SIZE_ONLY && - (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) { + (ANYOF_FLAGS(ret) & + /* If the only flag is folding (plus possibly inversion). */ + (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) { for (value = 0; value < 256; ++value) { if (ANYOF_BITMAP_TEST(ret, value)) { - I32 cf = PL_fold[value]; - ANYOF_BITMAP_SET(ret, cf); + IV fold = PL_fold[value]; + + if (fold != value) + ANYOF_BITMAP_SET(ret, fold); } } ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; } + /* optimize inverted simple patterns (e.g. [^a-z]) */ - if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { + if (!SIZE_ONLY && + /* If the only flag is inversion. 
*/ + (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; ANYOF_FLAGS(ret) = 0; } - return ret; -} -STATIC regnode * -S_regclassutf8(pTHX_ RExC_state_t *pRExC_state) -{ - register char *e; - register U32 value; - register U32 lastvalue = OOB_UTF8; - register I32 range = 0; - register regnode *ret; - STRLEN numlen; - I32 n; - SV *listsv; - U8 flags = 0; - I32 namedclass; - char *rangebegin; - - if (*RExC_parse == '^') { /* Complement of range. */ - RExC_naughty++; - RExC_parse++; - if (!SIZE_ONLY) - flags |= ANYOF_INVERT; - } - if (!SIZE_ONLY) { - if (FOLD) - flags |= ANYOF_FOLD; - if (LOC) - flags |= ANYOF_LOCALE; - listsv = newSVpvn("# comment\n", 10); - } - - if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) - checkposixcc(pRExC_state); - - if (*RExC_parse == ']' || *RExC_parse == '-') - goto skipcond; /* allow 1st char to be ] or - */ - - while (RExC_parse < RExC_end && *RExC_parse != ']') { - skipcond: - namedclass = OOB_NAMEDCLASS; - if (!range) - rangebegin = RExC_parse; - value = utf8_to_uv((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, 0); - RExC_parse += numlen; - if (value == '[') - namedclass = regpposixcc(pRExC_state, value); - else if (value == '\\') { - value = (U32)utf8_to_uv((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, 0); - RExC_parse += numlen; - /* Some compilers cannot handle switching on 64-bit integer - * values, therefore value cannot be an UV. Yes, this will - * be a problem later if we want switch on Unicode. 
--jhi */ - switch (value) { - case 'w': namedclass = ANYOF_ALNUM; break; - case 'W': namedclass = ANYOF_NALNUM; break; - case 's': namedclass = ANYOF_SPACE; break; - case 'S': namedclass = ANYOF_NSPACE; break; - case 'd': namedclass = ANYOF_DIGIT; break; - case 'D': namedclass = ANYOF_NDIGIT; break; - case 'p': - case 'P': - if (*RExC_parse == '{') { - e = strchr(RExC_parse++, '}'); - if (!e) - vFAIL("Missing right brace on \\p{}"); - n = e - RExC_parse; - } - else { - e = RExC_parse; - n = 1; - } - if (!SIZE_ONLY) { - if (value == 'p') - Perl_sv_catpvf(aTHX_ listsv, - "+utf8::%.*s\n", (int)n, RExC_parse); - else - Perl_sv_catpvf(aTHX_ listsv, - "!utf8::%.*s\n", (int)n, RExC_parse); - } - RExC_parse = e + 1; - lastvalue = OOB_UTF8; - continue; - case 'n': value = '\n'; break; - case 'r': value = '\r'; break; - case 't': value = '\t'; break; - case 'f': value = '\f'; break; - case 'b': value = '\b'; break; -#ifdef ASCIIish - case 'e': value = '\033'; break; - case 'a': value = '\007'; break; -#else - case 'e': value = '\047'; break; - case 'a': value = '\057'; break; -#endif - case 'x': - if (*RExC_parse == '{') { - e = strchr(RExC_parse++, '}'); - if (!e) - vFAIL("Missing right brace on \\x{}"); - numlen = 1; /* allow underscores */ - value = (UV)scan_hex(RExC_parse, - e - RExC_parse, - &numlen); - RExC_parse = e + 1; - } - else { - numlen = 0; /* disallow underscores */ - value = (UV)scan_hex(RExC_parse, 2, &numlen); - RExC_parse += numlen; - } - break; - case 'c': - value = UCHARAT(RExC_parse++); - value = toCTRL(value); - break; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - numlen = 0; /* disallow underscores */ - value = (UV)scan_oct(--RExC_parse, 3, &numlen); - RExC_parse += numlen; - break; - default: - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) - vWARN2(RExC_parse, - "Unrecognized escape \\%c in character class passed through", - (int)value); - break; - } - } - if (namedclass > 
OOB_NAMEDCLASS) { - if (range) { /* a-\d, a-[:digit:] */ - if (!SIZE_ONLY) { - if (ckWARN(WARN_REGEXP)) - vWARN4(RExC_parse, - "False [] range \"%*.*s\"", - RExC_parse - rangebegin, - RExC_parse - rangebegin, - rangebegin); - Perl_sv_catpvf(aTHX_ listsv, - /* 0x002D is Unicode for '-' */ - "%04"UVxf"\n002D\n", (UV)lastvalue); - } - range = 0; - } - if (!SIZE_ONLY) { - switch (namedclass) { - case ANYOF_ALNUM: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; - case ANYOF_NALNUM: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; - case ANYOF_ALNUMC: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; - case ANYOF_NALNUMC: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; - case ANYOF_ALPHA: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; - case ANYOF_NALPHA: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; - case ANYOF_ASCII: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; - case ANYOF_NASCII: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; - case ANYOF_CNTRL: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; - case ANYOF_NCNTRL: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break; - case ANYOF_GRAPH: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; - case ANYOF_NGRAPH: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; - case ANYOF_DIGIT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break; - case ANYOF_NDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; - case ANYOF_LOWER: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; - case ANYOF_NLOWER: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; - case ANYOF_PRINT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; - case ANYOF_NPRINT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; - case ANYOF_PUNCT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; - case ANYOF_NPUNCT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; - case 
ANYOF_SPACE: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break; - case ANYOF_NSPACE: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break; - case ANYOF_BLANK: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; - case ANYOF_NBLANK: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; - case ANYOF_PSXSPC: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; - case ANYOF_NPSXSPC: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; - case ANYOF_UPPER: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; - case ANYOF_NUPPER: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; - case ANYOF_XDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; - case ANYOF_NXDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; - } - continue; - } - } - if (range) { - if (lastvalue > value) { /* b-a */ - Simple_vFAIL4("Invalid [] range \"%*.*s\"", - RExC_parse - rangebegin, - RExC_parse - rangebegin, - rangebegin); - } - range = 0; - } - else { - lastvalue = value; - if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && - RExC_parse[1] != ']') { - RExC_parse++; - if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ - if (ckWARN(WARN_REGEXP)) - vWARN4(RExC_parse, - "False [] range \"%*.*s\"", - RExC_parse - rangebegin, - RExC_parse - rangebegin, - rangebegin); - if (!SIZE_ONLY) - Perl_sv_catpvf(aTHX_ listsv, - /* 0x002D is Unicode for '-' */ - "002D\n"); - } else - range = 1; - continue; /* do it next time */ - } - } - /* now is the next time */ - if (!SIZE_ONLY) - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - (UV)lastvalue, (UV)value); - range = 0; - } - - ret = reganode(pRExC_state, ANYOFUTF8, 0); - - if (!SIZE_ONLY) { - SV *rv = swash_init("utf8", "", listsv, 1, 0); -#ifdef DEBUGGING + if (!SIZE_ONLY) { AV *av = newAV(); - av_push(av, rv); - av_push(av, listsv); - rv = newRV_inc((SV*)av); -#else - SvREFCNT_dec(listsv); -#endif + SV *rv; + + av_store(av, 0, listsv); + av_store(av, 1, NULL); 
+ rv = newRV_noinc((SV*)av); n = add_data(pRExC_state, 1, "s"); RExC_rx->data->data[n] = (void*)rv; - ARG1_SET(ret, flags); - ARG2_SET(ret, n); + ARG_SET(ret, n); } return ret; @@ -4269,7 +4104,7 @@ Perl_regdump(pTHX_ regexp *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (isCNTRL(c) || c == 127 || c == 255) + if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); else if (c == '-' || c == ']' || c == '\\' || c == '^') Perl_sv_catpvf(aTHX_ sv, "\\%c", c); @@ -4311,8 +4146,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) { int i, rangestart = -1; - bool anyofutf8 = OP(o) == ANYOFUTF8; - U8 flags = anyofutf8 ? ARG1(o) : o->flags; + U8 flags = ANYOF_FLAGS(o); const char * const anyofs[] = { /* Should be syncronized with * ANYOF_ #xdefines in regcomp.h */ "\\w", @@ -4354,78 +4188,93 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) sv_catpv(sv, "^"); - if (OP(o) == ANYOF) { - for (i = 0; i <= 256; i++) { - if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { + for (i = 0; i <= 256; i++) { + if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) put_byte(sv, rangestart); - sv_catpv(sv, "-"); - put_byte(sv, i - 1); - } - rangestart = -1; + else { + put_byte(sv, rangestart); + sv_catpv(sv, "-"); + put_byte(sv, i - 1); } + rangestart = -1; } - if (o->flags & ANYOF_CLASS) - for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++) - if (ANYOF_CLASS_TEST(o,i)) - sv_catpv(sv, anyofs[i]); } - else { - SV *rv = (SV*)PL_regdata->data[ARG2(o)]; - AV *av = (AV*)SvRV((SV*)rv); - SV *sw = *av_fetch(av, 0, 
FALSE); - SV *lv = *av_fetch(av, 1, FALSE); - UV i; - U8 s[UTF8_MAXLEN+1]; - for (i = 0; i <= 256; i++) { /* just the first 256 */ - U8 *e = uv_to_utf8(s, i); - if (i < 256 && swash_fetch(sw, s)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - U8 *p; - - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) { - for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++) - put_byte(sv, *p); + + if (o->flags & ANYOF_CLASS) + for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++) + if (ANYOF_CLASS_TEST(o,i)) + sv_catpv(sv, anyofs[i]); + + if (flags & ANYOF_UNICODE) + sv_catpv(sv, "{unicode}"); + + { + SV *lv; + SV *sw = regclass_swash(o, FALSE, &lv); + + if (lv) { + if (sw) { + UV i; + U8 s[UTF8_MAXLEN+1]; + + for (i = 0; i <= 256; i++) { /* just the first 256 */ + U8 *e = uv_to_utf8(s, i); + + if (i < 256 && swash_fetch(sw, s)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + U8 *p; + + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) { + for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++) + put_byte(sv, *p); + } + else { + for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++) + put_byte(sv, *p); + sv_catpv(sv, "-"); + for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++) + put_byte(sv, *p); + } + rangestart = -1; + } } - else { - for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++) - put_byte(sv, *p); - sv_catpv(sv, "-"); - for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++) - put_byte(sv, *p); - } - rangestart = -1; + + sv_catpv(sv, "..."); /* et cetera */ } - } - sv_catpv(sv, "..."); - { - char *s = savepv(SvPVX(lv)); - - while(*s && *s != '\n') s++; - if (*s == '\n') { - char *t = ++s; - while (*s) { - if (*s == '\n') - *s = ' '; - s++; + { + char *s = savepv(SvPVX(lv)); + char *origs = s; + + while(*s && *s != '\n') s++; + + if (*s == '\n') { + char *t = ++s; + + while (*s) { + if (*s == '\n') + *s = ' '; + s++; + } + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); } - if (s[-1] == ' 
') - s[-1] = 0; - - sv_catpv(sv, t); + + Safefree(origs); } } } + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -4486,16 +4335,6 @@ Perl_pregfree(pTHX_ struct regexp *r) while (--n >= 0) { switch (r->data->what[n]) { case 's': -#ifdef DEBUGGING - { - SV *rv = (SV*)r->data->data[n]; - AV *av = (AV*)SvRV((SV*)rv); - SV *sw = *av_fetch(av, 0, FALSE); - SV *lv = *av_fetch(av, 1, FALSE); - SvREFCNT_dec(sw); - SvREFCNT_dec(lv); - } -#endif SvREFCNT_dec((SV*)r->data->data[n]); break; case 'f': @@ -4657,4 +4496,3 @@ clear_re(pTHXo_ void *r) { ReREFCNT_dec((regexp *)r); } - diff --git a/regcomp.h b/regcomp.h index 284cf2f..c8094e1 100644 --- a/regcomp.h +++ b/regcomp.h @@ -88,12 +88,13 @@ struct regnode_2 { }; #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ -#define ANYOF_CLASSBITMAP_SIZE 4 +#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ struct regnode_charclass { U8 flags; U8 type; U16 next_off; + U32 arg1; char bitmap[ANYOF_BITMAP_SIZE]; }; @@ -101,6 +102,7 @@ struct regnode_charclass_class { U8 flags; U8 type; U16 next_off; + U32 arg1; char bitmap[ANYOF_BITMAP_SIZE]; char classflags[ANYOF_CLASSBITMAP_SIZE]; }; @@ -180,13 +182,21 @@ struct regnode_charclass_class { /* Flags for node->flags of ANYOF */ -#define ANYOF_CLASS 0x08 -#define ANYOF_INVERT 0x04 -#define ANYOF_FOLD 0x02 -#define ANYOF_LOCALE 0x01 +#define ANYOF_CLASS 0x08 +#define ANYOF_INVERT 0x04 +#define ANYOF_FOLD 0x02 +#define ANYOF_LOCALE 0x01 /* Used for regstclass only */ -#define ANYOF_EOS 0x10 /* Can match an empty string too */ +#define ANYOF_EOS 0x10 /* Can match an empty string too */ + +/* There is a character or a range past 0xff */ +#define ANYOF_UNICODE 0x20 + +/* Are there any runtime flags on in this node? 
*/ +#define ANYOF_RUNTIME(s) (ANYOF_FLAGS(s) & 0x0f) + +#define ANYOF_FLAGS_ALL 0xff /* Character classes for node->classflags of ANYOF */ /* Should be synchronized with a table in regprop() */ @@ -220,7 +230,7 @@ struct regnode_charclass_class { #define ANYOF_NXDIGIT 25 #define ANYOF_PSXSPC 26 /* POSIX space: \s plus the vertical tab */ #define ANYOF_NPSXSPC 27 -#define ANYOF_BLANK 28 /* GNU extension: space and tab */ +#define ANYOF_BLANK 28 /* GNU extension: space and tab: non-vertical space */ #define ANYOF_NBLANK 29 #define ANYOF_MAX 32 @@ -238,7 +248,6 @@ struct regnode_charclass_class { #define ANYOF_CLASS_SIZE (sizeof(struct regnode_charclass_class)) #define ANYOF_FLAGS(p) ((p)->flags) -#define ANYOF_FLAGS_ALL 0xff #define ANYOF_BIT(c) (1 << ((c) & 7)) @@ -300,12 +309,14 @@ EXTCONST U8 PL_varies[] = { EXTCONST U8 PL_simple[]; #else EXTCONST U8 PL_simple[] = { - REG_ANY, ANYUTF8, SANY, SANYUTF8, ANYOF, ANYOFUTF8, - ALNUM, ALNUMUTF8, ALNUML, ALNUMLUTF8, - NALNUM, NALNUMUTF8, NALNUML, NALNUMLUTF8, - SPACE, SPACEUTF8, SPACEL, SPACELUTF8, - NSPACE, NSPACEUTF8, NSPACEL, NSPACELUTF8, - DIGIT, DIGITUTF8, NDIGIT, NDIGITUTF8, 0 + REG_ANY, SANY, + ANYOF, + ALNUM, ALNUML, + NALNUM, NALNUML, + SPACE, SPACEL, + NSPACE, NSPACEL, + DIGIT, NDIGIT, + 0 }; #endif diff --git a/regcomp.sym b/regcomp.sym index bb5f8f8..59284f4 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -16,46 +16,27 @@ EOL EOL, no Match "" at end of line. MEOL EOL, no Same, assuming multiline. SEOL EOL, no Same, assuming singleline. BOUND BOUND, no Match "" at any word boundary -BOUNDUTF8 BOUND, no Match "" at any word boundary BOUNDL BOUND, no Match "" at any word boundary -BOUNDLUTF8 BOUND, no Match "" at any word boundary NBOUND NBOUND, no Match "" at any word non-boundary -NBOUNDUTF8 NBOUND, no Match "" at any word non-boundary NBOUNDL NBOUND, no Match "" at any word non-boundary -NBOUNDLUTF8 NBOUND, no Match "" at any word non-boundary GPOS GPOS, no Matches where last m//g left off. 
# [Special] alternatives REG_ANY REG_ANY, no Match any one character (except newline). -ANYUTF8 REG_ANY, no Match any one Unicode character (except newline). SANY REG_ANY, no Match any one character. -SANYUTF8 REG_ANY, no Match any one Unicode character. ANYOF ANYOF, sv Match character in (or not in) this class. -ANYOFUTF8 ANYOF, sv 1 Match character in (or not in) this class. ALNUM ALNUM, no Match any alphanumeric character -ALNUMUTF8 ALNUM, no Match any alphanumeric character in utf8 ALNUML ALNUM, no Match any alphanumeric char in locale -ALNUMLUTF8 ALNUM, no Match any alphanumeric char in locale+utf8 NALNUM NALNUM, no Match any non-alphanumeric character -NALNUMUTF8 NALNUM, no Match any non-alphanumeric character in utf8 NALNUML NALNUM, no Match any non-alphanumeric char in locale -NALNUMLUTF8 NALNUM, no Match any non-alphanumeric char in locale+utf8 SPACE SPACE, no Match any whitespace character -SPACEUTF8 SPACE, no Match any whitespace character in utf8 SPACEL SPACE, no Match any whitespace char in locale -SPACELUTF8 SPACE, no Match any whitespace char in locale+utf8 NSPACE NSPACE, no Match any non-whitespace character -NSPACEUTF8 NSPACE, no Match any non-whitespace character in utf8 NSPACEL NSPACE, no Match any non-whitespace char in locale -NSPACELUTF8 NSPACE, no Match any non-whitespace char in locale+utf8 DIGIT DIGIT, no Match any numeric character -DIGITUTF8 DIGIT, no Match any numeric character in utf8 DIGITL DIGIT, no Match any numeric character in locale -DIGITLUTF8 DIGIT, no Match any numeric character in locale+utf8 NDIGIT NDIGIT, no Match any non-numeric character -NDIGITUTF8 NDIGIT, no Match any non-numeric character in utf8 NDIGITL NDIGIT, no Match any non-numeric character in locale -NDIGITLUTF8 NDIGIT, no Match any non-numeric character in locale+utf8 CLUMP CLUMP, no Match any combining character sequence # BRANCH The set of branches constituting a single choice are hooked diff --git a/regexec.c b/regexec.c index 5e821ba..ac91bea 100644 --- 
a/regexec.c +++ b/regexec.c @@ -105,13 +105,6 @@ * Forwards. */ -#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c)) -#ifdef DEBUGGING -# define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch(*av_fetch((AV*)SvRV((SV*)PL_regdata->data[ARG2(f)]),0,FALSE),p)) -#else -# define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p)) -#endif - #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) @@ -738,7 +731,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, t = s; if (prog->reganch & ROPT_UTF8) { - PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */ + PL_regdata = prog->data; PL_bostr = startpos; } s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1); @@ -840,25 +833,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta unsigned int c2; char *e; register I32 tmp = 1; /* Scratch variable? */ + register bool do_utf8 = DO_UTF8(PL_reg_sv); /* We know what class it must start with. */ switch (OP(c)) { - case ANYOFUTF8: - while (s < strend) { - if (REGINCLASSUTF8(c, (U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; case ANYOF: while (s < strend) { - if (REGINCLASS(c, *(U8*)s)) { + if (reginclass(c, (U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; else @@ -866,7 +847,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s++; + s += do_utf8 ? UTF8SKIP(s) : 1; } break; case EXACTF: @@ -912,42 +893,40 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUND: - tmp = (s != startpos) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == BOUND ? 
isALNUM(tmp) : isALNUM_LC(tmp)) != 0); - while (s < strend) { - if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { - tmp = !tmp; - if ((norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + if (s == startpos) + tmp = '\n'; + else { + U8 *r = reghop((U8*)s, -1); + + tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); + } + tmp = ((OP(c) == BOUND ? + isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); + while (s < strend) { + if (tmp == !(OP(c) == BOUND ? + swash_fetch(PL_utf8_alnum, (U8*)s) : + isALNUM_LC_utf8((U8*)s))) + { + tmp = !tmp; + if ((norun || regtry(prog, s))) + goto got_it; + } + s += UTF8SKIP(s); } - s++; } - if ((!prog->minlen && tmp) && (norun || regtry(prog, s))) - goto got_it; - break; - case BOUNDLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case BOUNDUTF8: - if (s == startpos) - tmp = '\n'; else { - U8 *r = reghop((U8*)s, -1); - - tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); - } - tmp = ((OP(c) == BOUNDUTF8 ? - isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); - while (s < strend) { - if (tmp == !(OP(c) == BOUNDUTF8 ? - swash_fetch(PL_utf8_alnum, (U8*)s) : - isALNUM_LC_utf8((U8*)s))) - { - tmp = !tmp; - if ((norun || regtry(prog, s))) - goto got_it; + tmp = (s != startpos) ? UCHARAT(s - 1) : '\n'; + tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); + while (s < strend) { + if (tmp == + !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { + tmp = !tmp; + if ((norun || regtry(prog, s))) + goto got_it; + } + s++; } - s += UTF8SKIP(s); } if ((!prog->minlen && tmp) && (norun || regtry(prog, s))) goto got_it; @@ -956,365 +935,382 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case NBOUND: - tmp = (s != startpos) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); - while (s < strend) { - if (tmp == !(OP(c) == NBOUND ? 
isALNUM(*s) : isALNUM_LC(*s))) - tmp = !tmp; - else if ((norun || regtry(prog, s))) - goto got_it; - s++; + if (do_utf8) { + if (s == startpos) + tmp = '\n'; + else { + U8 *r = reghop((U8*)s, -1); + + tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); + } + tmp = ((OP(c) == NBOUND ? + isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); + while (s < strend) { + if (tmp == !(OP(c) == NBOUND ? + swash_fetch(PL_utf8_alnum, (U8*)s) : + isALNUM_LC_utf8((U8*)s))) + tmp = !tmp; + else if ((norun || regtry(prog, s))) + goto got_it; + s += UTF8SKIP(s); + } } - if ((!prog->minlen && !tmp) && (norun || regtry(prog, s))) - goto got_it; - break; - case NBOUNDLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NBOUNDUTF8: - if (s == startpos) - tmp = '\n'; else { - U8 *r = reghop((U8*)s, -1); - - tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); - } - tmp = ((OP(c) == NBOUNDUTF8 ? - isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); - while (s < strend) { - if (tmp == !(OP(c) == NBOUNDUTF8 ? - swash_fetch(PL_utf8_alnum, (U8*)s) : - isALNUM_LC_utf8((U8*)s))) - tmp = !tmp; - else if ((norun || regtry(prog, s))) - goto got_it; - s += UTF8SKIP(s); + tmp = (s != startpos) ? UCHARAT(s - 1) : '\n'; + tmp = ((OP(c) == NBOUND ? + isALNUM(tmp) : isALNUM_LC(tmp)) != 0); + while (s < strend) { + if (tmp == + !(OP(c) == NBOUND ? 
isALNUM(*s) : isALNUM_LC(*s))) + tmp = !tmp; + else if ((norun || regtry(prog, s))) + goto got_it; + s++; + } } if ((!prog->minlen && !tmp) && (norun || regtry(prog, s))) goto got_it; break; case ALNUM: - while (s < strend) { - if (isALNUM(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (swash_fetch(PL_utf8_alnum, (U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case ALNUMUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_alnum, (U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isALNUM(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case ALNUML: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isALNUM_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (isALNUM_LC_utf8((U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case ALNUMLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isALNUM_LC_utf8((U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isALNUM_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NALNUM: - while (s < strend) { - if (!isALNUM(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!swash_fetch(PL_utf8_alnum, (U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else 
- tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NALNUMUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_alnum, (U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isALNUM(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NALNUML: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isALNUM_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!isALNUM_LC_utf8((U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NALNUMLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isALNUM_LC_utf8((U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isALNUM_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case SPACE: - while (s < strend) { - if (isSPACE(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case SPACEUTF8: - while (s < strend) { - if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isSPACE(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } 
break; case SPACEL: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isSPACE_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case SPACELUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isSPACE_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NSPACE: - while (s < strend) { - if (!isSPACE(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NSPACEUTF8: - while (s < strend) { - if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isSPACE(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NSPACEL: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isSPACE_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - 
case NSPACELUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isSPACE_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case DIGIT: - while (s < strend) { - if (isDIGIT(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (swash_fetch(PL_utf8_digit,(U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case DIGITUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_digit,(U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isDIGIT(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case DIGITL: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isDIGIT_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (isDIGIT_LC_utf8((U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case DIGITLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isDIGIT_LC_utf8((U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isDIGIT_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NDIGIT: - while (s < strend) { - if (!isDIGIT(*s)) { - if (tmp && 
(norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!swash_fetch(PL_utf8_digit,(U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NDIGITUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_digit,(U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isDIGIT(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NDIGITL: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isDIGIT_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!isDIGIT_LC_utf8((U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NDIGITLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isDIGIT_LC_utf8((U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isDIGIT_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; default: @@ -1606,6 +1602,11 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT) /* don't bother with what can't match */ strend = HOPc(strend, -(minlen - 1)); + DEBUG_r({ + SV *prop = sv_newmortal(); + regprop(prop, c); + PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s); + }); if (find_byclass(prog, c, s, strend, startpos, 0)) goto got_it; DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n")); 
@@ -1619,7 +1620,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * last = screaminstr(sv, prog->float_substr, s - strbeg, end_shift, &scream_pos, 1); /* last one */ if (!last) - last = scream_olds; /* Only one occurence. */ + last = scream_olds; /* Only one occurrence. */ } else { STRLEN len; @@ -1891,6 +1892,7 @@ S_regmatch(pTHX_ regnode *prog) int minmod = 0, sw = 0, logical = 0; I32 unwind = 0; I32 firstcp = PL_savestack_ix; + register bool do_utf8 = DO_UTF8(PL_reg_sv); #ifdef DEBUGGING PL_regindent++; @@ -2009,8 +2011,8 @@ S_regmatch(pTHX_ regnode *prog) if (PL_regeol != locinput) sayNO; break; - case SANYUTF8: - if (nextchr & 0x80) { + case SANY: + if (DO_UTF8(PL_reg_sv)) { locinput += PL_utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; @@ -2021,13 +2023,8 @@ S_regmatch(pTHX_ regnode *prog) sayNO; nextchr = UCHARAT(++locinput); break; - case SANY: - if (!nextchr && locinput >= PL_regeol) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case ANYUTF8: - if (nextchr & 0x80) { + case REG_ANY: + if (DO_UTF8(PL_reg_sv)) { locinput += PL_utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; @@ -2038,11 +2035,6 @@ S_regmatch(pTHX_ regnode *prog) sayNO; nextchr = UCHARAT(++locinput); break; - case REG_ANY: - if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') - sayNO; - nextchr = UCHARAT(++locinput); - break; case EXACT: s = STRING(scan); ln = STR_LEN(scan); @@ -2099,22 +2091,24 @@ S_regmatch(pTHX_ regnode *prog) locinput += ln; nextchr = UCHARAT(locinput); break; - case ANYOFUTF8: - if (!REGINCLASSUTF8(scan, (U8*)locinput)) - sayNO; - if (locinput >= PL_regeol) - sayNO; - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; case ANYOF: - if (nextchr < 0) + if (do_utf8) { + if (!reginclass(scan, (U8*)locinput, do_utf8)) + sayNO; + if (locinput >= PL_regeol) + sayNO; + locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); - if (!REGINCLASS(scan, nextchr)) - sayNO; - if (!nextchr && locinput 
>= PL_regeol) - sayNO; - nextchr = UCHARAT(++locinput); + } + else { + if (nextchr < 0) + nextchr = UCHARAT(locinput); + if (!reginclass(scan, (U8*)locinput, do_utf8)) + sayNO; + if (!nextchr && locinput >= PL_regeol) + sayNO; + nextchr = UCHARAT(++locinput); + } break; case ALNUML: PL_reg_flags |= RF_tainted; @@ -2122,19 +2116,8 @@ S_regmatch(pTHX_ regnode *prog) case ALNUM: if (!nextchr) sayNO; - if (!(OP(scan) == ALNUM - ? isALNUM(nextchr) : isALNUM_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case ALNUMLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case ALNUMUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == ALNUMUTF8 + if (do_utf8) { + if (!(OP(scan) == ALNUM ? swash_fetch(PL_utf8_alnum, (U8*)locinput) : isALNUM_LC_utf8((U8*)locinput))) { @@ -2144,7 +2127,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (!(OP(scan) == ALNUMUTF8 + if (!(OP(scan) == ALNUM ? isALNUM(nextchr) : isALNUM_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); @@ -2155,19 +2138,8 @@ S_regmatch(pTHX_ regnode *prog) case NALNUM: if (!nextchr && locinput >= PL_regeol) sayNO; - if (OP(scan) == NALNUM - ? isALNUM(nextchr) : isALNUM_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NALNUMLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NALNUMUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (OP(scan) == NALNUMUTF8 + if (do_utf8) { + if (OP(scan) == NALNUM ? swash_fetch(PL_utf8_alnum, (U8*)locinput) : isALNUM_LC_utf8((U8*)locinput)) { @@ -2177,7 +2149,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (OP(scan) == NALNUMUTF8 + if (OP(scan) == NALNUM ? isALNUM(nextchr) : isALNUM_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); @@ -2189,42 +2161,38 @@ S_regmatch(pTHX_ regnode *prog) case BOUND: case NBOUND: /* was last char in word? */ - ln = (locinput != PL_regbol) ? 
UCHARAT(locinput - 1) : PL_regprev; - if (OP(scan) == BOUND || OP(scan) == NBOUND) { - ln = isALNUM(ln); - n = isALNUM(nextchr); - } - else { - ln = isALNUM_LC(ln); - n = isALNUM_LC(nextchr); - } - if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL)) - sayNO; - break; - case BOUNDLUTF8: - case NBOUNDLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case BOUNDUTF8: - case NBOUNDUTF8: - /* was last char in word? */ - if (locinput == PL_regbol) - ln = PL_regprev; - else { - U8 *r = reghop((U8*)locinput, -1); - - ln = utf8_to_uv(r, s - (char*)r, 0, 0); - } - if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) { - ln = isALNUM_uni(ln); - n = swash_fetch(PL_utf8_alnum, (U8*)locinput); + if (do_utf8) { + if (locinput == PL_regbol) + ln = PL_regprev; + else { + U8 *r = reghop((U8*)locinput, -1); + + ln = utf8_to_uv(r, s - (char*)r, 0, 0); + } + if (OP(scan) == BOUND || OP(scan) == NBOUND) { + ln = isALNUM_uni(ln); + n = swash_fetch(PL_utf8_alnum, (U8*)locinput); + } + else { + ln = isALNUM_LC_uni(ln); + n = isALNUM_LC_utf8((U8*)locinput); + } } else { - ln = isALNUM_LC_uni(ln); - n = isALNUM_LC_utf8((U8*)locinput); + ln = (locinput != PL_regbol) ? + UCHARAT(locinput - 1) : PL_regprev; + if (OP(scan) == BOUND || OP(scan) == NBOUND) { + ln = isALNUM(ln); + n = isALNUM(nextchr); + } + else { + ln = isALNUM_LC(ln); + n = isALNUM_LC(nextchr); + } } - if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8)) - sayNO; + if (((!ln) == (!n)) == (OP(scan) == BOUND || + OP(scan) == BOUNDL)) + sayNO; break; case SPACEL: PL_reg_flags |= RF_tainted; @@ -2232,32 +2200,29 @@ S_regmatch(pTHX_ regnode *prog) case SPACE: if (!nextchr) sayNO; - if (!(OP(scan) == SPACE - ? isSPACE(nextchr) : isSPACE_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case SPACELUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case SPACEUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == SPACEUTF8 - ? 
swash_fetch(PL_utf8_space, (U8*)locinput) - : isSPACE_LC_utf8((U8*)locinput))) - { - sayNO; + if (DO_UTF8(PL_reg_sv)) { + if (nextchr & 0x80) { + if (!(OP(scan) == SPACE + ? swash_fetch(PL_utf8_space, (U8*)locinput) + : isSPACE_LC_utf8((U8*)locinput))) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; } - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; + if (!(OP(scan) == SPACE + ? isSPACE(nextchr) : isSPACE_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + } + else { + if (!(OP(scan) == SPACE + ? isSPACE(nextchr) : isSPACE_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); } - if (!(OP(scan) == SPACEUTF8 - ? isSPACE(nextchr) : isSPACE_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); break; case NSPACEL: PL_reg_flags |= RF_tainted; @@ -2265,19 +2230,8 @@ S_regmatch(pTHX_ regnode *prog) case NSPACE: if (!nextchr && locinput >= PL_regeol) sayNO; - if (OP(scan) == NSPACE - ? isSPACE(nextchr) : isSPACE_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NSPACELUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NSPACEUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (OP(scan) == NSPACEUTF8 + if (DO_UTF8(PL_reg_sv)) { + if (OP(scan) == NSPACE ? swash_fetch(PL_utf8_space, (U8*)locinput) : isSPACE_LC_utf8((U8*)locinput)) { @@ -2287,7 +2241,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (OP(scan) == NSPACEUTF8 + if (OP(scan) == NSPACE ? isSPACE(nextchr) : isSPACE_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); @@ -2298,19 +2252,8 @@ S_regmatch(pTHX_ regnode *prog) case DIGIT: if (!nextchr) sayNO; - if (!(OP(scan) == DIGIT - ? 
isDIGIT(nextchr) : isDIGIT_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case DIGITLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case DIGITUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == DIGITUTF8 + if (DO_UTF8(PL_reg_sv)) { + if (!(OP(scan) == DIGIT ? swash_fetch(PL_utf8_digit, (U8*)locinput) : isDIGIT_LC_utf8((U8*)locinput))) { @@ -2320,7 +2263,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (!(OP(scan) == DIGITUTF8 + if (!(OP(scan) == DIGIT ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); @@ -2331,19 +2274,8 @@ S_regmatch(pTHX_ regnode *prog) case NDIGIT: if (!nextchr && locinput >= PL_regeol) sayNO; - if (OP(scan) == NDIGIT - ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NDIGITLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NDIGITUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (OP(scan) == NDIGITUTF8 + if (DO_UTF8(PL_reg_sv)) { + if (OP(scan) == NDIGIT ? swash_fetch(PL_utf8_digit, (U8*)locinput) : isDIGIT_LC_utf8((U8*)locinput)) { @@ -2353,7 +2285,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (OP(scan) == NDIGITUTF8 + if (OP(scan) == NDIGIT ? 
isDIGIT(nextchr) : isDIGIT_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); @@ -3461,30 +3393,33 @@ S_regrepeat(pTHX_ regnode *p, I32 max) register I32 c; register char *loceol = PL_regeol; register I32 hardcount = 0; + register bool do_utf8 = DO_UTF8(PL_reg_sv); scan = PL_reginput; if (max != REG_INFTY && max < loceol - scan) loceol = scan + max; switch (OP(p)) { case REG_ANY: - while (scan < loceol && *scan != '\n') - scan++; - break; - case SANY: - scan = loceol; - break; - case ANYUTF8: - loceol = PL_regeol; - while (scan < loceol && *scan != '\n') { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && *scan != '\n') { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && *scan != '\n') + scan++; } break; - case SANYUTF8: - loceol = PL_regeol; - while (scan < loceol) { - scan += UTF8SKIP(scan); - hardcount++; + case SANY: + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + scan = loceol; } break; case EXACT: /* length of string is 1 */ @@ -3505,135 +3440,144 @@ S_regrepeat(pTHX_ regnode *p, I32 max) (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) scan++; break; - case ANYOFUTF8: - loceol = PL_regeol; - while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; - } - break; case ANYOF: - while (scan < loceol && REGINCLASS(p, *scan)) - scan++; + if (do_utf8) { + loceol = PL_regeol; + while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) + scan++; + } break; case ALNUM: - while (scan < loceol && isALNUM(*scan)) - scan++; - break; - case ALNUMUTF8: - loceol = PL_regeol; - while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = 
PL_regeol; + while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isALNUM(*scan)) + scan++; } break; case ALNUML: PL_reg_flags |= RF_tainted; - while (scan < loceol && isALNUM_LC(*scan)) - scan++; - break; - case ALNUMLUTF8: - PL_reg_flags |= RF_tainted; - loceol = PL_regeol; - while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isALNUM_LC(*scan)) + scan++; } break; - break; case NALNUM: - while (scan < loceol && !isALNUM(*scan)) - scan++; - break; - case NALNUMUTF8: - loceol = PL_regeol; - while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isALNUM(*scan)) + scan++; } break; case NALNUML: PL_reg_flags |= RF_tainted; - while (scan < loceol && !isALNUM_LC(*scan)) - scan++; - break; - case NALNUMLUTF8: - PL_reg_flags |= RF_tainted; - loceol = PL_regeol; - while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isALNUM_LC(*scan)) + scan++; } break; case SPACE: - while (scan < loceol && isSPACE(*scan)) - scan++; - break; - case SPACEUTF8: - loceol = PL_regeol; - while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && + (*scan == ' ' 
|| swash_fetch(PL_utf8_space,(U8*)scan))) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isSPACE(*scan)) + scan++; } break; case SPACEL: PL_reg_flags |= RF_tainted; - while (scan < loceol && isSPACE_LC(*scan)) - scan++; - break; - case SPACELUTF8: - PL_reg_flags |= RF_tainted; - loceol = PL_regeol; - while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && + (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isSPACE_LC(*scan)) + scan++; } break; case NSPACE: - while (scan < loceol && !isSPACE(*scan)) - scan++; - break; - case NSPACEUTF8: - loceol = PL_regeol; - while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && + !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isSPACE(*scan)) + scan++; + break; } - break; case NSPACEL: PL_reg_flags |= RF_tainted; - while (scan < loceol && !isSPACE_LC(*scan)) - scan++; - break; - case NSPACELUTF8: - PL_reg_flags |= RF_tainted; - loceol = PL_regeol; - while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && + !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isSPACE_LC(*scan)) + scan++; } break; case DIGIT: - while (scan < loceol && isDIGIT(*scan)) - scan++; - break; - case DIGITUTF8: - loceol = PL_regeol; - while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + 
loceol = PL_regeol; + while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isDIGIT(*scan)) + scan++; } break; - break; case NDIGIT: - while (scan < loceol && !isDIGIT(*scan)) - scan++; - break; - case NDIGITUTF8: - loceol = PL_regeol; - while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isDIGIT(*scan)) + scan++; } break; default: /* Called on something of 0 width. */ @@ -3712,102 +3656,139 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp) } /* +- regclass_swash - prepare the utf8 swash +*/ + +SV * +Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp) +{ + SV *sw = NULL; + SV *si = NULL; + + if (PL_regdata && PL_regdata->count) { + U32 n = ARG(node); + + if (PL_regdata->what[n] == 's') { + SV *rv = (SV*)PL_regdata->data[n]; + AV *av = (AV*)SvRV((SV*)rv); + SV **a; + + si = *av_fetch(av, 0, FALSE); + a = av_fetch(av, 1, FALSE); + + if (a) + sw = *a; + else if (si && doinit) { + sw = swash_init("utf8", "", si, 1, 0); + (void)av_store(av, 1, sw); + } + } + } + + if (initsvp) + *initsvp = si; + + return sw; +} + +/* - reginclass - determine if a character falls into a character class */ STATIC bool -S_reginclass(pTHX_ register regnode *p, register I32 c) +S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) { - char flags = ANYOF_FLAGS(p); + char flags = ANYOF_FLAGS(n); bool match = FALSE; - c &= 0xFF; - if (ANYOF_BITMAP_TEST(p, c)) - match = TRUE; - else if (flags & ANYOF_FOLD) { - I32 cf; - if (flags & ANYOF_LOCALE) { - PL_reg_flags |= RF_tainted; - cf = PL_fold_locale[c]; + if (do_utf8 || (flags & ANYOF_UNICODE)) { + if (do_utf8 && !ANYOF_RUNTIME(n)) { + STRLEN len; + UV 
c = utf8_to_uv_simple(p, &len); + + if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) + match = TRUE; } - else - cf = PL_fold[c]; - if (ANYOF_BITMAP_TEST(p, cf)) - match = TRUE; - } - if (!match && (flags & ANYOF_CLASS)) { - PL_reg_flags |= RF_tainted; - if ( - (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC) && isPSXSPC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_BLANK) && isBLANK(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NBLANK) && !isBLANK(c)) - ) 
/* How's that for a conditional? */ - { - match = TRUE; + if (!match) { + SV *sw = regclass_swash(n, TRUE, 0); + + if (sw) { + if (swash_fetch(sw, p)) + match = TRUE; + else if (flags & ANYOF_FOLD) { + U8 tmpbuf[UTF8_MAXLEN+1]; + + if (flags & ANYOF_LOCALE) { + PL_reg_flags |= RF_tainted; + uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p)); + } + else + uv_to_utf8(tmpbuf, toLOWER_utf8(p)); + if (swash_fetch(sw, tmpbuf)) + match = TRUE; + } + } } } + else { + U8 c = *p; - return (flags & ANYOF_INVERT) ? !match : match; -} - -STATIC bool -S_reginclassutf8(pTHX_ regnode *f, U8 *p) -{ - char flags = ARG1(f); - bool match = FALSE; -#ifdef DEBUGGING - SV *rv = (SV*)PL_regdata->data[ARG2(f)]; - AV *av = (AV*)SvRV((SV*)rv); - SV *sw = *av_fetch(av, 0, FALSE); - SV *lv = *av_fetch(av, 1, FALSE); -#else - SV *sw = (SV*)PL_regdata->data[ARG2(f)]; -#endif + if (ANYOF_BITMAP_TEST(n, c)) + match = TRUE; + else if (flags & ANYOF_FOLD) { + I32 f; - if (swash_fetch(sw, p)) - match = TRUE; - else if (flags & ANYOF_FOLD) { - U8 tmpbuf[UTF8_MAXLEN+1]; - if (flags & ANYOF_LOCALE) { + if (flags & ANYOF_LOCALE) { + PL_reg_flags |= RF_tainted; + f = PL_fold_locale[c]; + } + else + f = PL_fold[c]; + if (f != c && ANYOF_BITMAP_TEST(n, f)) + match = TRUE; + } + + if (!match && (flags & ANYOF_CLASS)) { PL_reg_flags |= RF_tainted; - uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p)); + if ( + (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || 
+ (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) + ) /* How's that for a conditional? */ + { + match = TRUE; + } } - else - uv_to_utf8(tmpbuf, toLOWER_utf8(p)); - if (swash_fetch(sw, tmpbuf)) - match = TRUE; } - /* UTF8 combined with ANYOF_CLASS is ill-defined. */ - return (flags & ANYOF_INVERT) ? 
!match : match; } @@ -3815,17 +3796,20 @@ STATIC U8 * S_reghop(pTHX_ U8 *s, I32 off) { if (off >= 0) { - while (off-- && s < (U8*)PL_regeol) + while (off-- && s < (U8*)PL_regeol) { + /* XXX could check well-formedness here */ s += UTF8SKIP(s); + } } else { while (off++) { if (s > (U8*)PL_bostr) { s--; - if (*s & 0x80) { - while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80) + if (UTF8_IS_CONTINUED(*s)) { + while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s)) s--; - } /* XXX could check well-formedness here */ + } + /* XXX could check well-formedness here */ } } } @@ -3836,8 +3820,10 @@ STATIC U8 * S_reghopmaybe(pTHX_ U8* s, I32 off) { if (off >= 0) { - while (off-- && s < (U8*)PL_regeol) + while (off-- && s < (U8*)PL_regeol) { + /* XXX could check well-formedness here */ s += UTF8SKIP(s); + } if (off >= 0) return 0; } @@ -3845,10 +3831,11 @@ S_reghopmaybe(pTHX_ U8* s, I32 off) while (off++) { if (s > (U8*)PL_bostr) { s--; - if (*s & 0x80) { - while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80) + if (UTF8_IS_CONTINUED(*s)) { + while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s)) s--; - } /* XXX could check well-formedness here */ + } + /* XXX could check well-formedness here */ } else break; diff --git a/regnodes.h b/regnodes.h index 89c78e6..00dc0ec 100644 --- a/regnodes.h +++ b/regnodes.h @@ -13,76 +13,57 @@ #define MEOL 7 /* 0x7 Same, assuming multiline. */ #define SEOL 8 /* 0x8 Same, assuming singleline. */ #define BOUND 9 /* 0x9 Match "" at any word boundary */ -#define BOUNDUTF8 10 /* 0xa Match "" at any word boundary */ -#define BOUNDL 11 /* 0xb Match "" at any word boundary */ -#define BOUNDLUTF8 12 /* 0xc Match "" at any word boundary */ -#define NBOUND 13 /* 0xd Match "" at any word non-boundary */ -#define NBOUNDUTF8 14 /* 0xe Match "" at any word non-boundary */ -#define NBOUNDL 15 /* 0xf Match "" at any word non-boundary */ -#define NBOUNDLUTF8 16 /* 0x10 Match "" at any word non-boundary */ -#define GPOS 17 /* 0x11 Matches where last m//g left off. 
*/ -#define REG_ANY 18 /* 0x12 Match any one character (except newline). */ -#define ANYUTF8 19 /* 0x13 Match any one Unicode character (except newline). */ -#define SANY 20 /* 0x14 Match any one character. */ -#define SANYUTF8 21 /* 0x15 Match any one Unicode character. */ -#define ANYOF 22 /* 0x16 Match character in (or not in) this class. */ -#define ANYOFUTF8 23 /* 0x17 Match character in (or not in) this class. */ -#define ALNUM 24 /* 0x18 Match any alphanumeric character */ -#define ALNUMUTF8 25 /* 0x19 Match any alphanumeric character in utf8 */ -#define ALNUML 26 /* 0x1a Match any alphanumeric char in locale */ -#define ALNUMLUTF8 27 /* 0x1b Match any alphanumeric char in locale+utf8 */ -#define NALNUM 28 /* 0x1c Match any non-alphanumeric character */ -#define NALNUMUTF8 29 /* 0x1d Match any non-alphanumeric character in utf8 */ -#define NALNUML 30 /* 0x1e Match any non-alphanumeric char in locale */ -#define NALNUMLUTF8 31 /* 0x1f Match any non-alphanumeric char in locale+utf8 */ -#define SPACE 32 /* 0x20 Match any whitespace character */ -#define SPACEUTF8 33 /* 0x21 Match any whitespace character in utf8 */ -#define SPACEL 34 /* 0x22 Match any whitespace char in locale */ -#define SPACELUTF8 35 /* 0x23 Match any whitespace char in locale+utf8 */ -#define NSPACE 36 /* 0x24 Match any non-whitespace character */ -#define NSPACEUTF8 37 /* 0x25 Match any non-whitespace character in utf8 */ -#define NSPACEL 38 /* 0x26 Match any non-whitespace char in locale */ -#define NSPACELUTF8 39 /* 0x27 Match any non-whitespace char in locale+utf8 */ -#define DIGIT 40 /* 0x28 Match any numeric character */ -#define DIGITUTF8 41 /* 0x29 Match any numeric character in utf8 */ -#define DIGITL 42 /* 0x2a Match any numeric character in locale */ -#define DIGITLUTF8 43 /* 0x2b Match any numeric character in locale+utf8 */ -#define NDIGIT 44 /* 0x2c Match any non-numeric character */ -#define NDIGITUTF8 45 /* 0x2d Match any non-numeric character in utf8 */ -#define NDIGITL 46 
/* 0x2e Match any non-numeric character in locale */ -#define NDIGITLUTF8 47 /* 0x2f Match any non-numeric character in locale+utf8 */ -#define CLUMP 48 /* 0x30 Match any combining character sequence */ -#define BRANCH 49 /* 0x31 Match this alternative, or the next... */ -#define BACK 50 /* 0x32 Match "", "next" ptr points backward. */ -#define EXACT 51 /* 0x33 Match this string (preceded by length). */ -#define EXACTF 52 /* 0x34 Match this string, folded (prec. by length). */ -#define EXACTFL 53 /* 0x35 Match this string, folded in locale (w/len). */ -#define NOTHING 54 /* 0x36 Match empty string. */ -#define TAIL 55 /* 0x37 Match empty string. Can jump here from outside. */ -#define STAR 56 /* 0x38 Match this (simple) thing 0 or more times. */ -#define PLUS 57 /* 0x39 Match this (simple) thing 1 or more times. */ -#define CURLY 58 /* 0x3a Match this simple thing {n,m} times. */ -#define CURLYN 59 /* 0x3b Match next-after-this simple thing */ -#define CURLYM 60 /* 0x3c Match this medium-complex thing {n,m} times. */ -#define CURLYX 61 /* 0x3d Match this complex thing {n,m} times. */ -#define WHILEM 62 /* 0x3e Do curly processing and see if rest matches. */ -#define OPEN 63 /* 0x3f Mark this point in input as start of #n. */ -#define CLOSE 64 /* 0x40 Analogous to OPEN. */ -#define REF 65 /* 0x41 Match some already matched string */ -#define REFF 66 /* 0x42 Match already matched string, folded */ -#define REFFL 67 /* 0x43 Match already matched string, folded in loc. */ -#define IFMATCH 68 /* 0x44 Succeeds if the following matches. */ -#define UNLESSM 69 /* 0x45 Fails if the following matches. */ -#define SUSPEND 70 /* 0x46 "Independent" sub-RE. */ -#define IFTHEN 71 /* 0x47 Switch, should be preceeded by switcher . */ -#define GROUPP 72 /* 0x48 Whether the group matched. */ -#define LONGJMP 73 /* 0x49 Jump far away. */ -#define BRANCHJ 74 /* 0x4a BRANCH with long offset. */ -#define EVAL 75 /* 0x4b Execute some Perl code. 
*/ -#define MINMOD 76 /* 0x4c Next operator is not greedy. */ -#define LOGICAL 77 /* 0x4d Next opcode should set the flag only. */ -#define RENUM 78 /* 0x4e Group with independently numbered parens. */ -#define OPTIMIZED 79 /* 0x4f Placeholder for dump. */ +#define BOUNDL 10 /* 0xa Match "" at any word boundary */ +#define NBOUND 11 /* 0xb Match "" at any word non-boundary */ +#define NBOUNDL 12 /* 0xc Match "" at any word non-boundary */ +#define GPOS 13 /* 0xd Matches where last m//g left off. */ +#define REG_ANY 14 /* 0xe Match any one character (except newline). */ +#define SANY 15 /* 0xf Match any one character. */ +#define ANYOF 16 /* 0x10 Match character in (or not in) this class. */ +#define ALNUM 17 /* 0x11 Match any alphanumeric character */ +#define ALNUML 18 /* 0x12 Match any alphanumeric char in locale */ +#define NALNUM 19 /* 0x13 Match any non-alphanumeric character */ +#define NALNUML 20 /* 0x14 Match any non-alphanumeric char in locale */ +#define SPACE 21 /* 0x15 Match any whitespace character */ +#define SPACEL 22 /* 0x16 Match any whitespace char in locale */ +#define NSPACE 23 /* 0x17 Match any non-whitespace character */ +#define NSPACEL 24 /* 0x18 Match any non-whitespace char in locale */ +#define DIGIT 25 /* 0x19 Match any numeric character */ +#define DIGITL 26 /* 0x1a Match any numeric character in locale */ +#define NDIGIT 27 /* 0x1b Match any non-numeric character */ +#define NDIGITL 28 /* 0x1c Match any non-numeric character in locale */ +#define CLUMP 29 /* 0x1d Match any combining character sequence */ +#define BRANCH 30 /* 0x1e Match this alternative, or the next... */ +#define BACK 31 /* 0x1f Match "", "next" ptr points backward. */ +#define EXACT 32 /* 0x20 Match this string (preceded by length). */ +#define EXACTF 33 /* 0x21 Match this string, folded (prec. by length). */ +#define EXACTFL 34 /* 0x22 Match this string, folded in locale (w/len). */ +#define NOTHING 35 /* 0x23 Match empty string. 
*/ +#define TAIL 36 /* 0x24 Match empty string. Can jump here from outside. */ +#define STAR 37 /* 0x25 Match this (simple) thing 0 or more times. */ +#define PLUS 38 /* 0x26 Match this (simple) thing 1 or more times. */ +#define CURLY 39 /* 0x27 Match this simple thing {n,m} times. */ +#define CURLYN 40 /* 0x28 Match next-after-this simple thing */ +#define CURLYM 41 /* 0x29 Match this medium-complex thing {n,m} times. */ +#define CURLYX 42 /* 0x2a Match this complex thing {n,m} times. */ +#define WHILEM 43 /* 0x2b Do curly processing and see if rest matches. */ +#define OPEN 44 /* 0x2c Mark this point in input as start of #n. */ +#define CLOSE 45 /* 0x2d Analogous to OPEN. */ +#define REF 46 /* 0x2e Match some already matched string */ +#define REFF 47 /* 0x2f Match already matched string, folded */ +#define REFFL 48 /* 0x30 Match already matched string, folded in loc. */ +#define IFMATCH 49 /* 0x31 Succeeds if the following matches. */ +#define UNLESSM 50 /* 0x32 Fails if the following matches. */ +#define SUSPEND 51 /* 0x33 "Independent" sub-RE. */ +#define IFTHEN 52 /* 0x34 Switch, should be preceeded by switcher . */ +#define GROUPP 53 /* 0x35 Whether the group matched. */ +#define LONGJMP 54 /* 0x36 Jump far away. */ +#define BRANCHJ 55 /* 0x37 BRANCH with long offset. */ +#define EVAL 56 /* 0x38 Execute some Perl code. */ +#define MINMOD 57 /* 0x39 Next operator is not greedy. */ +#define LOGICAL 58 /* 0x3a Next opcode should set the flag only. */ +#define RENUM 59 /* 0x3b Group with independently numbered parens. */ +#define OPTIMIZED 60 /* 0x3c Placeholder for dump. 
*/ #ifndef DOINIT EXTCONST U8 PL_regkind[]; @@ -98,44 +79,25 @@ EXTCONST U8 PL_regkind[] = { EOL, /* MEOL */ EOL, /* SEOL */ BOUND, /* BOUND */ - BOUND, /* BOUNDUTF8 */ BOUND, /* BOUNDL */ - BOUND, /* BOUNDLUTF8 */ NBOUND, /* NBOUND */ - NBOUND, /* NBOUNDUTF8 */ NBOUND, /* NBOUNDL */ - NBOUND, /* NBOUNDLUTF8 */ GPOS, /* GPOS */ REG_ANY, /* REG_ANY */ - REG_ANY, /* ANYUTF8 */ REG_ANY, /* SANY */ - REG_ANY, /* SANYUTF8 */ ANYOF, /* ANYOF */ - ANYOF, /* ANYOFUTF8 */ ALNUM, /* ALNUM */ - ALNUM, /* ALNUMUTF8 */ ALNUM, /* ALNUML */ - ALNUM, /* ALNUMLUTF8 */ NALNUM, /* NALNUM */ - NALNUM, /* NALNUMUTF8 */ NALNUM, /* NALNUML */ - NALNUM, /* NALNUMLUTF8 */ SPACE, /* SPACE */ - SPACE, /* SPACEUTF8 */ SPACE, /* SPACEL */ - SPACE, /* SPACELUTF8 */ NSPACE, /* NSPACE */ - NSPACE, /* NSPACEUTF8 */ NSPACE, /* NSPACEL */ - NSPACE, /* NSPACELUTF8 */ DIGIT, /* DIGIT */ - DIGIT, /* DIGITUTF8 */ DIGIT, /* DIGITL */ - DIGIT, /* DIGITLUTF8 */ NDIGIT, /* NDIGIT */ - NDIGIT, /* NDIGITUTF8 */ NDIGIT, /* NDIGITL */ - NDIGIT, /* NDIGITLUTF8 */ CLUMP, /* CLUMP */ BRANCH, /* BRANCH */ BACK, /* BACK */ @@ -184,44 +146,25 @@ static const U8 regarglen[] = { 0, /* MEOL */ 0, /* SEOL */ 0, /* BOUND */ - 0, /* BOUNDUTF8 */ 0, /* BOUNDL */ - 0, /* BOUNDLUTF8 */ 0, /* NBOUND */ - 0, /* NBOUNDUTF8 */ 0, /* NBOUNDL */ - 0, /* NBOUNDLUTF8 */ 0, /* GPOS */ 0, /* REG_ANY */ - 0, /* ANYUTF8 */ 0, /* SANY */ - 0, /* SANYUTF8 */ 0, /* ANYOF */ - EXTRA_SIZE(struct regnode_1), /* ANYOFUTF8 */ 0, /* ALNUM */ - 0, /* ALNUMUTF8 */ 0, /* ALNUML */ - 0, /* ALNUMLUTF8 */ 0, /* NALNUM */ - 0, /* NALNUMUTF8 */ 0, /* NALNUML */ - 0, /* NALNUMLUTF8 */ 0, /* SPACE */ - 0, /* SPACEUTF8 */ 0, /* SPACEL */ - 0, /* SPACELUTF8 */ 0, /* NSPACE */ - 0, /* NSPACEUTF8 */ 0, /* NSPACEL */ - 0, /* NSPACELUTF8 */ 0, /* DIGIT */ - 0, /* DIGITUTF8 */ 0, /* DIGITL */ - 0, /* DIGITLUTF8 */ 0, /* NDIGIT */ - 0, /* NDIGITUTF8 */ 0, /* NDIGITL */ - 0, /* NDIGITLUTF8 */ 0, /* CLUMP */ 0, /* BRANCH */ 0, /* BACK */ @@ -267,44 +210,25 @@ static 
const char reg_off_by_arg[] = { 0, /* MEOL */ 0, /* SEOL */ 0, /* BOUND */ - 0, /* BOUNDUTF8 */ 0, /* BOUNDL */ - 0, /* BOUNDLUTF8 */ 0, /* NBOUND */ - 0, /* NBOUNDUTF8 */ 0, /* NBOUNDL */ - 0, /* NBOUNDLUTF8 */ 0, /* GPOS */ 0, /* REG_ANY */ - 0, /* ANYUTF8 */ 0, /* SANY */ - 0, /* SANYUTF8 */ 0, /* ANYOF */ - 0, /* ANYOFUTF8 */ 0, /* ALNUM */ - 0, /* ALNUMUTF8 */ 0, /* ALNUML */ - 0, /* ALNUMLUTF8 */ 0, /* NALNUM */ - 0, /* NALNUMUTF8 */ 0, /* NALNUML */ - 0, /* NALNUMLUTF8 */ 0, /* SPACE */ - 0, /* SPACEUTF8 */ 0, /* SPACEL */ - 0, /* SPACELUTF8 */ 0, /* NSPACE */ - 0, /* NSPACEUTF8 */ 0, /* NSPACEL */ - 0, /* NSPACELUTF8 */ 0, /* DIGIT */ - 0, /* DIGITUTF8 */ 0, /* DIGITL */ - 0, /* DIGITLUTF8 */ 0, /* NDIGIT */ - 0, /* NDIGITUTF8 */ 0, /* NDIGITL */ - 0, /* NDIGITLUTF8 */ 0, /* CLUMP */ 0, /* BRANCH */ 0, /* BACK */ @@ -351,79 +275,60 @@ static const char * const reg_name[] = { "MEOL", /* 0x7 */ "SEOL", /* 0x8 */ "BOUND", /* 0x9 */ - "BOUNDUTF8", /* 0xa */ - "BOUNDL", /* 0xb */ - "BOUNDLUTF8", /* 0xc */ - "NBOUND", /* 0xd */ - "NBOUNDUTF8", /* 0xe */ - "NBOUNDL", /* 0xf */ - "NBOUNDLUTF8", /* 0x10 */ - "GPOS", /* 0x11 */ - "REG_ANY", /* 0x12 */ - "ANYUTF8", /* 0x13 */ - "SANY", /* 0x14 */ - "SANYUTF8", /* 0x15 */ - "ANYOF", /* 0x16 */ - "ANYOFUTF8", /* 0x17 */ - "ALNUM", /* 0x18 */ - "ALNUMUTF8", /* 0x19 */ - "ALNUML", /* 0x1a */ - "ALNUMLUTF8", /* 0x1b */ - "NALNUM", /* 0x1c */ - "NALNUMUTF8", /* 0x1d */ - "NALNUML", /* 0x1e */ - "NALNUMLUTF8", /* 0x1f */ - "SPACE", /* 0x20 */ - "SPACEUTF8", /* 0x21 */ - "SPACEL", /* 0x22 */ - "SPACELUTF8", /* 0x23 */ - "NSPACE", /* 0x24 */ - "NSPACEUTF8", /* 0x25 */ - "NSPACEL", /* 0x26 */ - "NSPACELUTF8", /* 0x27 */ - "DIGIT", /* 0x28 */ - "DIGITUTF8", /* 0x29 */ - "DIGITL", /* 0x2a */ - "DIGITLUTF8", /* 0x2b */ - "NDIGIT", /* 0x2c */ - "NDIGITUTF8", /* 0x2d */ - "NDIGITL", /* 0x2e */ - "NDIGITLUTF8", /* 0x2f */ - "CLUMP", /* 0x30 */ - "BRANCH", /* 0x31 */ - "BACK", /* 0x32 */ - "EXACT", /* 0x33 */ - "EXACTF", /* 0x34 */ - 
"EXACTFL", /* 0x35 */ - "NOTHING", /* 0x36 */ - "TAIL", /* 0x37 */ - "STAR", /* 0x38 */ - "PLUS", /* 0x39 */ - "CURLY", /* 0x3a */ - "CURLYN", /* 0x3b */ - "CURLYM", /* 0x3c */ - "CURLYX", /* 0x3d */ - "WHILEM", /* 0x3e */ - "OPEN", /* 0x3f */ - "CLOSE", /* 0x40 */ - "REF", /* 0x41 */ - "REFF", /* 0x42 */ - "REFFL", /* 0x43 */ - "IFMATCH", /* 0x44 */ - "UNLESSM", /* 0x45 */ - "SUSPEND", /* 0x46 */ - "IFTHEN", /* 0x47 */ - "GROUPP", /* 0x48 */ - "LONGJMP", /* 0x49 */ - "BRANCHJ", /* 0x4a */ - "EVAL", /* 0x4b */ - "MINMOD", /* 0x4c */ - "LOGICAL", /* 0x4d */ - "RENUM", /* 0x4e */ - "OPTIMIZED", /* 0x4f */ + "BOUNDL", /* 0xa */ + "NBOUND", /* 0xb */ + "NBOUNDL", /* 0xc */ + "GPOS", /* 0xd */ + "REG_ANY", /* 0xe */ + "SANY", /* 0xf */ + "ANYOF", /* 0x10 */ + "ALNUM", /* 0x11 */ + "ALNUML", /* 0x12 */ + "NALNUM", /* 0x13 */ + "NALNUML", /* 0x14 */ + "SPACE", /* 0x15 */ + "SPACEL", /* 0x16 */ + "NSPACE", /* 0x17 */ + "NSPACEL", /* 0x18 */ + "DIGIT", /* 0x19 */ + "DIGITL", /* 0x1a */ + "NDIGIT", /* 0x1b */ + "NDIGITL", /* 0x1c */ + "CLUMP", /* 0x1d */ + "BRANCH", /* 0x1e */ + "BACK", /* 0x1f */ + "EXACT", /* 0x20 */ + "EXACTF", /* 0x21 */ + "EXACTFL", /* 0x22 */ + "NOTHING", /* 0x23 */ + "TAIL", /* 0x24 */ + "STAR", /* 0x25 */ + "PLUS", /* 0x26 */ + "CURLY", /* 0x27 */ + "CURLYN", /* 0x28 */ + "CURLYM", /* 0x29 */ + "CURLYX", /* 0x2a */ + "WHILEM", /* 0x2b */ + "OPEN", /* 0x2c */ + "CLOSE", /* 0x2d */ + "REF", /* 0x2e */ + "REFF", /* 0x2f */ + "REFFL", /* 0x30 */ + "IFMATCH", /* 0x31 */ + "UNLESSM", /* 0x32 */ + "SUSPEND", /* 0x33 */ + "IFTHEN", /* 0x34 */ + "GROUPP", /* 0x35 */ + "LONGJMP", /* 0x36 */ + "BRANCHJ", /* 0x37 */ + "EVAL", /* 0x38 */ + "MINMOD", /* 0x39 */ + "LOGICAL", /* 0x3a */ + "RENUM", /* 0x3b */ + "OPTIMIZED", /* 0x3c */ }; -static const int reg_num = 80; +static const int reg_num = 61; #endif /* DEBUGGING */ #endif /* REG_COMP_C */ diff --git a/sv.c b/sv.c index 1dafbf6..1fbf83f 100644 --- a/sv.c +++ b/sv.c @@ -4522,11 +4522,9 @@ Perl_sv_len_utf8(pTHX_ 
register SV *sv) if (!sv) return 0; -#ifdef NOTYET if (SvGMAGICAL(sv)) return mg_length(sv); else -#endif { STRLEN len; U8 *s = (U8*)SvPV(sv, len); diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t index ac42b85..cd9d56a 100644 --- a/t/op/utf8decode.t +++ b/t/op/utf8decode.t @@ -5,6 +5,8 @@ BEGIN { @INC = '../lib'; } +no utf8; # this test contains raw 8-bit data on purpose; don't switch to \x{} + print "1..78\n"; my $test = 1; diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index 6986720..89416dc 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -10,7 +10,7 @@ BEGIN { } } -print "1..90\n"; +print "1..104\n"; my $test = 1; @@ -42,6 +42,7 @@ sub nok_bytes { { use utf8; + $_ = ">\x{263A}<"; s/([\x{80}-\x{10ffff}])/"&#".ord($1).";"/eg; ok $_, '>☺<'; @@ -106,212 +107,191 @@ sub nok_bytes { } { - use utf8; - - $_ = "\x{263A}>\x{263A}\x{263A}"; - - ok length, 4; - $test++; # 13 - - ok length((m/>(.)/)[0]), 1; - $test++; # 14 - - ok length($&), 2; - $test++; # 15 + # no use utf8 needed + $_ = "\x{263A}\x{263A}x\x{263A}y\x{263A}"; + + ok length($_), 6; # 13 + $test++; - ok length($'), 1; - $test++; # 16 + ($a) = m/x(.)/; - ok length($`), 1; - $test++; # 17 + ok length($a), 1; # 14 + $test++; - ok length($1), 1; - $test++; # 18 + ok length($`), 2; # 15 + $test++; + ok length($&), 2; # 16 + $test++; + ok length($'), 2; # 17 + $test++; - ok length($tmp=$&), 2; - $test++; # 19 + ok length($1), 1; # 18 + $test++; - ok length($tmp=$'), 1; - $test++; # 20 + ok length($b=$`), 2; # 19 + $test++; - ok length($tmp=$`), 1; - $test++; # 21 + ok length($b=$&), 2; # 20 + $test++; - ok length($tmp=$1), 1; - $test++; # 22 + ok length($b=$'), 2; # 21 + $test++; - { - use bytes; + ok length($b=$1), 1; # 22 + $test++; - my $tmp = $&; - ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272); - $test++; # 23 + ok $a, "\x{263A}"; # 23 + $test++; - $tmp = $'; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 24 + ok $`, "\x{263A}\x{263A}"; # 24 + $test++; - $tmp = $`; - ok $tmp, pack("C*", 0342, 
0230, 0272); - $test++; # 25 + ok $&, "x\x{263A}"; # 25 + $test++; - $tmp = $1; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 26 - } + ok $', "y\x{263A}"; # 26 + $test++; - ok_bytes $&, pack("C*", ord(">"), 0342, 0230, 0272); - $test++; # 27 + ok $1, "\x{263A}"; # 27 + $test++; - ok_bytes $', pack("C*", 0342, 0230, 0272); - $test++; # 28 + ok_bytes $a, "\342\230\272"; # 28 + $test++; - ok_bytes $`, pack("C*", 0342, 0230, 0272); - $test++; # 29 + ok_bytes $1, "\342\230\272"; # 29 + $test++; - ok_bytes $1, pack("C*", 0342, 0230, 0272); - $test++; # 30 + ok_bytes $&, "x\342\230\272"; # 30 + $test++; { - use bytes; - no utf8; - - ok length, 10; - $test++; # 31 + use utf8; # required + $_ = chr(0x263A) . chr(0x263A) . 'x' . chr(0x263A) . 'y' . chr(0x263A); + } - ok length((m/>(.)/)[0]), 1; - $test++; # 32 + ok length($_), 6; # 31 + $test++; - ok length($&), 2; - $test++; # 33 + ($a) = m/x(.)/; - ok length($'), 5; - $test++; # 34 + ok length($a), 1; # 32 + $test++; - ok length($`), 3; - $test++; # 35 + ok length($`), 2; # 33 + $test++; - ok length($1), 1; - $test++; # 36 + ok length($&), 2; # 34 + $test++; - ok $&, pack("C*", ord(">"), 0342); - $test++; # 37 + ok length($'), 2; # 35 + $test++; - ok $', pack("C*", 0230, 0272, 0342, 0230, 0272); - $test++; # 38 + ok length($1), 1; # 36 + $test++; - ok $`, pack("C*", 0342, 0230, 0272); - $test++; # 39 + ok length($b=$`), 2; # 37 + $test++; - ok $1, pack("C*", 0342); - $test++; # 40 - } + ok length($b=$&), 2; # 38 + $test++; - { - no utf8; - $_="\342\230\272>\342\230\272\342\230\272"; - } + ok length($b=$'), 2; # 39 + $test++; - ok length, 10; - $test++; # 41 + ok length($b=$1), 1; # 40 + $test++; - ok length((m/>(.)/)[0]), 1; - $test++; # 42 + ok $a, "\x{263A}"; # 41 + $test++; - ok length($&), 2; - $test++; # 43 + ok $`, "\x{263A}\x{263A}"; # 42 + $test++; - ok length($'), 1; - $test++; # 44 + ok $&, "x\x{263A}"; # 43 + $test++; - ok length($`), 1; - $test++; # 45 + ok $', "y\x{263A}"; # 44 + $test++; - ok 
length($1), 1; - $test++; # 46 + ok $1, "\x{263A}"; # 45 + $test++; - ok length($tmp=$&), 2; - $test++; # 47 + ok_bytes $a, "\342\230\272"; # 46 + $test++; - ok length($tmp=$'), 1; - $test++; # 48 + ok_bytes $1, "\342\230\272"; # 47 + $test++; - ok length($tmp=$`), 1; - $test++; # 49 + ok_bytes $&, "x\342\230\272"; # 48 + $test++; - ok length($tmp=$1), 1; - $test++; # 50 + $_ = "\342\230\272\342\230\272x\342\230\272y\342\230\272"; - { - use bytes; + ok length($_), 14; # 49 + $test++; - my $tmp = $&; - ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272); - $test++; # 51 + ($a) = m/x(.)/; - $tmp = $'; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 52 + ok length($a), 1; # 50 + $test++; - $tmp = $`; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 53 + ok length($`), 6; # 51 + $test++; - $tmp = $1; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 54 - } + ok length($&), 2; # 52 + $test++; - { - use bytes; - no utf8; + ok length($'), 6; # 53 + $test++; - ok length, 10; - $test++; # 55 + ok length($1), 1; # 54 + $test++; - ok length((m/>(.)/)[0]), 1; - $test++; # 56 + ok length($b=$`), 6; # 55 + $test++; - ok length($&), 2; - $test++; # 57 + ok length($b=$&), 2; # 56 + $test++; - ok length($'), 5; - $test++; # 58 + ok length($b=$'), 6; # 57 + $test++; - ok length($`), 3; - $test++; # 59 + ok length($b=$1), 1; # 58 + $test++; - ok length($1), 1; - $test++; # 60 + ok $a, "\342"; # 59 + $test++; - ok $&, pack("C*", ord(">"), 0342); - $test++; # 61 + ok $`, "\342\230\272\342\230\272"; # 60 + $test++; - ok $', pack("C*", 0230, 0272, 0342, 0230, 0272); - $test++; # 62 + ok $&, "x\342"; # 61 + $test++; - ok $`, pack("C*", 0342, 0230, 0272); - $test++; # 63 + ok $', "\230\272y\342\230\272"; # 62 + $test++; - ok $1, pack("C*", 0342); - $test++; # 64 - } + ok $1, "\342"; # 63 + $test++; +} +{ + use utf8; ok "\x{ab}" =~ /^\x{ab}$/, 1; - $test++; # 65 + $test++; # 64 } { use utf8; ok_bytes chr(0xe2), pack("C*", 0xc3, 0xa2); - $test++; # 66 + $test++; # 65 } { use utf8; 
my @a = map ord, split(//, join("", map chr, (1234, 123, 2345))); ok "@a", "1234 123 2345"; - $test++; # 67 + $test++; # 66 } { @@ -319,7 +299,7 @@ sub nok_bytes { my $x = chr(123); my @a = map ord, split(/$x/, join("", map chr, (1234, 123, 2345))); ok "@a", "1234 2345"; - $test++; # 68 + $test++; # 67 } { @@ -331,10 +311,10 @@ sub nok_bytes { { use utf8; $b = "\xe4" } # \xXX must not produce UTF-8 print "not " if $a eq $b; - print "ok $test\n"; $test++; + print "ok $test\n"; $test++; # 68 { use utf8; print "not " if $a eq $b; } - print "ok $test\n"; $test++; + print "ok $test\n"; $test++; # 69 } { @@ -344,7 +324,7 @@ sub nok_bytes { for (@x) { s/(\d+)\s*([\w\-]+)/$1 . uc $2/e; my($latin) = /^(.+)(?:\s+\d)/; - print $latin eq "stra\337e" ? "ok $test\n" : + print $latin eq "stra\337e" ? "ok $test\n" : # 70, 71 "#latin[$latin]\nnot ok $test\n"; $test++; $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a @@ -369,7 +349,7 @@ sub nok_bytes { } print "not " unless $r eq " U+B36C U+5A8C U+FF5B U+5079 U+505B"; - print "ok $test\n"; + print "ok $test\n"; # 72 $test++; } @@ -384,27 +364,27 @@ sub nok_bytes { print "not " unless $a eq "\x20" && $b eq "\x{80}\x{100}\x{80}" && $c eq $a; print "ok $test\n"; - $test++; + $test++; # 73 my ($a, $b) = split(/\x{100}/, $s); print "not " unless $a eq "\x20\x40\x{80}" && $b eq "\x{80}\x40\x20"; print "ok $test\n"; - $test++; + $test++; # 74 my ($a, $b) = split(/\x{80}\x{100}\x{80}/, $s); print "not " unless $a eq "\x20\x40" && $b eq "\x40\x20"; print "ok $test\n"; - $test++; + $test++; # 75 my ($a, $b) = split(/\x40\x{80}/, $s); print "not " unless $a eq "\x20" && $b eq "\x{100}\x{80}\x40\x20"; print "ok $test\n"; - $test++; + $test++; # 76 my ($a, $b, $c) = split(/[\x40\x{80}]+/, $s); print "not " unless $a eq "\x20" && $b eq "\x{100}" && $c eq "\x20"; print "ok $test\n"; - $test++; + $test++; # 77 } { @@ -414,14 +394,14 @@ sub nok_bytes { my $smiley = "\x{263a}"; - for my $s ("\x{263a}", # 1 - $smiley, # 2 + for my $s 
("\x{263a}", # 78 + $smiley, # 79 - "" . $smiley, # 3 - "" . "\x{263a}", # 4 + "" . $smiley, # 80 + "" . "\x{263a}", # 81 - $smiley . "", # 5 - "\x{263a}" . "", # 6 + $smiley . "", # 82 + "\x{263a}" . "", # 83 ) { my $length_chars = length($s); my $length_bytes; @@ -437,14 +417,14 @@ sub nok_bytes { $test++; } - for my $s ("\x{263a}" . "\x{263a}", # 7 - $smiley . $smiley, # 8 + for my $s ("\x{263a}" . "\x{263a}", # 84 + $smiley . $smiley, # 85 - "\x{263a}\x{263a}", # 9 - "$smiley$smiley", # 10 + "\x{263a}\x{263a}", # 86 + "$smiley$smiley", # 87 - "\x{263a}" x 2, # 11 - $smiley x 2, # 12 + "\x{263a}" x 2, # 88 + $smiley x 2, # 89 ) { my $length_chars = length($s); my $length_bytes; @@ -460,3 +440,106 @@ sub nok_bytes { $test++; } } + +{ + use utf8; + + print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\xd4"; + print "ok $test\n"; + $test++; # 90 + + print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}"; + print "ok $test\n"; + $test++; # 91 + + print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}"; + print "ok $test\n"; + $test++; # 92 + + print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\xd4"; + print "ok $test\n"; + $test++; # 93 + + print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4"; + print "ok $test\n"; + $test++; # 94 + + print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}"; + print "ok $test\n"; + $test++; # 95 + + print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}"; + print "ok $test\n"; + $test++; # 96 + + print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4"; + print "ok $test\n"; + $test++; # 97 +} + +{ + # the first half of 20001028.003 + + my $X = chr(1448); + my ($Y) = $X =~ /(.*)/; + print "not " unless length $Y == 1; + print "ok $test\n"; + $test++; # 98 +} + +{ + # 20001108.001 + + use utf8; + my $X = "Szab\x{f3},Bal\x{e1}zs"; + my $Y = $X; + $Y =~ s/(B)/$1/ for 0..3; + print "not " unless $Y eq $X; + print "ok $test\n"; + $test++; # 99 +} + +{ + # 
20001114.001 + + use utf8; + use charnames ':full'; + my $text = "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}"; + print "not " unless ord($text) == 0xc4; + print "ok $test\n"; + $test++; # 100 +} + +{ + # 20001205.014 + + use utf8; + + my $a = "ABC\x{263A}"; + + my @b = split( //, $a ); + + print "not " unless @b == 4; + print "ok $test\n"; + $test++; # 101 + + print "not " unless length($b[3]) == 1; + print "ok $test\n"; + $test++; # 102 + + $a =~ s/^A/Z/; + print "not " unless length($a) == 4; + print "ok $test\n"; + $test++; # 103 +} + +{ + # the second half of 20001028.003 + + use utf8; + $X =~ s/^/chr(1488)/e; + print "not " unless length $X == 1; + print "ok $test\n"; + $test++; # 104 +} +