Integrate encoding::warnings from Autrijus Tang.

[p5sagit/p5-mst-13.2.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index d0d0ce7..f254713 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -5,9 +5,15 @@
  * "One Ring to rule them all, One Ring to find them..."
  */
 
-/* This file contains functions for executing a regular expresssion.  See
- * also regcomp.c which funnnily enough, contains functions for compiling
+/* This file contains functions for executing a regular expression.  See
+ * also regcomp.c which, funnily enough, contains functions for compiling
  * a regular expression.
+ *
+ * This file is also copied at build time to ext/re/re_exec.c, where
+ * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
+ * This causes the main functions to be compiled under new names and with
+ * debugging support added, which makes "use re 'debug'" work.
+ *
  */
 
 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
@@ -73,7 +79,7 @@
  ****    Alterations to Henry's code are...
  ****
  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
- ****    2000, 2001, 2002, 2003, 2004, by Larry Wall and others
+ ****    2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
  ****
  ****    You may distribute under the terms of either the GNU General Public
  ****    License or the Artistic License, as specified in the README file.
@@ -403,6 +409,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
     I32 ml_anch;
     register char *other_last = Nullch;        /* other substr checked before this */
     char *check_at = Nullch;           /* check substr found at this pos */
+    I32 multiline = prog->reganch & PMf_MULTILINE;
 #ifdef DEBUGGING
     char *i_strpos = strpos;
     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
@@ -464,7 +471,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
     if (prog->reganch & ROPT_ANCH) {   /* Match at beg-of-str or after \n */
        ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
                     || ( (prog->reganch & ROPT_ANCH_BOL)
-                         && !PL_multiline ) ); /* Check after \n? */
+                         && !multiline ) );    /* Check after \n? */
 
        if (!ml_anch) {
          if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
@@ -558,11 +565,11 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
     else if (prog->reganch & ROPT_CANY_SEEN)
        s = fbm_instr((U8*)(s + start_shift),
                      (U8*)(strend - end_shift),
-                     check, PL_multiline ? FBMrf_MULTILINE : 0);
+                     check, multiline ? FBMrf_MULTILINE : 0);
     else
        s = fbm_instr(HOP3(s, start_shift, strend),
                      HOP3(strend, -end_shift, strbeg),
-                     check, PL_multiline ? FBMrf_MULTILINE : 0);
+                     check, multiline ? FBMrf_MULTILINE : 0);
 
     /* Update the count-of-usability, remove useless subpatterns,
        unshift s.  */
@@ -631,7 +638,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
                        HOP3(HOP3(last1, prog->anchored_offset, strend)
                                + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
                        must,
-                       PL_multiline ? FBMrf_MULTILINE : 0
+                       multiline ? FBMrf_MULTILINE : 0
                    );
                DEBUG_r(PerlIO_printf(Perl_debug_log,
                        "%s anchored substr `%s%.*s%s'%s",
@@ -692,7 +699,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
                s = fbm_instr((unsigned char*)s,
                              (unsigned char*)last + SvCUR(must)
                                  - (SvTAIL(must)!=0),
-                             must, PL_multiline ? FBMrf_MULTILINE : 0);
+                             must, multiline ? FBMrf_MULTILINE : 0);
            DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
                    (s ? "Found" : "Contradicts"),
                    PL_colors[0],
@@ -1021,15 +1028,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            if (UTF) {
                STRLEN ulen1, ulen2;
                U8 *sm = (U8 *) m;
-               U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
-               U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+               U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+               U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
 
                to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
                to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
 
-               c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, 
+               c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE, 
                                    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-               c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+               c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
                                    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
                lnc = 0;
                while (sm < ((U8 *) m + ln)) {
@@ -1067,15 +1074,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 
            if (do_utf8) {
                UV c, f;
-               U8 tmpbuf [UTF8_MAXLEN+1];
-               U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+               U8 tmpbuf [UTF8_MAXBYTES+1];
+               U8 foldbuf[UTF8_MAXBYTES_CASE+1];
                STRLEN len, foldlen;
                
                if (c1 == c2) {
                    /* Upper and lower of 1st char are equal -
                     * probably not a "letter". */
                    while (s <= e) {
-                       c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+                       c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY);
                        if ( c == c1
@@ -1102,7 +1109,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                }
                else {
                    while (s <= e) {
-                     c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+                     c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY);
 
@@ -1628,6 +1635,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     char *scream_olds;
     SV* oreplsv = GvSV(PL_replgv);
     bool do_utf8 = DO_UTF8(sv);
+    I32 multiline = prog->reganch & PMf_MULTILINE;
 #ifdef DEBUGGING
     SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
     SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
@@ -1744,7 +1752,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
        if (s == startpos && regtry(prog, startpos))
            goto got_it;
-       else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
+       else if (multiline || (prog->reganch & ROPT_IMPLICIT)
                 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
        {
            char *end;
@@ -1878,7 +1886,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                                    end_shift, &scream_pos, 0))
                 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
                                  (unsigned char*)strend, must,
-                                 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
+                                 multiline ? FBMrf_MULTILINE : 0))) ) {
            /* we may be pointing at the wrong string */
            if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
                s = strbeg + (s - SvPVX(sv));
@@ -1979,7 +1987,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                if (SvTAIL(float_real)) {
                    if (memEQ(strend - len + 1, little, len - 1))
                        last = strend - len + 1;
-                   else if (!PL_multiline)
+                   else if (!multiline)
                        last = memEQ(strend - len, little, len)
                            ? strend - len : Nullch;
                    else
@@ -2369,8 +2377,7 @@ S_regmatch(pTHX_ regnode *prog)
 
        switch (OP(scan)) {
        case BOL:
-           if (locinput == PL_bostr || (PL_multiline &&
-               (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
+           if (locinput == PL_bostr)
            {
                /* regtill = regbol; */
                break;
@@ -2392,12 +2399,8 @@ S_regmatch(pTHX_ regnode *prog)
                break;
            sayNO;
        case EOL:
-           if (PL_multiline)
-               goto meol;
-           else
                goto seol;
        case MEOL:
-         meol:
            if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
                sayNO;
            break;
@@ -2456,7 +2459,7 @@ S_regmatch(pTHX_ regnode *prog)
                        if (l >= PL_regeol)
                             sayNO;
                        if (NATIVE_TO_UNI(*(U8*)s) !=
-                           utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+                           utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY))
                             sayNO;
@@ -2470,7 +2473,7 @@ S_regmatch(pTHX_ regnode *prog)
                        if (l >= PL_regeol)
                            sayNO;
                        if (NATIVE_TO_UNI(*((U8*)l)) !=
-                           utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+                           utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY))
                            sayNO;
@@ -2803,8 +2806,8 @@ S_regmatch(pTHX_ regnode *prog)
                 */
                if (OP(scan) == REFF) {
                    STRLEN ulen1, ulen2;
-                   U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
-                   U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+                   U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+                   U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
                    while (s < e) {
                        if (l >= PL_regeol)
                            sayNO;
@@ -3577,21 +3580,21 @@ S_regmatch(pTHX_ regnode *prog)
                    else { /* UTF */
                        if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
                             STRLEN ulen1, ulen2;
-                            U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
-                            U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+                            U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+                            U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
 
                             to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
                             to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
 
-                            c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+                            c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
                                                 ckWARN(WARN_UTF8) ?
                                                 0 : UTF8_ALLOW_ANY);
-                            c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+                            c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
                                                 ckWARN(WARN_UTF8) ?
                                                 0 : UTF8_ALLOW_ANY);
                        }
                        else {
-                           c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+                           c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
                                                     ckWARN(WARN_UTF8) ?
                                                     0 : UTF8_ALLOW_ANY);
                        }
@@ -3653,7 +3656,7 @@ S_regmatch(pTHX_ regnode *prog)
                                 * utf8_distance(old, locinput) */
                                while (locinput <= e &&
                                       utf8n_to_uvchr((U8*)locinput,
-                                                     UTF8_MAXLEN, &len,
+                                                     UTF8_MAXBYTES, &len,
                                                      ckWARN(WARN_UTF8) ?
                                                      0 : UTF8_ALLOW_ANY) != (UV)c1) {
                                    locinput += len;
@@ -3664,7 +3667,7 @@ S_regmatch(pTHX_ regnode *prog)
                                 * utf8_distance(old, locinput) */
                                while (locinput <= e) {
                                    UV c = utf8n_to_uvchr((U8*)locinput,
-                                                         UTF8_MAXLEN, &len,
+                                                         UTF8_MAXBYTES, &len,
                                                          ckWARN(WARN_UTF8) ?
                                                          0 : UTF8_ALLOW_ANY);
                                    if (c == (UV)c1 || c == (UV)c2)
@@ -3701,7 +3704,7 @@ S_regmatch(pTHX_ regnode *prog)
                    if (c1 != -1000) {
                        if (do_utf8)
                            c = utf8n_to_uvchr((U8*)PL_reginput,
-                                              UTF8_MAXLEN, 0,
+                                              UTF8_MAXBYTES, 0,
                                               ckWARN(WARN_UTF8) ?
                                               0 : UTF8_ALLOW_ANY);
                        else
@@ -3734,7 +3737,7 @@ S_regmatch(pTHX_ regnode *prog)
                n = regrepeat(scan, n);
                locinput = PL_reginput;
                if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
-                   ((!PL_multiline && OP(next) != MEOL) ||
+                   (OP(next) != MEOL ||
                        OP(next) == SEOL || OP(next) == EOS))
                {
                    ln = n;                     /* why back off? */
@@ -3751,7 +3754,7 @@ S_regmatch(pTHX_ regnode *prog)
                        if (c1 != -1000) {
                            if (do_utf8)
                                c = utf8n_to_uvchr((U8*)PL_reginput,
-                                                  UTF8_MAXLEN, 0,
+                                                  UTF8_MAXBYTES, 0,
                                                   ckWARN(WARN_UTF8) ?
                                                   0 : UTF8_ALLOW_ANY);
                            else
@@ -3774,7 +3777,7 @@ S_regmatch(pTHX_ regnode *prog)
                        if (c1 != -1000) {
                            if (do_utf8)
                                c = utf8n_to_uvchr((U8*)PL_reginput,
-                                                  UTF8_MAXLEN, 0,
+                                                  UTF8_MAXBYTES, 0,
                                                   ckWARN(WARN_UTF8) ?
                                                   0 : UTF8_ALLOW_ANY);
                            else
@@ -4367,7 +4370,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
     STRLEN plen;
 
     if (do_utf8 && !UTF8_IS_INVARIANT(c))
-        c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+        c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
                            ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 
     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
@@ -4404,7 +4407,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
                        }
                    }
                    if (!match) {
-                       U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+                       U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
                        STRLEN tmplen;
 
                        to_utf8_fold(p, tmpbuf, &tmplen);
@@ -4565,8 +4568,7 @@ S_to_utf8_substr(pTHX_ register regexp *prog)
 {
     SV* sv;
     if (prog->float_substr && !prog->float_utf8) {
-       prog->float_utf8 = sv = NEWSV(117, 0);
-       SvSetSV(sv, prog->float_substr);
+       prog->float_utf8 = sv = newSVsv(prog->float_substr);
        sv_utf8_upgrade(sv);
        if (SvTAIL(prog->float_substr))
            SvTAIL_on(sv);
@@ -4574,8 +4576,7 @@ S_to_utf8_substr(pTHX_ register regexp *prog)
            prog->check_utf8 = sv;
     }
     if (prog->anchored_substr && !prog->anchored_utf8) {
-       prog->anchored_utf8 = sv = NEWSV(118, 0);
-       SvSetSV(sv, prog->anchored_substr);
+       prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
        sv_utf8_upgrade(sv);
        if (SvTAIL(prog->anchored_substr))
            SvTAIL_on(sv);
@@ -4589,8 +4590,7 @@ S_to_byte_substr(pTHX_ register regexp *prog)
 {
     SV* sv;
     if (prog->float_utf8 && !prog->float_substr) {
-       prog->float_substr = sv = NEWSV(117, 0);
-       SvSetSV(sv, prog->float_utf8);
+       prog->float_substr = sv = newSVsv(prog->float_utf8);
        if (sv_utf8_downgrade(sv, TRUE)) {
            if (SvTAIL(prog->float_utf8))
                SvTAIL_on(sv);
@@ -4602,8 +4602,7 @@ S_to_byte_substr(pTHX_ register regexp *prog)
            prog->check_substr = sv;
     }
     if (prog->anchored_utf8 && !prog->anchored_substr) {
-       prog->anchored_substr = sv = NEWSV(118, 0);
-       SvSetSV(sv, prog->anchored_utf8);
+       prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
        if (sv_utf8_downgrade(sv, TRUE)) {
            if (SvTAIL(prog->anchored_utf8))
                SvTAIL_on(sv);