Various tweaks to Encode
[p5sagit/p5-mst-13.2.git] / regexec.c
index 5dbe166..4380fd8 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -999,8 +999,10 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
                to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
 
-               c1 = utf8_to_uvchr(tmpbuf1, 0);
-               c2 = utf8_to_uvchr(tmpbuf2, 0);
+               c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, 
+                                   0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+               c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+                                   0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
            }
            else {
                c1 = *(U8*)m;
@@ -1013,7 +1015,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            c1 = *(U8*)m;
            c2 = PL_fold_locale[c1];
          do_exactf:
-           e = do_utf8 ? s + ln : strend - ln;
+           e = HOP3c(strend, -(I32)ln, s);
 
            if (norun && e < s)
                e = s;                  /* Due to minlen logic of intuit() */
@@ -1037,7 +1039,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                
                if (c1 == c2) {
                    while (s <= e) {
-                       c = utf8_to_uvchr((U8*)s, &len);
+                       c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+                                          ckWARN(WARN_UTF8) ?
+                                          0 : UTF8_ALLOW_ANY);
                        if ( c == c1
                             && (ln == len ||
                                 ibcmp_utf8(s, (char **)0, 0,  do_utf8,
@@ -1062,7 +1066,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                }
                else {
                    while (s <= e) {
-                       c = utf8_to_uvchr((U8*)s, &len);
+                     c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+                                          ckWARN(WARN_UTF8) ?
+                                          0 : UTF8_ALLOW_ANY);
 
                        /* Handle some of the three Greek sigmas cases.
                         * Note that not all the possible combinations
@@ -1129,11 +1135,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                if (s == PL_bostr)
                    tmp = '\n';
                else {
-                   U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
+                   U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
                
-                   tmp = s > (char*)r ?
-                     utf8n_to_uvchr(r, s - (char*)r, 0, 0) :
-                     utf8n_to_uvchr(s, UTF8SKIP(s), 0, 0);
+                   tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
                }
                tmp = ((OP(c) == BOUND ?
                        isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
@@ -1174,11 +1178,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                if (s == PL_bostr)
                    tmp = '\n';
                else {
-                   U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
+                   U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
                
-                   tmp = s > (char*)r ?
-                     utf8n_to_uvchr(r, s - (char*)r, 0, 0) :
-                     utf8n_to_uvchr(s, UTF8SKIP(s), 0, 0);
+                   tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
                }
                tmp = ((OP(c) == NBOUND ?
                        isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
@@ -2394,7 +2396,9 @@ S_regmatch(pTHX_ regnode *prog)
                        if (l >= PL_regeol)
                             sayNO;
                        if (NATIVE_TO_UNI(*(U8*)s) !=
-                           utf8_to_uvuni((U8*)l, &ulen))
+                           utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+                                          ckWARN(WARN_UTF8) ?
+                                          0 : UTF8_ALLOW_ANY))
                             sayNO;
                        l += ulen;
                        s ++;
@@ -2406,7 +2410,9 @@ S_regmatch(pTHX_ regnode *prog)
                        if (l >= PL_regeol)
                            sayNO;
                        if (NATIVE_TO_UNI(*((U8*)l)) !=
-                           utf8_to_uvuni((U8*)s, &ulen))
+                           utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+                                          ckWARN(WARN_UTF8) ?
+                                          0 : UTF8_ALLOW_ANY))
                            sayNO;
                        s += ulen;
                        l ++;
@@ -2565,11 +2571,9 @@ S_regmatch(pTHX_ regnode *prog)
                if (locinput == PL_bostr)
                    ln = '\n';
                else {
-                   U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_reg_starttry);
+                   U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
                
-                   ln = locinput > (char*)r ?
-                     utf8n_to_uvchr(r, locinput - (char*)r, 0, 0) :
-                     utf8n_to_uvchr(locinput, UTF8SKIP(locinput), 0, 0);
+                   ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
                }
                if (OP(scan) == BOUND || OP(scan) == NBOUND) {
                    ln = isALNUM_uni(ln);
@@ -3551,11 +3555,17 @@ S_regmatch(pTHX_ regnode *prog)
                             to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
                             to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
 
-                            c1 = utf8_to_uvuni(tmpbuf1, 0);
-                            c2 = utf8_to_uvuni(tmpbuf2, 0);
+                            c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+                                                ckWARN(WARN_UTF8) ?
+                                                0 : UTF8_ALLOW_ANY);
+                            c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+                                                ckWARN(WARN_UTF8) ?
+                                                0 : UTF8_ALLOW_ANY);
                        }
                        else {
-                           c2 = c1 = utf8_to_uvchr(s, NULL);
+                           c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+                                                    ckWARN(WARN_UTF8) ?
+                                                    0 : UTF8_ALLOW_ANY);
                        }
                    }
                }
@@ -3574,6 +3584,7 @@ S_regmatch(pTHX_ regnode *prog)
                if (c1 != -1000) {
                    char *e; /* Should not check after this */
                    char *old = locinput;
+                   int count = 0;
 
                    if  (n == REG_INFTY) {
                        e = PL_regeol - 1;
@@ -3593,7 +3604,6 @@ S_regmatch(pTHX_ regnode *prog)
                            e = PL_regeol - 1;
                    }
                    while (1) {
-                       int count;
                        /* Find place 'next' could work */
                        if (!do_utf8) {
                            if (c1 == c2) {
@@ -3611,18 +3621,28 @@ S_regmatch(pTHX_ regnode *prog)
                        else {
                            STRLEN len;
                            if (c1 == c2) {
-                               for (count = 0;
-                                    locinput <= e &&
-                                        utf8_to_uvchr((U8*)locinput, &len) != c1;
-                                    count++)
+                               /* count initialised to
+                                * utf8_distance(old, locinput) */
+                               while (locinput <= e &&
+                                      utf8n_to_uvchr((U8*)locinput,
+                                                     UTF8_MAXLEN, &len,
+                                                     ckWARN(WARN_UTF8) ?
+                                                     0 : UTF8_ALLOW_ANY) != c1) {
                                    locinput += len;
-                               
+                                   count++;
+                               }
                            } else {
-                               for (count = 0; locinput <= e; count++) {
-                                   UV c = utf8_to_uvchr((U8*)locinput, &len);
+                               /* count initialised to
+                                * utf8_distance(old, locinput) */
+                               while (locinput <= e) {
+                                   UV c = utf8n_to_uvchr((U8*)locinput,
+                                                         UTF8_MAXLEN, &len,
+                                                         ckWARN(WARN_UTF8) ?
+                                                         0 : UTF8_ALLOW_ANY);
                                    if (c == c1 || c == c2)
                                        break;
-                                   locinput += len;                    
+                                   locinput += len;
+                                   count++;
                                }
                            }
                        }
@@ -3644,6 +3664,7 @@ S_regmatch(pTHX_ regnode *prog)
                            locinput += UTF8SKIP(locinput);
                        else
                            locinput++;
+                       count = 1;
                    }
                }
                else
@@ -3651,7 +3672,10 @@ S_regmatch(pTHX_ regnode *prog)
                    UV c;
                    if (c1 != -1000) {
                        if (do_utf8)
-                           c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+                           c = utf8n_to_uvchr((U8*)PL_reginput,
+                                              UTF8_MAXLEN, 0,
+                                              ckWARN(WARN_UTF8) ?
+                                              0 : UTF8_ALLOW_ANY);
                        else
                            c = UCHARAT(PL_reginput);
                        /* If it could work, try it. */
@@ -3698,7 +3722,10 @@ S_regmatch(pTHX_ regnode *prog)
                    while (n >= ln) {
                        if (c1 != -1000) {
                            if (do_utf8)
-                               c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+                               c = utf8n_to_uvchr((U8*)PL_reginput,
+                                                  UTF8_MAXLEN, 0,
+                                                  ckWARN(WARN_UTF8) ?
+                                                  0 : UTF8_ALLOW_ANY);
                            else
                                c = UCHARAT(PL_reginput);
                        }
@@ -3718,7 +3745,10 @@ S_regmatch(pTHX_ regnode *prog)
                    while (n >= ln) {
                        if (c1 != -1000) {
                            if (do_utf8)
-                               c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+                               c = utf8n_to_uvchr((U8*)PL_reginput,
+                                                  UTF8_MAXLEN, 0,
+                                                  ckWARN(WARN_UTF8) ?
+                                                  0 : UTF8_ALLOW_ANY);
                            else
                                c = UCHARAT(PL_reginput);
                        }
@@ -4300,7 +4330,8 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
     STRLEN len = 0;
     STRLEN plen;
 
-    c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
+    c = do_utf8 ? utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) : *p;
 
     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
     if (do_utf8 || (flags & ANYOF_UNICODE)) {