Re: uc($long_utf8_string) exhausts memory
Jarkko Hietaniemi [Sat, 22 Jan 2005 00:20:12 +0000 (02:20 +0200)]
Message-Id:  <41F1801C.3080201@iki.fi>

Make buffer size estimates for utf8 case conversion less maximally
pessimistic

p4raw-id: //depot/perl@23857

doop.c
op.c
pp.c
pp_pack.c
regcomp.c
regexec.c
sv.c
t/op/lc.t
toke.c
utf8.c
utf8.h

diff --git a/doop.c b/doop.c
index b1de08d..9e5a60d 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -338,7 +338,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)
 
     if (grows) {
        /* d needs to be bigger than s, in case e.g. upgrading is required */
-       New(0, d, len*3+UTF8_MAXLEN, U8);
+       New(0, d, len * 3 + UTF8_MAXBYTES, U8);
        dend = d + len * 3;
        dstart = d;
     }
@@ -370,10 +370,10 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)
 
        if (d > dend) {
            STRLEN clen = d - dstart;
-           STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+           STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
            if (!grows)
                Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__);
-           Renew(dstart, nlen+UTF8_MAXLEN, U8);
+           Renew(dstart, nlen + UTF8_MAXBYTES, U8);
            d = dstart + clen;
            dend = dstart + nlen;
        }
@@ -480,7 +480,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv)
 
     if (grows) {
        /* d needs to be bigger than s, in case e.g. upgrading is required */
-       New(0, d, len*3+UTF8_MAXLEN, U8);
+       New(0, d, len * 3 + UTF8_MAXBYTES, U8);
        dend = d + len * 3;
        dstart = d;
     }
@@ -496,10 +496,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv)
        
            if (d > dend) {
                STRLEN clen = d - dstart;
-               STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+               STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
                if (!grows)
                    Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
-               Renew(dstart, nlen+UTF8_MAXLEN, U8);
+               Renew(dstart, nlen + UTF8_MAXBYTES, U8);
                d = dstart + clen;
                dend = dstart + nlen;
            }
@@ -550,10 +550,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv)
            uv = swash_fetch(rv, s, TRUE);
            if (d > dend) {
                STRLEN clen = d - dstart;
-               STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+               STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
                if (!grows)
                    Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
-               Renew(dstart, nlen+UTF8_MAXLEN, U8);
+               Renew(dstart, nlen + UTF8_MAXBYTES, U8);
                d = dstart + clen;
                dend = dstart + nlen;
            }
diff --git a/op.c b/op.c
index ab9f374..77a213e 100644 (file)
--- a/op.c
+++ b/op.c
@@ -2423,7 +2423,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 */
 
        if (complement) {
-           U8 tmpbuf[UTF8_MAXLEN+1];
+           U8 tmpbuf[UTF8_MAXBYTES+1];
            UV *cp;
            UV nextmin = 0;
            New(1109, cp, 2*tlen, UV);
diff --git a/pp.c b/pp.c
index a90d9ee..12f5bfb 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -3344,7 +3344,7 @@ PP(pp_ord)
     }
 
     XPUSHu(DO_UTF8(argsv) ?
-          utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) :
+          utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, UTF8_ALLOW_ANYUV) :
           (*s & 0xff));
 
     RETURN;
@@ -3454,7 +3454,7 @@ PP(pp_ucfirst)
     if (DO_UTF8(sv) &&
        (s = (U8*)SvPV_nomg(sv, slen)) && slen &&
        UTF8_IS_START(*s)) {
-       U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+       U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
        STRLEN ulen;
        STRLEN tculen;
 
@@ -3517,7 +3517,7 @@ PP(pp_lcfirst)
        (s = (U8*)SvPV_nomg(sv, slen)) && slen &&
        UTF8_IS_START(*s)) {
        STRLEN ulen;
-       U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+       U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
        U8 *tend;
        UV uv;
 
@@ -3574,7 +3574,7 @@ PP(pp_uc)
        STRLEN ulen;
        register U8 *d;
        U8 *send;
-       U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+       U8 tmpbuf[UTF8_MAXBYTES+1];
 
        s = (U8*)SvPV_nomg(sv,len);
        if (!len) {
@@ -3583,18 +3583,28 @@ PP(pp_uc)
            SETs(TARG);
        }
        else {
-           STRLEN nchar = utf8_length(s, s + len);
-
            (void)SvUPGRADE(TARG, SVt_PV);
-           SvGROW(TARG, (nchar * UTF8_MAXLEN_UCLC) + 1);
+           SvGROW(TARG, len + 1);
            (void)SvPOK_only(TARG);
            d = (U8*)SvPVX(TARG);
            send = s + len;
            while (s < send) {
+               STRLEN u = UTF8SKIP(s);
+
                toUPPER_utf8(s, tmpbuf, &ulen);
+               if (ulen > u) {
+                   UV o = d - (U8*)SvPVX(TARG);
+
+                   /* If someone uppercases one million U+03B0s we
+                    * SvGROW() one million times.  Or we could try
+                    * guess how much to allocate without overdoing.
+                    * Such is life. */
+                   SvGROW(TARG, SvCUR(TARG) + ulen - u);
+                   d = (U8*)SvPVX(TARG) + o;
+               }
                Copy(tmpbuf, d, ulen, U8);
                d += ulen;
-               s += UTF8SKIP(s);
+               s += u;
            }
            *d = '\0';
            SvUTF8_on(TARG);
@@ -3643,7 +3653,7 @@ PP(pp_lc)
        STRLEN ulen;
        register U8 *d;
        U8 *send;
-       U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+       U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
 
        s = (U8*)SvPV_nomg(sv,len);
        if (!len) {
@@ -3652,16 +3662,16 @@ PP(pp_lc)
            SETs(TARG);
        }
        else {
-           STRLEN nchar = utf8_length(s, s + len);
-
            (void)SvUPGRADE(TARG, SVt_PV);
-           SvGROW(TARG, (nchar * UTF8_MAXLEN_UCLC) + 1);
+           SvGROW(TARG, len + 1);
            (void)SvPOK_only(TARG);
            d = (U8*)SvPVX(TARG);
            send = s + len;
            while (s < send) {
+               STRLEN u = UTF8SKIP(s);
                UV uv = toLOWER_utf8(s, tmpbuf, &ulen);
-#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode */
+
+#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */
                if (uv == GREEK_CAPITAL_LETTER_SIGMA) {
                     /*
                      * Now if the sigma is NOT followed by
@@ -3675,12 +3685,24 @@ PP(pp_lc)
                      * then it should be mapped to 0x03C2,
                      * (GREEK SMALL LETTER FINAL SIGMA),
                      * instead of staying 0x03A3.
-                     * See lib/unicore/SpecCase.txt.
+                     * "should be": in other words,
+                     * this is not implemented yet.
+                     * See lib/unicore/SpecialCasing.txt.
                      */
                }
+               if (ulen > u) {
+                   UV o = d - (U8*)SvPVX(TARG);
+
+                   /* If someone lowercases one million U+0130s we
+                    * SvGROW() one million times.  Or we could try
+                    * guess how much to allocate without overdoing.
+                    Such is life. */
+                   SvGROW(TARG, SvCUR(TARG) + ulen - u);
+                   d = (U8*)SvPVX(TARG) + o;
+               }
                Copy(tmpbuf, d, ulen, U8);
                d += ulen;
-               s += UTF8SKIP(s);
+               s += u;
            }
            *d = '\0';
            SvUTF8_on(TARG);
index 815c326..2e830f4 100644 (file)
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -2183,7 +2183,7 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV
            while (len-- > 0) {
                fromstr = NEXTFROM;
                auint = UNI_TO_NATIVE(SvUV(fromstr));
-               SvGROW(cat, SvCUR(cat) + UTF8_MAXLEN + 1);
+               SvGROW(cat, SvCUR(cat) + UTF8_MAXBYTES + 1);
                SvCUR_set(cat,
                          (char*)uvchr_to_utf8_flags((U8*)SvEND(cat),
                                                     auint,
index 2a942d5..ff939b2 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3088,7 +3088,7 @@ tryagain:
            char *oldp, *s;
            STRLEN numlen;
            STRLEN foldlen;
-           U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
+           U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
 
             parse_start = RExC_parse - 1;
 
@@ -4199,7 +4199,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                else if (prevnatvalue == natvalue) {
                    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
                    if (FOLD) {
-                        U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+                        U8 foldbuf[UTF8_MAXBYTES_CASE+1];
                         STRLEN foldlen;
                         UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
 
@@ -4869,7 +4869,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
        
            if (lv) {
                if (sw) {
-                   U8 s[UTF8_MAXLEN+1];
+                   U8 s[UTF8_MAXBYTES_CASE+1];
                
                    for (i = 0; i <= 256; i++) { /* just the first 256 */
                        U8 *e = uvchr_to_utf8(s, i);
index 6ed6d95..f254713 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1028,15 +1028,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            if (UTF) {
                STRLEN ulen1, ulen2;
                U8 *sm = (U8 *) m;
-               U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
-               U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+               U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+               U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
 
                to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
                to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
 
-               c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, 
+               c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE, 
                                    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-               c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+               c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
                                    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
                lnc = 0;
                while (sm < ((U8 *) m + ln)) {
@@ -1074,15 +1074,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 
            if (do_utf8) {
                UV c, f;
-               U8 tmpbuf [UTF8_MAXLEN+1];
-               U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+               U8 tmpbuf [UTF8_MAXBYTES+1];
+               U8 foldbuf[UTF8_MAXBYTES_CASE+1];
                STRLEN len, foldlen;
                
                if (c1 == c2) {
                    /* Upper and lower of 1st char are equal -
                     * probably not a "letter". */
                    while (s <= e) {
-                       c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+                       c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY);
                        if ( c == c1
@@ -1109,7 +1109,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                }
                else {
                    while (s <= e) {
-                     c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+                     c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY);
 
@@ -2459,7 +2459,7 @@ S_regmatch(pTHX_ regnode *prog)
                        if (l >= PL_regeol)
                             sayNO;
                        if (NATIVE_TO_UNI(*(U8*)s) !=
-                           utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+                           utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY))
                             sayNO;
@@ -2473,7 +2473,7 @@ S_regmatch(pTHX_ regnode *prog)
                        if (l >= PL_regeol)
                            sayNO;
                        if (NATIVE_TO_UNI(*((U8*)l)) !=
-                           utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+                           utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY))
                            sayNO;
@@ -2806,8 +2806,8 @@ S_regmatch(pTHX_ regnode *prog)
                 */
                if (OP(scan) == REFF) {
                    STRLEN ulen1, ulen2;
-                   U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
-                   U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+                   U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+                   U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
                    while (s < e) {
                        if (l >= PL_regeol)
                            sayNO;
@@ -3580,21 +3580,21 @@ S_regmatch(pTHX_ regnode *prog)
                    else { /* UTF */
                        if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
                             STRLEN ulen1, ulen2;
-                            U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
-                            U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+                            U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+                            U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
 
                             to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
                             to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
 
-                            c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+                            c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
                                                 ckWARN(WARN_UTF8) ?
                                                 0 : UTF8_ALLOW_ANY);
-                            c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+                            c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
                                                 ckWARN(WARN_UTF8) ?
                                                 0 : UTF8_ALLOW_ANY);
                        }
                        else {
-                           c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+                           c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
                                                     ckWARN(WARN_UTF8) ?
                                                     0 : UTF8_ALLOW_ANY);
                        }
@@ -3656,7 +3656,7 @@ S_regmatch(pTHX_ regnode *prog)
                                 * utf8_distance(old, locinput) */
                                while (locinput <= e &&
                                       utf8n_to_uvchr((U8*)locinput,
-                                                     UTF8_MAXLEN, &len,
+                                                     UTF8_MAXBYTES, &len,
                                                      ckWARN(WARN_UTF8) ?
                                                      0 : UTF8_ALLOW_ANY) != (UV)c1) {
                                    locinput += len;
@@ -3667,7 +3667,7 @@ S_regmatch(pTHX_ regnode *prog)
                                 * utf8_distance(old, locinput) */
                                while (locinput <= e) {
                                    UV c = utf8n_to_uvchr((U8*)locinput,
-                                                         UTF8_MAXLEN, &len,
+                                                         UTF8_MAXBYTES, &len,
                                                          ckWARN(WARN_UTF8) ?
                                                          0 : UTF8_ALLOW_ANY);
                                    if (c == (UV)c1 || c == (UV)c2)
@@ -3704,7 +3704,7 @@ S_regmatch(pTHX_ regnode *prog)
                    if (c1 != -1000) {
                        if (do_utf8)
                            c = utf8n_to_uvchr((U8*)PL_reginput,
-                                              UTF8_MAXLEN, 0,
+                                              UTF8_MAXBYTES, 0,
                                               ckWARN(WARN_UTF8) ?
                                               0 : UTF8_ALLOW_ANY);
                        else
@@ -3754,7 +3754,7 @@ S_regmatch(pTHX_ regnode *prog)
                        if (c1 != -1000) {
                            if (do_utf8)
                                c = utf8n_to_uvchr((U8*)PL_reginput,
-                                                  UTF8_MAXLEN, 0,
+                                                  UTF8_MAXBYTES, 0,
                                                   ckWARN(WARN_UTF8) ?
                                                   0 : UTF8_ALLOW_ANY);
                            else
@@ -3777,7 +3777,7 @@ S_regmatch(pTHX_ regnode *prog)
                        if (c1 != -1000) {
                            if (do_utf8)
                                c = utf8n_to_uvchr((U8*)PL_reginput,
-                                                  UTF8_MAXLEN, 0,
+                                                  UTF8_MAXBYTES, 0,
                                                   ckWARN(WARN_UTF8) ?
                                                   0 : UTF8_ALLOW_ANY);
                            else
@@ -4370,7 +4370,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
     STRLEN plen;
 
     if (do_utf8 && !UTF8_IS_INVARIANT(c))
-        c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+        c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
                            ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 
     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
@@ -4407,7 +4407,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
                        }
                    }
                    if (!match) {
-                       U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+                       U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
                        STRLEN tmplen;
 
                        to_utf8_fold(p, tmpbuf, &tmplen);
diff --git a/sv.c b/sv.c
index c7211b7..343c5f0 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -9271,7 +9271,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 #endif
 
        char esignbuf[4];
-       U8 utf8buf[UTF8_MAXLEN+1];
+       U8 utf8buf[UTF8_MAXBYTES+1];
        STRLEN esignlen = 0;
 
        char *eptr = Nullch;
index cee6661..d75b64e 100644 (file)
--- a/t/op/lc.t
+++ b/t/op/lc.t
@@ -111,8 +111,8 @@ is("\u\x{587}" , "\x{535}\x{582}", "ligature titlecase");
 is("\U\x{587}" , "\x{535}\x{552}", "ligature uppercase");
 
 # mktables had problems where many-to-one case mappings didn't work right.
-# The lib/unifold.t should give the fourth folding, "casefolding", a good
-# workout. 
+# The lib/uni/fold.t should give the fourth folding, "casefolding", a good
+# workout (one cannot directly get that from Perl). 
 # \x{01C4} is LATIN CAPITAL LETTER DZ WITH CARON
 # \x{01C5} is LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
 # \x{01C6} is LATIN SMALL LETTER DZ WITH CARON
@@ -161,6 +161,5 @@ for my $a (0,1) {
        chop $a;
        $a =~ s/^(\s*)(\w*)/$1\u$2/;
        is($a, v10, "[perl #18857]");
-       $test++;
     } 
 }
diff --git a/toke.c b/toke.c
index b4222f6..5d5abf4 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1704,7 +1704,7 @@ S_scan_const(pTHX_ char *start)
                         UV uv = utf8_to_uvchr((U8*)str, 0);
 
                         if (uv < 0x100) {
-                             U8 tmpbuf[UTF8_MAXLEN+1], *d;
+                             U8 tmpbuf[UTF8_MAXBYTES+1], *d;
 
                              d = uvchr_to_utf8(tmpbuf, UNI_TO_NATIVE(uv));
                              sv_setpvn(res, (char *)tmpbuf, d - tmpbuf);
@@ -7137,7 +7137,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
     I32 brackets = 1;                  /* bracket nesting level */
     bool has_utf8 = FALSE;             /* is there any utf8 content? */
     I32 termcode;                      /* terminating char. code */
-    U8 termstr[UTF8_MAXLEN];           /* terminating string */
+    U8 termstr[UTF8_MAXBYTES];         /* terminating string */
     STRLEN termlen;                    /* length of terminating string */
     char *last = NULL;                 /* last position for nesting bracket */
 
@@ -8230,7 +8230,7 @@ Perl_scan_vstring(pTHX_ char *s, SV *sv)
 
     if (!isALPHA(*pos)) {
        UV rev;
-       U8 tmpbuf[UTF8_MAXLEN+1];
+       U8 tmpbuf[UTF8_MAXBYTES+1];
        U8 *tmpend;
 
        if (*s == 'v') s++;  /* get past 'v' */
diff --git a/utf8.c b/utf8.c
index 5c6f271..5fa2aab 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -38,7 +38,7 @@ within non-zero characters.
 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
 
 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
 bytes available. The return value is the pointer to the byte after the
 end of the new character. In other words,
 
@@ -551,7 +551,7 @@ returned and retlen is set, if possible, to -1.
 UV
 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 {
-    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
+    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
                               ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 }
 
@@ -575,7 +575,7 @@ UV
 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 {
     /* Call the low level routine asking for checks */
-    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
+    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
                               ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 }
 
@@ -937,7 +937,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 bool
 Perl_is_uni_alnum(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_alnum(tmpbuf);
 }
@@ -945,7 +945,7 @@ Perl_is_uni_alnum(pTHX_ UV c)
 bool
 Perl_is_uni_alnumc(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_alnumc(tmpbuf);
 }
@@ -953,7 +953,7 @@ Perl_is_uni_alnumc(pTHX_ UV c)
 bool
 Perl_is_uni_idfirst(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_idfirst(tmpbuf);
 }
@@ -961,7 +961,7 @@ Perl_is_uni_idfirst(pTHX_ UV c)
 bool
 Perl_is_uni_alpha(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_alpha(tmpbuf);
 }
@@ -969,7 +969,7 @@ Perl_is_uni_alpha(pTHX_ UV c)
 bool
 Perl_is_uni_ascii(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_ascii(tmpbuf);
 }
@@ -977,7 +977,7 @@ Perl_is_uni_ascii(pTHX_ UV c)
 bool
 Perl_is_uni_space(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_space(tmpbuf);
 }
@@ -985,7 +985,7 @@ Perl_is_uni_space(pTHX_ UV c)
 bool
 Perl_is_uni_digit(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_digit(tmpbuf);
 }
@@ -993,7 +993,7 @@ Perl_is_uni_digit(pTHX_ UV c)
 bool
 Perl_is_uni_upper(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_upper(tmpbuf);
 }
@@ -1001,7 +1001,7 @@ Perl_is_uni_upper(pTHX_ UV c)
 bool
 Perl_is_uni_lower(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_lower(tmpbuf);
 }
@@ -1009,7 +1009,7 @@ Perl_is_uni_lower(pTHX_ UV c)
 bool
 Perl_is_uni_cntrl(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_cntrl(tmpbuf);
 }
@@ -1017,7 +1017,7 @@ Perl_is_uni_cntrl(pTHX_ UV c)
 bool
 Perl_is_uni_graph(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_graph(tmpbuf);
 }
@@ -1025,7 +1025,7 @@ Perl_is_uni_graph(pTHX_ UV c)
 bool
 Perl_is_uni_print(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_print(tmpbuf);
 }
@@ -1033,7 +1033,7 @@ Perl_is_uni_print(pTHX_ UV c)
 bool
 Perl_is_uni_punct(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_punct(tmpbuf);
 }
@@ -1041,7 +1041,7 @@ Perl_is_uni_punct(pTHX_ UV c)
 bool
 Perl_is_uni_xdigit(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_xdigit(tmpbuf);
 }
@@ -1166,7 +1166,7 @@ Perl_to_uni_upper_lc(pTHX_ U32 c)
     /* XXX returns only the first character -- do not use XXX */
     /* XXX no locale support yet */
     STRLEN len;
-    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     return (U32)to_uni_upper(c, tmpbuf, &len);
 }
 
@@ -1176,7 +1176,7 @@ Perl_to_uni_title_lc(pTHX_ U32 c)
     /* XXX returns only the first character XXX -- do not use XXX */
     /* XXX no locale support yet */
     STRLEN len;
-    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     return (U32)to_uni_title(c, tmpbuf, &len);
 }
 
@@ -1186,7 +1186,7 @@ Perl_to_uni_lower_lc(pTHX_ U32 c)
     /* XXX returns only the first character -- do not use XXX */
     /* XXX no locale support yet */
     STRLEN len;
-    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     return (U32)to_uni_lower(c, tmpbuf, &len);
 }
 
@@ -1400,7 +1400,7 @@ UV
 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
 {
     UV uv0, uv1;
-    U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     STRLEN len = 0;
 
     uv0 = utf8_to_uvchr(p, 0);
@@ -1489,9 +1489,8 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma
 
 Convert the UTF-8 encoded character at p to its uppercase version and
 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
-uppercase version may be longer than the original character (up to two
-characters).
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
+the uppercase version may be longer than the original character.
 
 The first character of the uppercased version is returned
 (but note, as explained above, that there may be more.)
@@ -1510,9 +1509,8 @@ Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 
 Convert the UTF-8 encoded character at p to its titlecase version and
 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
-titlecase version may be longer than the original character (up to two
-characters).
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
+titlecase version may be longer than the original character.
 
 The first character of the titlecased version is returned
 (but note, as explained above, that there may be more.)
@@ -1531,9 +1529,8 @@ Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 
 Convert the UTF-8 encoded character at p to its lowercase version and
 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
-lowercase version may be longer than the original character (up to two
-characters).
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
+lowercase version may be longer than the original character.
 
 The first character of the lowercased version is returned
 (but note, as explained above, that there may be more.)
@@ -1552,7 +1549,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 
 Convert the UTF-8 encoded character at p to its foldcase version and
 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
 foldcase version may be longer than the original character (up to
 three characters).
 
@@ -1711,7 +1708,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
            /* We use utf8n_to_uvuni() as we want an index into
               Unicode tables, not a native character number.
             */
-           UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
+           UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY);
            SV *errsv_save;
@@ -1778,7 +1775,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
 
 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
 bytes available. The return value is the pointer to the byte after the
 end of the new character. In other words,
 
@@ -1954,8 +1951,8 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const
      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
      STRLEN n1 = 0, n2 = 0;
-     U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
-     U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
+     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
+     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
      U8 natbuf[1+1];
      STRLEN foldlen1, foldlen2;
      bool match;
diff --git a/utf8.h b/utf8.h
index 376280d..c206b3d 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -162,14 +162,26 @@ encoded character.
 #define isIDFIRST_lazy(p)      isIDFIRST_lazy_if(p,1)
 #define isALNUM_lazy(p)                isALNUM_lazy_if(p,1)
 
-/* how wide can a single UTF-8 encoded character become */
-#define UTF8_MAXLEN 13
-/* how wide a character can become when upper/lowercased */
-#define UTF8_MAXLEN_UCLC_MULT 3
-#define UTF8_MAXLEN_UCLC (UTF8_MAXLEN*UTF8_MAXLEN_UCLC_MULT)
-/* how wide a character can become when casefolded */
-#define UTF8_MAXLEN_FOLD_MULT 3
-#define UTF8_MAXLEN_FOLD (UTF8_MAXLEN*UTF8_MAXLEN_FOLD_MULT)
+#define UTF8_MAXBYTES 13
+/* How wide can a single UTF-8 encoded character become in bytes.
+ * NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8
+ * since UTF-8 is an encoding of Unicode and given Unicode's current
+ * upper limit only four bytes is possible.  Perl thinks of UTF-8
+ * as a way to encode non-negative integers in a binary format. */
+#define UTF8_MAXLEN UTF8_MAXBYTES
+
+#define UTF8_MAXLEN_UCLC 3             /* Obsolete, do not use. */
+#define UTF8_MAXLEN_UCLC_MULT 39       /* Obsolete, do not use. */
+#define UTF8_MAXLEN_FOLD 3             /* Obsolete, do not use. */
+#define UTF8_MAXLEN_FOLD_MULT 39       /* Obsolete, do not use. */
+
+/* The maximum number of UTF-8 bytes a single Unicode character can
+ * uppercase/lowercase/fold into; this number depends on the Unicode
+ * version.  An example of maximal expansion is the U+03B0 which
+ * uppercases to U+03C5 U+0308 U+0301.  The Unicode databases that
+ * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and
+ * SpecialCasing.txt. */
+#define UTF8_MAXBYTES_CASE     6
 
 #define IN_BYTES (PL_curcop->op_private & HINT_BYTES)
 #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)