From: Jarkko Hietaniemi Date: Sat, 22 Jan 2005 00:20:12 +0000 (+0200) Subject: Re: uc($long_utf8_string) exhausts memory X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=89ebb4a3f2a55825eeed13aaf58db5c73d2140ef;p=p5sagit%2Fp5-mst-13.2.git Re: uc($long_utf8_string) exhausts memory Message-Id: <41F1801C.3080201@iki.fi> Make buffer size estimates for utf8 case conversion less maximally pessimistic p4raw-id: //depot/perl@23857 --- diff --git a/doop.c b/doop.c index b1de08d..9e5a60d 100644 --- a/doop.c +++ b/doop.c @@ -338,7 +338,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv) if (grows) { /* d needs to be bigger than s, in case e.g. upgrading is required */ - New(0, d, len*3+UTF8_MAXLEN, U8); + New(0, d, len * 3 + UTF8_MAXBYTES, U8); dend = d + len * 3; dstart = d; } @@ -370,10 +370,10 @@ S_do_trans_simple_utf8(pTHX_ SV *sv) if (d > dend) { STRLEN clen = d - dstart; - STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; + STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES; if (!grows) Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__); - Renew(dstart, nlen+UTF8_MAXLEN, U8); + Renew(dstart, nlen + UTF8_MAXBYTES, U8); d = dstart + clen; dend = dstart + nlen; } @@ -480,7 +480,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) if (grows) { /* d needs to be bigger than s, in case e.g. upgrading is required */ - New(0, d, len*3+UTF8_MAXLEN, U8); + New(0, d, len * 3 + UTF8_MAXBYTES, U8); dend = d + len * 3; dstart = d; } @@ -496,10 +496,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) if (d > dend) { STRLEN clen = d - dstart; - STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; + STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES; if (!grows) Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__); - Renew(dstart, nlen+UTF8_MAXLEN, U8); + Renew(dstart, nlen + UTF8_MAXBYTES, U8); d = dstart + clen; dend = dstart + nlen; } @@ -550,10 +550,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) uv = swash_fetch(rv, s, TRUE); if (d > dend) { STRLEN clen = d - dstart; - STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; + STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES; if (!grows) Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__); - Renew(dstart, nlen+UTF8_MAXLEN, U8); + Renew(dstart, nlen + UTF8_MAXBYTES, U8); d = dstart + clen; dend = dstart + nlen; } diff --git a/op.c b/op.c index ab9f374..77a213e 100644 --- a/op.c +++ b/op.c @@ -2423,7 +2423,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) */ if (complement) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; UV *cp; UV nextmin = 0; New(1109, cp, 2*tlen, UV); diff --git a/pp.c b/pp.c index a90d9ee..12f5bfb 100644 --- a/pp.c +++ b/pp.c @@ -3344,7 +3344,7 @@ PP(pp_ord) } XPUSHu(DO_UTF8(argsv) ? - utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) : + utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, UTF8_ALLOW_ANYUV) : (*s & 0xff)); RETURN; @@ -3454,7 +3454,7 @@ PP(pp_ucfirst) if (DO_UTF8(sv) && (s = (U8*)SvPV_nomg(sv, slen)) && slen && UTF8_IS_START(*s)) { - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; STRLEN ulen; STRLEN tculen; @@ -3517,7 +3517,7 @@ PP(pp_lcfirst) (s = (U8*)SvPV_nomg(sv, slen)) && slen && UTF8_IS_START(*s)) { STRLEN ulen; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; U8 *tend; UV uv; @@ -3574,7 +3574,7 @@ PP(pp_uc) STRLEN ulen; register U8 *d; U8 *send; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; s = (U8*)SvPV_nomg(sv,len); if (!len) { @@ -3583,18 +3583,28 @@ PP(pp_uc) SETs(TARG); } else { - STRLEN nchar = utf8_length(s, s + len); - (void)SvUPGRADE(TARG, SVt_PV); - SvGROW(TARG, (nchar * UTF8_MAXLEN_UCLC) + 1); + SvGROW(TARG, len + 1); (void)SvPOK_only(TARG); d = (U8*)SvPVX(TARG); send = s + len; while (s < send) { + STRLEN u = UTF8SKIP(s); + toUPPER_utf8(s, tmpbuf, &ulen); + if (ulen > u) { + UV o = d - (U8*)SvPVX(TARG); + + /* If someone uppercases one million U+03B0s we + * SvGROW() one million times. Or we could try + * guess how much to allocate without overdoing. + * Such is life. */ + SvGROW(TARG, SvCUR(TARG) + ulen - u); + d = (U8*)SvPVX(TARG) + o; + } Copy(tmpbuf, d, ulen, U8); d += ulen; - s += UTF8SKIP(s); + s += u; } *d = '\0'; SvUTF8_on(TARG); @@ -3643,7 +3653,7 @@ PP(pp_lc) STRLEN ulen; register U8 *d; U8 *send; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; s = (U8*)SvPV_nomg(sv,len); if (!len) { @@ -3652,16 +3662,16 @@ PP(pp_lc) SETs(TARG); } else { - STRLEN nchar = utf8_length(s, s + len); - (void)SvUPGRADE(TARG, SVt_PV); - SvGROW(TARG, (nchar * UTF8_MAXLEN_UCLC) + 1); + SvGROW(TARG, len + 1); (void)SvPOK_only(TARG); d = (U8*)SvPVX(TARG); send = s + len; while (s < send) { + STRLEN u = UTF8SKIP(s); UV uv = toLOWER_utf8(s, tmpbuf, &ulen); -#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode */ + +#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */ if (uv == GREEK_CAPITAL_LETTER_SIGMA) { /* * Now if the sigma is NOT followed by @@ -3675,12 +3685,24 @@ PP(pp_lc) * then it should be mapped to 0x03C2, * (GREEK SMALL LETTER FINAL SIGMA), * instead of staying 0x03A3. - * See lib/unicore/SpecCase.txt. + * "should be": in other words, + * this is not implemented yet. + * See lib/unicore/SpecialCasing.txt. */ } + if (ulen > u) { + UV o = d - (U8*)SvPVX(TARG); + + /* If someone lowercases one million U+0130s we + * SvGROW() one million times. Or we could try + * guess how much to allocate without overdoing. + Such is life. */ + SvGROW(TARG, SvCUR(TARG) + ulen - u); + d = (U8*)SvPVX(TARG) + o; + } Copy(tmpbuf, d, ulen, U8); d += ulen; - s += UTF8SKIP(s); + s += u; } *d = '\0'; SvUTF8_on(TARG); diff --git a/pp_pack.c b/pp_pack.c index 815c326..2e830f4 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -2183,7 +2183,7 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV while (len-- > 0) { fromstr = NEXTFROM; auint = UNI_TO_NATIVE(SvUV(fromstr)); - SvGROW(cat, SvCUR(cat) + UTF8_MAXLEN + 1); + SvGROW(cat, SvCUR(cat) + UTF8_MAXBYTES + 1); SvCUR_set(cat, (char*)uvchr_to_utf8_flags((U8*)SvEND(cat), auint, diff --git a/regcomp.c b/regcomp.c index 2a942d5..ff939b2 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3088,7 +3088,7 @@ tryagain: char *oldp, *s; STRLEN numlen; STRLEN foldlen; - U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; parse_start = RExC_parse - 1; @@ -4199,7 +4199,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) else if (prevnatvalue == natvalue) { Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); if (FOLD) { - U8 foldbuf[UTF8_MAXLEN_FOLD+1]; + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; UV f = to_uni_fold(natvalue, foldbuf, &foldlen); @@ -4869,7 +4869,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) if (lv) { if (sw) { - U8 s[UTF8_MAXLEN+1]; + U8 s[UTF8_MAXBYTES_CASE+1]; for (i = 0; i <= 256; i++) { /* just the first 256 */ U8 *e = uvchr_to_utf8(s, i); diff --git a/regexec.c b/regexec.c index 6ed6d95..f254713 100644 --- a/regexec.c +++ b/regexec.c @@ -1028,15 +1028,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (UTF) { STRLEN ulen1, ulen2; U8 *sm = (U8 *) m; - U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; - U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; + U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; to_utf8_lower((U8*)m, tmpbuf1, &ulen1); to_utf8_upper((U8*)m, tmpbuf2, &ulen2); - c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, + c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); - c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC, + c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); lnc = 0; while (sm < ((U8 *) m + ln)) { @@ -1074,15 +1074,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (do_utf8) { UV c, f; - U8 tmpbuf [UTF8_MAXLEN+1]; - U8 foldbuf[UTF8_MAXLEN_FOLD+1]; + U8 tmpbuf [UTF8_MAXBYTES+1]; + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN len, foldlen; if (c1 == c2) { /* Upper and lower of 1st char are equal - * probably not a "letter". */ while (s <= e) { - c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, + c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); if ( c == c1 @@ -1109,7 +1109,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else { while (s <= e) { - c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, + c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); @@ -2459,7 +2459,7 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*(U8*)s) != - utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen, + utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)) sayNO; @@ -2473,7 +2473,7 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*((U8*)l)) != - utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen, + utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)) sayNO; @@ -2806,8 +2806,8 @@ S_regmatch(pTHX_ regnode *prog) */ if (OP(scan) == REFF) { STRLEN ulen1, ulen2; - U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; - U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; + U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; while (s < e) { if (l >= PL_regeol) sayNO; @@ -3580,21 +3580,21 @@ S_regmatch(pTHX_ regnode *prog) else { /* UTF */ if (OP(text_node) == EXACTF || OP(text_node) == REFF) { STRLEN ulen1, ulen2; - U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; - U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; + U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; to_utf8_lower((U8*)s, tmpbuf1, &ulen1); to_utf8_upper((U8*)s, tmpbuf2, &ulen2); - c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0, + c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); - c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0, + c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } else { - c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0, + c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } @@ -3656,7 +3656,7 @@ S_regmatch(pTHX_ regnode *prog) * utf8_distance(old, locinput) */ while (locinput <= e && utf8n_to_uvchr((U8*)locinput, - UTF8_MAXLEN, &len, + UTF8_MAXBYTES, &len, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) != (UV)c1) { locinput += len; @@ -3667,7 +3667,7 @@ S_regmatch(pTHX_ regnode *prog) * utf8_distance(old, locinput) */ while (locinput <= e) { UV c = utf8n_to_uvchr((U8*)locinput, - UTF8_MAXLEN, &len, + UTF8_MAXBYTES, &len, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); if (c == (UV)c1 || c == (UV)c2) @@ -3704,7 +3704,7 @@ S_regmatch(pTHX_ regnode *prog) if (c1 != -1000) { if (do_utf8) c = utf8n_to_uvchr((U8*)PL_reginput, - UTF8_MAXLEN, 0, + UTF8_MAXBYTES, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); else @@ -3754,7 +3754,7 @@ S_regmatch(pTHX_ regnode *prog) if (c1 != -1000) { if (do_utf8) c = utf8n_to_uvchr((U8*)PL_reginput, - UTF8_MAXLEN, 0, + UTF8_MAXBYTES, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); else @@ -3777,7 +3777,7 @@ S_regmatch(pTHX_ regnode *prog) if (c1 != -1000) { if (do_utf8) c = utf8n_to_uvchr((U8*)PL_reginput, - UTF8_MAXLEN, 0, + UTF8_MAXBYTES, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); else @@ -4370,7 +4370,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b STRLEN plen; if (do_utf8 && !UTF8_IS_INVARIANT(c)) - c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len, + c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); @@ -4407,7 +4407,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b } } if (!match) { - U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; STRLEN tmplen; to_utf8_fold(p, tmpbuf, &tmplen); diff --git a/sv.c b/sv.c index c7211b7..343c5f0 100644 --- a/sv.c +++ b/sv.c @@ -9271,7 +9271,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV #endif char esignbuf[4]; - U8 utf8buf[UTF8_MAXLEN+1]; + U8 utf8buf[UTF8_MAXBYTES+1]; STRLEN esignlen = 0; char *eptr = Nullch; diff --git a/t/op/lc.t b/t/op/lc.t index cee6661..d75b64e 100644 --- a/t/op/lc.t +++ b/t/op/lc.t @@ -111,8 +111,8 @@ is("\u\x{587}" , "\x{535}\x{582}", "ligature titlecase"); is("\U\x{587}" , "\x{535}\x{552}", "ligature uppercase"); # mktables had problems where many-to-one case mappings didn't work right. -# The lib/unifold.t should give the fourth folding, "casefolding", a good -# workout. +# The lib/uni/fold.t should give the fourth folding, "casefolding", a good +# workout (one cannot directly get that from Perl). # \x{01C4} is LATIN CAPITAL LETTER DZ WITH CARON # \x{01C5} is LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON # \x{01C6} is LATIN SMALL LETTER DZ WITH CARON @@ -161,6 +161,5 @@ for my $a (0,1) { chop $a; $a =~ s/^(\s*)(\w*)/$1\u$2/; is($a, v10, "[perl #18857]"); - $test++; } } diff --git a/toke.c b/toke.c index b4222f6..5d5abf4 100644 --- a/toke.c +++ b/toke.c @@ -1704,7 +1704,7 @@ S_scan_const(pTHX_ char *start) UV uv = utf8_to_uvchr((U8*)str, 0); if (uv < 0x100) { - U8 tmpbuf[UTF8_MAXLEN+1], *d; + U8 tmpbuf[UTF8_MAXBYTES+1], *d; d = uvchr_to_utf8(tmpbuf, UNI_TO_NATIVE(uv)); sv_setpvn(res, (char *)tmpbuf, d - tmpbuf); @@ -7137,7 +7137,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) I32 brackets = 1; /* bracket nesting level */ bool has_utf8 = FALSE; /* is there any utf8 content? */ I32 termcode; /* terminating char. code */ - U8 termstr[UTF8_MAXLEN]; /* terminating string */ + U8 termstr[UTF8_MAXBYTES]; /* terminating string */ STRLEN termlen; /* length of terminating string */ char *last = NULL; /* last position for nesting bracket */ @@ -8230,7 +8230,7 @@ Perl_scan_vstring(pTHX_ char *s, SV *sv) if (!isALPHA(*pos)) { UV rev; - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; U8 *tmpend; if (*s == 'v') s++; /* get past 'v' */ diff --git a/utf8.c b/utf8.c index 5c6f271..5fa2aab 100644 --- a/utf8.c +++ b/utf8.c @@ -38,7 +38,7 @@ within non-zero characters. =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags Adds the UTF-8 representation of the Unicode codepoint C to the end -of the string C; C should be have at least C free +of the string C; C should be have at least C free bytes available. The return value is the pointer to the byte after the end of the new character. In other words, @@ -551,7 +551,7 @@ returned and retlen is set, if possible, to -1. UV Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen) { - return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, + return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } @@ -575,7 +575,7 @@ UV Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen) { /* Call the low level routine asking for checks */ - return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, + return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } @@ -937,7 +937,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) bool Perl_is_uni_alnum(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_alnum(tmpbuf); } @@ -945,7 +945,7 @@ Perl_is_uni_alnum(pTHX_ UV c) bool Perl_is_uni_alnumc(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_alnumc(tmpbuf); } @@ -953,7 +953,7 @@ Perl_is_uni_alnumc(pTHX_ UV c) bool Perl_is_uni_idfirst(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_idfirst(tmpbuf); } @@ -961,7 +961,7 @@ Perl_is_uni_idfirst(pTHX_ UV c) bool Perl_is_uni_alpha(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_alpha(tmpbuf); } @@ -969,7 +969,7 @@ Perl_is_uni_alpha(pTHX_ UV c) bool Perl_is_uni_ascii(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_ascii(tmpbuf); } @@ -977,7 +977,7 @@ Perl_is_uni_ascii(pTHX_ UV c) bool Perl_is_uni_space(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_space(tmpbuf); } @@ -985,7 +985,7 @@ Perl_is_uni_space(pTHX_ UV c) bool Perl_is_uni_digit(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_digit(tmpbuf); } @@ -993,7 +993,7 @@ Perl_is_uni_digit(pTHX_ UV c) bool Perl_is_uni_upper(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_upper(tmpbuf); } @@ -1001,7 +1001,7 @@ Perl_is_uni_upper(pTHX_ UV c) bool Perl_is_uni_lower(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_lower(tmpbuf); } @@ -1009,7 +1009,7 @@ Perl_is_uni_lower(pTHX_ UV c) bool Perl_is_uni_cntrl(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_cntrl(tmpbuf); } @@ -1017,7 +1017,7 @@ Perl_is_uni_cntrl(pTHX_ UV c) bool Perl_is_uni_graph(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_graph(tmpbuf); } @@ -1025,7 +1025,7 @@ Perl_is_uni_graph(pTHX_ UV c) bool Perl_is_uni_print(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_print(tmpbuf); } @@ -1033,7 +1033,7 @@ Perl_is_uni_print(pTHX_ UV c) bool Perl_is_uni_punct(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_punct(tmpbuf); } @@ -1041,7 +1041,7 @@ Perl_is_uni_punct(pTHX_ UV c) bool Perl_is_uni_xdigit(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; uvchr_to_utf8(tmpbuf, c); return is_utf8_xdigit(tmpbuf); } @@ -1166,7 +1166,7 @@ Perl_to_uni_upper_lc(pTHX_ U32 c) /* XXX returns only the first character -- do not use XXX */ /* XXX no locale support yet */ STRLEN len; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; return (U32)to_uni_upper(c, tmpbuf, &len); } @@ -1176,7 +1176,7 @@ Perl_to_uni_title_lc(pTHX_ U32 c) /* XXX returns only the first character XXX -- do not use XXX */ /* XXX no locale support yet */ STRLEN len; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; return (U32)to_uni_title(c, tmpbuf, &len); } @@ -1186,7 +1186,7 @@ Perl_to_uni_lower_lc(pTHX_ U32 c) /* XXX returns only the first character -- do not use XXX */ /* XXX no locale support yet */ STRLEN len; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; return (U32)to_uni_lower(c, tmpbuf, &len); } @@ -1400,7 +1400,7 @@ UV Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special) { UV uv0, uv1; - U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; STRLEN len = 0; uv0 = utf8_to_uvchr(p, 0); @@ -1489,9 +1489,8 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma Convert the UTF-8 encoded character at p to its uppercase version and store that in UTF-8 in ustrp and its length in bytes in lenp. Note -that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the -uppercase version may be longer than the original character (up to two -characters). +that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since +the uppercase version may be longer than the original character. The first character of the uppercased version is returned (but note, as explained above, that there may be more.) @@ -1510,9 +1509,8 @@ Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) Convert the UTF-8 encoded character at p to its titlecase version and store that in UTF-8 in ustrp and its length in bytes in lenp. Note -that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the -titlecase version may be longer than the original character (up to two -characters). +that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the +titlecase version may be longer than the original character. The first character of the titlecased version is returned (but note, as explained above, that there may be more.) @@ -1531,9 +1529,8 @@ Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) Convert the UTF-8 encoded character at p to its lowercase version and store that in UTF-8 in ustrp and its length in bytes in lenp. Note -that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the -lowercase version may be longer than the original character (up to two -characters). +that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the +lowercase version may be longer than the original character. The first character of the lowercased version is returned (but note, as explained above, that there may be more.) @@ -1552,7 +1549,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) Convert the UTF-8 encoded character at p to its foldcase version and store that in UTF-8 in ustrp and its length in bytes in lenp. Note -that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the +that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the foldcase version may be longer than the original character (up to three characters). @@ -1711,7 +1708,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8) /* We use utf8n_to_uvuni() as we want an index into Unicode tables, not a native character number. */ - UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0, + UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); SV *errsv_save; @@ -1778,7 +1775,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8) =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv Adds the UTF-8 representation of the Native codepoint C to the end -of the string C; C should be have at least C free +of the string C; C should be have at least C free bytes available. The return value is the pointer to the byte after the end of the new character. In other words, @@ -1954,8 +1951,8 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const register U8 *e1 = 0, *f1 = 0, *q1 = 0; register U8 *e2 = 0, *f2 = 0, *q2 = 0; STRLEN n1 = 0, n2 = 0; - U8 foldbuf1[UTF8_MAXLEN_FOLD+1]; - U8 foldbuf2[UTF8_MAXLEN_FOLD+1]; + U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; + U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; U8 natbuf[1+1]; STRLEN foldlen1, foldlen2; bool match; diff --git a/utf8.h b/utf8.h index 376280d..c206b3d 100644 --- a/utf8.h +++ b/utf8.h @@ -162,14 +162,26 @@ encoded character. #define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1) #define isALNUM_lazy(p) isALNUM_lazy_if(p,1) -/* how wide can a single UTF-8 encoded character become */ -#define UTF8_MAXLEN 13 -/* how wide a character can become when upper/lowercased */ -#define UTF8_MAXLEN_UCLC_MULT 3 -#define UTF8_MAXLEN_UCLC (UTF8_MAXLEN*UTF8_MAXLEN_UCLC_MULT) -/* how wide a character can become when casefolded */ -#define UTF8_MAXLEN_FOLD_MULT 3 -#define UTF8_MAXLEN_FOLD (UTF8_MAXLEN*UTF8_MAXLEN_FOLD_MULT) +#define UTF8_MAXBYTES 13 +/* How wide can a single UTF-8 encoded character become in bytes. + * NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 + * since UTF-8 is an encoding of Unicode and given Unicode's current + * upper limit only four bytes is possible. Perl thinks of UTF-8 + * as a way to encode non-negative integers in a binary format. */ +#define UTF8_MAXLEN UTF8_MAXBYTES + +#define UTF8_MAXLEN_UCLC 3 /* Obsolete, do not use. */ +#define UTF8_MAXLEN_UCLC_MULT 39 /* Obsolete, do not use. */ +#define UTF8_MAXLEN_FOLD 3 /* Obsolete, do not use. */ +#define UTF8_MAXLEN_FOLD_MULT 39 /* Obsolete, do not use. */ + +/* The maximum number of UTF-8 bytes a single Unicode character can + * uppercase/lowercase/fold into; this number depends on the Unicode + * version. An example of maximal expansion is the U+03B0 which + * uppercases to U+03C5 U+0308 U+0301. The Unicode databases that + * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and + * SpecialCasing.txt. */ +#define UTF8_MAXBYTES_CASE 6 #define IN_BYTES (PL_curcop->op_private & HINT_BYTES) #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)