From: Jarkko Hietaniemi Date: Tue, 11 Dec 2001 20:16:29 +0000 (+0000) Subject: More UTF-8 API docs. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d2cc3551ad7322839f752bb576bc76b9557f2445;p=p5sagit%2Fp5-mst-13.2.git More UTF-8 API docs. p4raw-id: //depot/perl@13630 --- diff --git a/embed.pl b/embed.pl index 383c305..639ba6c 100755 --- a/embed.pl +++ b/embed.pl @@ -1333,7 +1333,7 @@ Apd |HE* |hv_store_ent |HV* tb|SV* key|SV* val|U32 hash Apd |void |hv_undef |HV* tb Ap |I32 |ibcmp |const char* a|const char* b|I32 len Ap |I32 |ibcmp_locale |const char* a|const char* b|I32 len -Ap |I32 |ibcmp_utf8 |const char* a|bool ua|const char* b|bool ub|I32 len +Apd |I32 |ibcmp_utf8 |const char* a|bool ua|const char* b|bool ub|I32 len p |bool |ingroup |Gid_t testgid|Uid_t effective p |void |init_argv_symbols|int|char ** p |void |init_debugger @@ -1851,9 +1851,9 @@ Adp |UV |utf8n_to_uvchr |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags Adp |UV |utf8n_to_uvuni |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags Apd |U8* |uvchr_to_utf8 |U8 *d|UV uv Apd |U8* |uvuni_to_utf8 |U8 *d|UV uv -Ap |char* |pv_uni_display |SV *dsv|U8 *spv|STRLEN len \ +Apd |char* |pv_uni_display |SV *dsv|U8 *spv|STRLEN len \ |STRLEN pvlim|UV flags -Ap |char* |sv_uni_display |SV *dsv|SV *ssv|STRLEN pvlim|UV flags +Apd |char* |sv_uni_display |SV *dsv|SV *ssv|STRLEN pvlim|UV flags p |void |vivify_defelem |SV* sv p |void |vivify_ref |SV* sv|U32 to_what p |I32 |wait4pid |Pid_t pid|int* statusp|int flags diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 2ca1b21..6ac32f4 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -1108,6 +1108,23 @@ Undefines the hash. =for hackers Found in file hv.c +=item ibcmp_utf8 + +Return true if the strings s1 and s2 differ case-insensitively, false +if not (if they are equal case-insensitively). If u1 is true, the +string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true, +the string s2 is assumed to be in UTF-8-encoded Unicode. 
(If both u1 +and u2 are false, ibcmp() is called.) + +For case-insensitiveness, the "casefolding" of Unicode is used +instead of upper/lowercasing both the characters, see +http://www.unicode.org/unicode/reports/tr21/ (Case Mappings). + + I32 ibcmp_utf8(const char* a, bool ua, const char* b, bool ub, I32 len) + +=for hackers +Found in file utf8.c + =item isALNUM Returns a boolean indicating whether the C C<char> is an ASCII alphanumeric @@ -1404,6 +1421,17 @@ SV is B<not> incremented. =for hackers Found in file sv.c +=item newSV + +Create a new null SV, or if len > 0, create a new empty SVt_PV type SV +with an initial PV allocation of len+1. Normally accessed via the C<NEWSV> +macro. + + SV* newSV(STRLEN len) + +=for hackers +Found in file sv.c + =item NEWSV Creates a new SV. A non-zero C<len> parameter indicates the number of @@ -1417,17 +1445,6 @@ C<id> is an integer id between 0 and 1299 (used to identify leaks). =for hackers Found in file handy.h -=item newSV - -Create a new null SV, or if len > 0, create a new empty SVt_PV type SV -with an initial PV allocation of len+1. Normally accessed via the C<NEWSV> -macro. - - SV* newSV(STRLEN len) - -=for hackers -Found in file sv.c - =item newSViv Creates a new SV and copies an integer into it. The reference count for the @@ -1867,6 +1884,19 @@ See C<PUSHMARK> and L<perlcall> for other uses. =for hackers Found in file pp.h +=item pv_uni_display + +Build to the scalar dsv a displayable version of the string spv, +length len, the displayable version being at most pvlim bytes long +(if longer, the rest is truncated and "..." will be appended). +The flags argument is currently unused but available for future extensions. +The pointer to the PV of the dsv is returned. + + char* pv_uni_display(SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags) + +=for hackers +Found in file utf8.c + =item Renew The XSUB-writer's interface to the C C<realloc> function. @@ -2299,22 +2329,22 @@ version which guarantees to evaluate sv only once. 
=for hackers Found in file sv.h -=item SvIVX +=item SvIVx -Returns the raw value in the SV's IV slot, without checks or conversions. -Only use when you are sure SvIOK is true. See also C<SvIV()>. +Coerces the given SV to an integer and returns it. Guarantees to evaluate +sv only once. Use the more efficient C<SvIV> otherwise. - IV SvIVX(SV* sv) + IV SvIVx(SV* sv) =for hackers Found in file sv.h -=item SvIVx +=item SvIVX -Coerces the given SV to an integer and returns it. Guarantees to evaluate -sv only once. Use the more efficient C<SvIV> otherwise. +Returns the raw value in the SV's IV slot, without checks or conversions. +Only use when you are sure SvIOK is true. See also C<SvIV()>. - IV SvIVx(SV* sv) + IV SvIVX(SV* sv) =for hackers Found in file sv.h @@ -4035,6 +4065,19 @@ instead use an in-line version. =for hackers Found in file sv.c +=item sv_uni_display + +Build to the scalar dsv a displayable version of the scalar ssv, +the displayable version being at most pvlim bytes long +(if longer, the rest is truncated and "..." will be appended). +The flags argument is currently unused but available for future extensions. +The pointer to the PV of the dsv is returned. + + char* sv_uni_display(SV *dsv, SV *ssv, STRLEN pvlim, UV flags) + +=for hackers +Found in file utf8.c + =item sv_unmagic Removes all magic of type C<type> from an SV. diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index e8a5fff..b1ffed5 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -876,6 +876,19 @@ utf8_hop(s, off) will return a pointer to an UTF-8 encoded buffer that is C<off> (positive or negative) Unicode characters displaced from the UTF-8 buffer C<s>. +=item * + +pv_uni_display(dsv, spv, len, pvlim, flags) and sv_uni_display(dsv, +ssv, pvlim, flags) are useful for debug output of Unicode strings and +scalars (only for debug: they display B<all> characters as hexadecimal +code points). + +=item * + +ibcmp_utf8(s1, u1, s2, u2, len) can be used to compare two strings +case-insensitively in Unicode. 
(For case-sensitive comparisons you +can just use memEQ() and memNE() as usual.) + =back For more information, see L<perlapi>, and F<utf8.c> and F<utf8.h> diff --git a/utf8.c b/utf8.c index 7da1e5b..30a4908 100644 --- a/utf8.c +++ b/utf8.c @@ -1524,6 +1524,16 @@ Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) return UNI_TO_NATIVE(uv); } +/* +=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags + +Build to the scalar dsv a displayable version of the string spv, +length len, the displayable version being at most pvlim bytes long +(if longer, the rest is truncated and "..." will be appended). +The flags argument is currently unused but available for future extensions. +The pointer to the PV of the dsv is returned. + +=cut */ char * Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags) { @@ -1546,6 +1556,16 @@ Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags) return SvPVX(dsv); } +/* +=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags + +Build to the scalar dsv a displayable version of the scalar ssv, +the displayable version being at most pvlim bytes long +(if longer, the rest is truncated and "..." will be appended). +The flags argument is currently unused but available for future extensions. +The pointer to the PV of the dsv is returned. + +=cut */ char * Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) { @@ -1553,47 +1573,65 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) pvlim, flags); } +/* +=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len + +Return true if the strings s1 and s2 differ case-insensitively, false +if not (if they are equal case-insensitively). If u1 is true, the +string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true, +the string s2 is assumed to be in UTF-8-encoded Unicode. (If both u1 +and u2 are false, ibcmp() is called.) 
+ +For case-insensitiveness, the "casefolding" of Unicode is used +instead of upper/lowercasing both the characters, see +http://www.unicode.org/unicode/reports/tr21/ (Case Mappings). + +=cut */ I32 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, const char *s2, bool u2, register I32 len) { - register U8 *a = (U8*)s1; - register U8 *b = (U8*)s2; - STRLEN la, lb; - UV ca, cb; - STRLEN ulen1, ulen2; - U8 tmpbuf1[UTF8_MAXLEN*3+1]; - U8 tmpbuf2[UTF8_MAXLEN*3+1]; - - while (len) { - if (u1) - ca = utf8_to_uvchr((U8*)a, &la); - else { - ca = *a; - la = 1; - } - if (u2) - cb = utf8_to_uvchr((U8*)b, &lb); - else { - cb = *b; - lb = 1; - } - if (ca != cb) { + if (u1 || u2) { + register U8 *a = (U8*)s1; + register U8 *b = (U8*)s2; + STRLEN la, lb; + UV ca, cb; + STRLEN ulen1, ulen2; + U8 tmpbuf1[UTF8_MAXLEN*3+1]; + U8 tmpbuf2[UTF8_MAXLEN*3+1]; + + while (len) { /* NOTE(review): len is never modified inside this loop, so the only exit is the "return 1" on a mismatch -- confirm whether len counts characters or bytes and decrement it accordingly */ if (u1) - to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1); - else - ulen1 = 1; + ca = utf8_to_uvchr((U8*)a, &la); + else { + ca = *a; + la = 1; + } if (u2) - to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2); - else - ulen2 = 1; - if (ulen1 != ulen2 - || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb]) - || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1)) - return 1; + cb = utf8_to_uvchr((U8*)b, &lb); + else { + cb = *b; + lb = 1; + } + if (ca != cb) { + if (u1) + to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1); + else + ulen1 = 1; + if (u2) + to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2); + else + ulen2 = 1; + if (ulen1 != ulen2 + || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb]) + || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1)) + return 1; + } + a += la; + b += lb; } - a += la; - b += lb; - } - return 0; + return 0; + } + else + return ibcmp(s1, s2, len); /* neither string is UTF-8: plain ibcmp(), which takes the length as its third argument */ }