From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Tue, 11 Dec 2001 20:16:29 +0000 (+0000)
Subject: More UTF-8 API docs.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d2cc3551ad7322839f752bb576bc76b9557f2445;p=p5sagit%2Fp5-mst-13.2.git

More UTF-8 API docs.

p4raw-id: //depot/perl@13630
---

diff --git a/embed.pl b/embed.pl
index 383c305..639ba6c 100755
--- a/embed.pl
+++ b/embed.pl
@@ -1333,7 +1333,7 @@ Apd	|HE*	|hv_store_ent	|HV* tb|SV* key|SV* val|U32 hash
 Apd	|void	|hv_undef	|HV* tb
 Ap	|I32	|ibcmp		|const char* a|const char* b|I32 len
 Ap	|I32	|ibcmp_locale	|const char* a|const char* b|I32 len
-Ap	|I32	|ibcmp_utf8	|const char* a|bool ua|const char* b|bool ub|I32 len
+Apd	|I32	|ibcmp_utf8	|const char* a|bool ua|const char* b|bool ub|I32 len
 p	|bool	|ingroup	|Gid_t testgid|Uid_t effective
 p	|void	|init_argv_symbols|int|char **
 p	|void	|init_debugger
@@ -1851,9 +1851,9 @@ Adp	|UV	|utf8n_to_uvchr	|U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
 Adp	|UV	|utf8n_to_uvuni	|U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
 Apd	|U8*	|uvchr_to_utf8	|U8 *d|UV uv
 Apd	|U8*	|uvuni_to_utf8	|U8 *d|UV uv
-Ap	|char*	|pv_uni_display	|SV *dsv|U8 *spv|STRLEN len \
+Apd	|char*	|pv_uni_display	|SV *dsv|U8 *spv|STRLEN len \
 				|STRLEN pvlim|UV flags
-Ap	|char*	|sv_uni_display	|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
+Apd	|char*	|sv_uni_display	|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
 p	|void	|vivify_defelem	|SV* sv
 p	|void	|vivify_ref	|SV* sv|U32 to_what
 p	|I32	|wait4pid	|Pid_t pid|int* statusp|int flags
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index 2ca1b21..6ac32f4 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -1108,6 +1108,23 @@ Undefines the hash.
 =for hackers
 Found in file hv.c
 
+=item ibcmp_utf8
+
+Return true if the strings s1 and s2 differ case-insensitively, false
+if not (if they are equal case-insensitively).  If u1 is true, the
+string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
+the string s2 is assumed to be in UTF-8-encoded Unicode.  (If both u1
+and u2 are false, ibcmp() is called.)
+
+For case-insensitiveness, the "casefolding" of Unicode is used
+instead of upper/lowercasing both the characters, see
+http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
+
+	I32	ibcmp_utf8(const char* a, bool ua, const char* b, bool ub, I32 len)
+
+=for hackers
+Found in file utf8.c
+
 =item isALNUM
 
 Returns a boolean indicating whether the C C<char> is an ASCII alphanumeric
@@ -1404,6 +1421,17 @@ SV is B<not> incremented.
 =for hackers
 Found in file sv.c
 
+=item newSV
+
+Create a new null SV, or if len > 0, create a new empty SVt_PV type SV
+with an initial PV allocation of len+1. Normally accessed via the C<NEWSV>
+macro.
+
+	SV*	newSV(STRLEN len)
+
+=for hackers
+Found in file sv.c
+
 =item NEWSV
 
 Creates a new SV.  A non-zero C<len> parameter indicates the number of
@@ -1417,17 +1445,6 @@ C<id> is an integer id between 0 and 1299 (used to identify leaks).
 =for hackers
 Found in file handy.h
 
-=item newSV
-
-Create a new null SV, or if len > 0, create a new empty SVt_PV type SV
-with an initial PV allocation of len+1. Normally accessed via the C<NEWSV>
-macro.
-
-	SV*	newSV(STRLEN len)
-
-=for hackers
-Found in file sv.c
-
 =item newSViv
 
 Creates a new SV and copies an integer into it.  The reference count for the
@@ -1867,6 +1884,19 @@ See C<PUSHMARK> and L<perlcall> for other uses.
 =for hackers
 Found in file pp.h
 
+=item pv_uni_display
+
+Build to the scalar dsv a displayable version of the string spv,
+length len, the displayable version being at most pvlim bytes long
+(if longer, the rest is truncated and "..." will be appended).
+The flags argument is currently unused but available for future extensions.
+The pointer to the PV of the dsv is returned.
+
+	char*	pv_uni_display(SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
+
+=for hackers
+Found in file utf8.c
+
 =item Renew
 
 The XSUB-writer's interface to the C C<realloc> function.
@@ -2299,22 +2329,22 @@ version which guarantees to evaluate sv only once.
 =for hackers
 Found in file sv.h
 
-=item SvIVX
+=item SvIVx
 
-Returns the raw value in the SV's IV slot, without checks or conversions.
-Only use when you are sure SvIOK is true. See also C<SvIV()>.
+Coerces the given SV to an integer and returns it. Guarantees to evaluate
+sv only once. Use the more efficient C<SvIV> otherwise.
 
-	IV	SvIVX(SV* sv)
+	IV	SvIVx(SV* sv)
 
 =for hackers
 Found in file sv.h
 
-=item SvIVx
+=item SvIVX
 
-Coerces the given SV to an integer and returns it. Guarantees to evaluate
-sv only once. Use the more efficient C<SvIV> otherwise.
+Returns the raw value in the SV's IV slot, without checks or conversions.
+Only use when you are sure SvIOK is true. See also C<SvIV()>.
 
-	IV	SvIVx(SV* sv)
+	IV	SvIVX(SV* sv)
 
 =for hackers
 Found in file sv.h
@@ -4035,6 +4065,19 @@ instead use an in-line version.
 =for hackers
 Found in file sv.c
 
+=item sv_uni_display
+
+Build to the scalar dsv a displayable version of the scalar ssv,
+the displayable version being at most pvlim bytes long
+(if longer, the rest is truncated and "..." will be appended).
+The flags argument is currently unused but available for future extensions.
+The pointer to the PV of the dsv is returned.
+
+	char*	sv_uni_display(SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
+
+=for hackers
+Found in file utf8.c
+
 =item sv_unmagic
 
 Removes all magic of type C<type> from an SV.
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index e8a5fff..b1ffed5 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -876,6 +876,19 @@ utf8_hop(s, off) will return a pointer to an UTF-8 encoded buffer that
 is C<off> (positive or negative) Unicode characters displaced from the
 UTF-8 buffer C<s>.
 
+=item *
+
+pv_uni_display(dsv, spv, len, pvlim, flags) and sv_uni_display(dsv,
+ssv, pvlim, flags) are useful for debug output of Unicode strings and
+scalars (only for debug: they display B<all> characters as hexadecimal
+code points).
+
+=item *
+
+ibcmp_utf8(s1, u1, s2, u2, len) can be used to compare two strings
+case-insensitively in Unicode.  (For case-sensitive comparisons you
+can just use memEQ() and memNE() as usual.)
+
 =back
 
 For more information, see L<perlapi>, and F<utf8.c> and F<utf8.h>
diff --git a/utf8.c b/utf8.c
index 7da1e5b..30a4908 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1524,6 +1524,16 @@ Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
     return UNI_TO_NATIVE(uv);
 }
 
+/*
+=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
+
+Build to the scalar dsv a displayable version of the string spv,
+length len, the displayable version being at most pvlim bytes long
+(if longer, the rest is truncated and "..." will be appended).
+The flags argument is currently unused but available for future extensions.
+The pointer to the PV of the dsv is returned.
+
+=cut */
 char *
 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
 {
@@ -1546,6 +1556,16 @@ Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
     return SvPVX(dsv);
 }
 
+/*
+=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
+
+Build to the scalar dsv a displayable version of the scalar ssv,
+the displayable version being at most pvlim bytes long
+(if longer, the rest is truncated and "..." will be appended).
+The flags argument is currently unused but available for future extensions.
+The pointer to the PV of the dsv is returned.
+
+=cut */
 char *
 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
 {
@@ -1553,47 +1573,65 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
 				pvlim, flags);
 }
 
+/*
+=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
+
+Return true if the strings s1 and s2 differ case-insensitively, false
+if not (if they are equal case-insensitively).  If u1 is true, the
+string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
+the string s2 is assumed to be in UTF-8-encoded Unicode.  (If both u1
+and u2 are false, ibcmp() is called.)
+
+For case-insensitiveness, the "casefolding" of Unicode is used
+instead of upper/lowercasing both the characters, see
+http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
+
+=cut */
 I32
 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, const char *s2, bool u2, register I32 len)
 {
-     register U8 *a = (U8*)s1;
-     register U8 *b = (U8*)s2;
-     STRLEN la, lb;
-     UV ca, cb;
-     STRLEN ulen1, ulen2;
-     U8 tmpbuf1[UTF8_MAXLEN*3+1];
-     U8 tmpbuf2[UTF8_MAXLEN*3+1];
-
-     while (len) {
-	  if (u1)
-	       ca = utf8_to_uvchr((U8*)a, &la);
-	  else {
-	       ca = *a;
-	       la = 1;
-	  }
-	  if (u2)
-	       cb = utf8_to_uvchr((U8*)b, &lb);
-	  else {
-	       cb = *b;
-	       lb = 1;
-	  }
-	  if (ca != cb) {
+     if (u1 || u2) {	/* at least one side is UTF-8: decode and compare per character */
+	  register U8 *a = (U8*)s1;
+	  register U8 *b = (U8*)s2;
+	  STRLEN la, lb;
+	  UV ca, cb;
+	  STRLEN ulen1, ulen2;
+	  U8 tmpbuf1[UTF8_MAXLEN*3+1];
+	  U8 tmpbuf2[UTF8_MAXLEN*3+1];
+	  
+	  while (len) {	/* NOTE(review): len is never modified inside this loop as shown -- confirm termination */
 	       if (u1)
-		    to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
-	       else
-		    ulen1 = 1;
+		    ca = utf8_to_uvchr((U8*)a, &la);
+	       else {
+		    ca = *a;
+		    la = 1;
+	       }
 	       if (u2)
-		    to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
-	       else
-		    ulen2 = 1;
-	       if (ulen1 != ulen2
-		   || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
-		   || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
-		    return 1;
+		    cb = utf8_to_uvchr((U8*)b, &lb);
+	       else {
+		    cb = *b;
+		    lb = 1;
+	       }
+	       if (ca != cb) {	/* fold only when the raw code points differ */
+		    if (u1)
+			 to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
+		    else
+			 ulen1 = 1;
+		    if (u2)
+			 to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
+		    else
+			 ulen2 = 1;
+		    if (ulen1 != ulen2
+			|| (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
+			|| memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
+			 return 1;
+	       }
+	       a += la;
+	       b += lb;
 	  }
-	  a += la;
-	  b += lb;
-    }
-    return 0;
+	  return 0;
+     }
+     else
+         return ibcmp(s1, s2, len);	/* neither side is UTF-8: defer to ibcmp(), passing the length through */
 }