s = (U8*)SvPV(sv, len);
send = s + len;
- /* First, take care of non-UTF8 input strings, because they're easy */
+ /* First, take care of non-UTF-8 input strings, because they're easy */
if (!SvUTF8(sv)) {
while (s < send) {
if ((ch = tbl[*s]) >= 0) {
I32 hek_len; /* length of hash key */
char hek_key[1]; /* variable-length hash key */
/* the hash-key is \0-terminated */
- /* after the \0 there is a byte for flags, such as whether the key is
- UTF8 */
+ /* after the \0 there is a byte for flags, such as whether the key
+ is UTF-8 */
};
/* hash structure: */
=item *
-C<pack('U0a*', ...)> can now be used to force a string to UTF8.
+C<pack('U0a*', ...)> can now be used to force a string to UTF-8.
=item *
Concatenation with the C<.> operator or via variable interpolation,
C<eq>, C<substr>, C<reverse>, C<quotemeta>, the C<x> operator,
-substitution with C<s///>, single-quoted UTF8, should now work--in
+substitution with C<s///>, single-quoted UTF-8, should now work--in
theory.
=item *
=item *
-C<pack('U0a*', ...)> can now be used to force a string to UTF8.
+C<pack('U0a*', ...)> can now be used to force a string to UTF-8.
=item *
Concatenation with the C<.> operator or via variable interpolation,
C<eq>, C<substr>, C<reverse>, C<quotemeta>, the C<x> operator,
-substitution with C<s///>, single-quoted UTF8, should now work.
+substitution with C<s///>, single-quoted UTF-8, should now work.
=item *
=item SvPOK_only
Tells an SV that it is a string and disables all other OK bits.
-Will also turn off the UTF8 status.
+Will also turn off the UTF-8 status.
void SvPOK_only(SV* sv)
=item SvPOK_only_UTF8
Tells an SV that it is a string and disables all other OK bits,
-and leaves the UTF8 status as it was.
+and leaves the UTF-8 status as it was.
void SvPOK_only_UTF8(SV* sv)
=item SvUTF8_off
-Unsets the UTF8 status of an SV.
+Unsets the UTF-8 status of an SV.
void SvUTF8_off(SV *sv)
=item SvUTF8_on
-Turn on the UTF8 status of an SV (the data is not changed, just the flag).
+Turn on the UTF-8 status of an SV (the data is not changed, just the flag).
Do not use frivolously.
void SvUTF8_on(SV *sv)
=item sv_2pvbyte
Return a pointer to the byte-encoded representation of the SV, and set *lp
-to its length. May cause the SV to be downgraded from UTF8 as a
+to its length. May cause the SV to be downgraded from UTF-8 as a
side-effect.
Usually accessed via the C<SvPVbyte> macro.
=item sv_2pvbyte_nolen
Return a pointer to the byte-encoded representation of the SV.
-May cause the SV to be downgraded from UTF8 as a side-effect.
+May cause the SV to be downgraded from UTF-8 as a side-effect.
Usually accessed via the C<SvPVbyte_nolen> macro.
=item sv_2pvutf8
-Return a pointer to the UTF8-encoded representation of the SV, and set *lp
-to its length. May cause the SV to be upgraded to UTF8 as a side-effect.
+Return a pointer to the UTF-8-encoded representation of the SV, and set *lp
+to its length. May cause the SV to be upgraded to UTF-8 as a side-effect.
Usually accessed via the C<SvPVutf8> macro.
=item sv_2pvutf8_nolen
-Return a pointer to the UTF8-encoded representation of the SV.
-May cause the SV to be upgraded to UTF8 as a side-effect.
+Return a pointer to the UTF-8-encoded representation of the SV.
+May cause the SV to be upgraded to UTF-8 as a side-effect.
Usually accessed via the C<SvPVutf8_nolen> macro.
=item sv_catpv
Concatenates the string onto the end of the string which is in the SV.
-If the SV has the UTF8 status set, then the bytes appended should be
-valid UTF8. Handles 'get' magic, but not 'set' magic. See C<sv_catpv_mg>.
+If the SV has the UTF-8 status set, then the bytes appended should be
+valid UTF-8. Handles 'get' magic, but not 'set' magic. See C<sv_catpv_mg>.
void sv_catpv(SV* sv, const char* ptr)
=item sv_catpvn
Concatenates the string onto the end of the string which is in the SV. The
-C<len> indicates number of bytes to copy. If the SV has the UTF8
-status set, then the bytes appended should be valid UTF8.
+C<len> indicates number of bytes to copy. If the SV has the UTF-8
+status set, then the bytes appended should be valid UTF-8.
Handles 'get' magic, but not 'set' magic. See C<sv_catpvn_mg>.
void sv_catpvn(SV* sv, const char* ptr, STRLEN len)
=item sv_catpvn_flags
Concatenates the string onto the end of the string which is in the SV. The
-C<len> indicates number of bytes to copy. If the SV has the UTF8
-status set, then the bytes appended should be valid UTF8.
+C<len> indicates number of bytes to copy. If the SV has the UTF-8
+status set, then the bytes appended should be valid UTF-8.
If C<flags> has C<SV_GMAGIC> bit set, will C<mg_get> on C<dsv> if
appropriate, else not. C<sv_catpvn> and C<sv_catpvn_nomg> are implemented
in terms of this function.
=item sv_len_utf8
Returns the number of characters in the string in an SV, counting wide
-UTF8 bytes as a single character. Handles magic and type coercion.
+UTF-8 bytes as a single character. Handles magic and type coercion.
STRLEN sv_len_utf8(SV* sv)
=item sv_pos_b2u
Converts the value pointed to by offsetp from a count of bytes from the
-start of the string, to a count of the equivalent number of UTF8 chars.
+start of the string, to a count of the equivalent number of UTF-8 chars.
Handles magic and type coercion.
void sv_pos_b2u(SV* sv, I32* offsetp)
=item sv_pos_u2b
-Converts the value pointed to by offsetp from a count of UTF8 chars from
+Converts the value pointed to by offsetp from a count of UTF-8 chars from
the start of the string, to a count of the equivalent number of bytes; if
lenp is non-zero, it does the same to lenp, but this time starting from
the offset, rather than from the start of the string. Handles magic and
=item sv_utf8_downgrade
-Attempt to convert the PV of an SV from UTF8-encoded to byte encoding.
+Attempt to convert the PV of an SV from UTF-8-encoded to byte encoding.
This may not be possible if the PV contains non-byte encoding characters;
if this is the case, either returns false or, if C<fail_ok> is not
true, croaks.
=item sv_utf8_encode
-Convert the PV of an SV to UTF8-encoded, but then turn off the C<SvUTF8>
+Convert the PV of an SV to UTF-8-encoded, but then turn off the C<SvUTF8>
flag so that it looks like octets again. Used as a building block
for encode_utf8 in Encode.xs
=item sv_utf8_upgrade
-Convert the PV of an SV to its UTF8-encoded form.
+Convert the PV of an SV to its UTF-8-encoded form.
Forces the SV to string form if it is not already.
Always sets the SvUTF8 flag to avoid future validity checks even
if all the bytes have hibit clear.
=item sv_utf8_upgrade_flags
-Convert the PV of an SV to its UTF8-encoded form.
+Convert the PV of an SV to its UTF-8-encoded form.
Forces the SV to string form if it is not already.
Always sets the SvUTF8 flag to avoid future validity checks even
if all the bytes have hibit clear. If C<flags> has C<SV_GMAGIC> bit set,
=item bytes_from_utf8
-Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
the newly-created string, and updates C<len> to contain the new
length. Returns the original string if no conversion occurs, C<len>
=item bytes_to_utf8
-Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
+Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
Returns a pointer to the newly-created string, and sets C<len> to
reflect the new length.
-If you want to convert to UTF8 from other encodings than ASCII,
+If you want to convert to UTF-8 from other encodings than ASCII,
see sv_recode_to_utf8().
NOTE: this function is experimental and may change or be
=item is_utf8_string
Returns true if first C<len> bytes of the given string form a valid
-UTF8 string, false otherwise. Note that 'a valid UTF8 string' does
-not mean 'a string that contains code points above 0x7F encoded in
-UTF8' because a valid ASCII string is a valid UTF8 string.
+UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
+not mean 'a string that contains code points above 0x7F encoded in UTF-8'
+because a valid ASCII string is a valid UTF-8 string.
bool is_utf8_string(U8 *s, STRLEN len)
=item utf8n_to_uvchr
Returns the native character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
Allows length and flags to be passed to low level routine.
Bottom level UTF-8 decode routine.
Returns the unicode code point value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
C<retlen> will be set to the length, in bytes, of that character.
-If C<s> does not point to a well-formed UTF8 character, the behaviour
+If C<s> does not point to a well-formed UTF-8 character, the behaviour
is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
it is assumed that the caller will raise a warning, and this function
will silently just set C<retlen> to C<-1> and return zero. If the
=item utf8_distance
-Returns the number of UTF8 characters between the UTF-8 pointers C<a>
+Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
and C<b>.
WARNING: use only if you *know* that the pointers point inside the
=item utf8_to_bytes
-Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
Unlike C<bytes_to_utf8>, this over-writes the original string, and
updates len to contain the new length.
Returns zero on failure, setting C<len> to -1.
=item utf8_to_uvchr
Returns the native character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
-If C<s> does not point to a well-formed UTF8 character, zero is
+If C<s> does not point to a well-formed UTF-8 character, zero is
returned and retlen is set, if possible, to -1.
UV utf8_to_uvchr(U8 *s, STRLEN* retlen)
=item utf8_to_uvuni
Returns the Unicode code point of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
This function should only be used when returned UV is considered
an index into the Unicode semantic tables (e.g. swashes).
-If C<s> does not point to a well-formed UTF8 character, zero is
+If C<s> does not point to a well-formed UTF-8 character, zero is
returned and retlen is set, if possible, to -1.
UV utf8_to_uvuni(U8 *s, STRLEN* retlen)
=item uvchr_to_utf8
-Adds the UTF8 representation of the Native codepoint C<uv> to the end
+Adds the UTF-8 representation of the Native codepoint C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
=item uvuni_to_utf8_flags
-Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
+Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
Returns the character represented by that NUMBER in the character set.
For example, C<chr(65)> is C<"A"> in either ASCII or Unicode, and
-chr(0x263a) is a Unicode smiley face. Note that characters from 127
-to 255 (inclusive) are by default not encoded in Unicode for backward
-compatibility reasons (but see L<encoding>).
+chr(0x263a) is a Unicode smiley face. Note that characters from 128
+to 255 (inclusive) are by default not encoded in UTF-8 Unicode for
+backward compatibility reasons (but see L<encoding>).
If NUMBER is omitted, uses C<$_>.
=item *
-If the pattern begins with a C<U>, the resulting string will be treated
-as Unicode-encoded. You can force UTF8 encoding on in a string with an
-initial C<U0>, and the bytes that follow will be interpreted as Unicode
-characters. If you don't want this to happen, you can begin your pattern
-with C<C0> (or anything else) to force Perl not to UTF8 encode your
-string, and then follow this with a C<U*> somewhere in your pattern.
+If the pattern begins with a C<U>, the resulting string will be
+treated as UTF-8-encoded Unicode. You can force UTF-8 encoding on in a
+string with an initial C<U0>, and the bytes that follow will be
+interpreted as Unicode characters. If you don't want this to happen,
+you can begin your pattern with C<C0> (or anything else) to force Perl
+not to UTF-8 encode your string, and then follow this with a C<U*>
+somewhere in your pattern.
=item *
to try to write off the beginning of the string (i.e. negative OFFSET).
The string should not contain any character with the value > 255 (which
-can only happen if you're using UTF8 encoding). If it does, it will be
-treated as something which is not UTF8 encoded. When the C<vec> was
+can only happen if you're using UTF-8 encoding). If it does, it will be
+treated as something which is not UTF-8 encoded. When the C<vec> was
assigned to, other parts of your program will also no longer consider the
-string to be UTF8 encoded. In other words, if you do have such characters
+string to be UTF-8 encoded. In other words, if you do have such characters
in your string, vec() will operate on the actual byte string, and not the
conceptual character string.
To fix this, some people formed Unicode, Inc. and
produced a new character set containing all the characters you can
possibly think of and more. There are several ways of representing these
-characters, and the one Perl uses is called UTF8. UTF8 uses
+characters, and the one Perl uses is called UTF-8. UTF-8 uses
a variable number of bytes to represent a character, instead of just
one. You can learn more about Unicode at http://www.unicode.org/
-=head2 How can I recognise a UTF8 string?
+=head2 How can I recognise a UTF-8 string?
-You can't. This is because UTF8 data is stored in bytes just like
-non-UTF8 data. The Unicode character 200, (C<0xC8> for you hex types)
+You can't. This is because UTF-8 data is stored in bytes just like
+non-UTF-8 data. The Unicode character 200, (C<0xC8> for you hex types)
capital E with a grave accent, is represented by the two bytes
C<v195.136>. Unfortunately, the non-Unicode string C<chr(195).chr(136)>
has that byte sequence as well. So you can't tell just by looking - this
is what makes Unicode input an interesting problem.
The API function C<is_utf8_string> can help; it'll tell you if a string
-contains only valid UTF8 characters. However, it can't do the work for
+contains only valid UTF-8 characters. However, it can't do the work for
you. On a character-by-character basis, C<is_utf8_char> will tell you
-whether the current character in a string is valid UTF8.
+whether the current character in a string is valid UTF-8.
-=head2 How does UTF8 represent Unicode characters?
+=head2 How does UTF-8 represent Unicode characters?
-As mentioned above, UTF8 uses a variable number of bytes to store a
+As mentioned above, UTF-8 uses a variable number of bytes to store a
character. Characters with values 0...127 are stored in one byte, just
like good ol' ASCII. Character 128 is stored as C<v194.128>; this
continues up to character 191, which is C<v194.191>. Now we've run out of
bits (191 is binary C<10111111>) so we move on; 192 is C<v195.128>. And
so it goes on, moving to three bytes at character 2048.
-Assuming you know you're dealing with a UTF8 string, you can find out
+Assuming you know you're dealing with a UTF-8 string, you can find out
how long the first character in it is with the C<UTF8SKIP> macro:
char *utf = "\305\233\340\240\201";
utf += len;
len = UTF8SKIP(utf); /* len is 3 here */
-Another way to skip over characters in a UTF8 string is to use
+Another way to skip over characters in a UTF-8 string is to use
C<utf8_hop>, which takes a string and a number of characters to skip
over. You're on your own about bounds checking, though, so don't use it
lightly.
-All bytes in a multi-byte UTF8 character will have the high bit set,
+All bytes in a multi-byte UTF-8 character will have the high bit set,
so you can test if you need to do something special with this
character like this (the UTF8_IS_INVARIANT() is a macro that tests
whether the byte can be encoded as a single byte even in UTF-8):
UV uv; /* Note: a UV, not a U8, not a char */
if (!UTF8_IS_INVARIANT(*utf))
- /* Must treat this as UTF8 */
+ /* Must treat this as UTF-8 */
uv = utf8_to_uv(utf);
else
/* OK to treat this character as a byte */
You can also see in that example that we use C<utf8_to_uv> to get the
value of the character; the inverse function C<uv_to_utf8> is available
-for putting a UV into UTF8:
+for putting a UV into UTF-8:
if (!UTF8_IS_INVARIANT(uv))
/* Must treat this as UTF8 */
*utf8++ = uv;
You B<must> convert characters to UVs using the above functions if
-you're ever in a situation where you have to match UTF8 and non-UTF8
-characters. You may not skip over UTF8 characters in this case. If you
-do this, you'll lose the ability to match hi-bit non-UTF8 characters;
-for instance, if your UTF8 string contains C<v196.172>, and you skip
-that character, you can never match a C<chr(200)> in a non-UTF8 string.
+you're ever in a situation where you have to match UTF-8 and non-UTF-8
+characters. You may not skip over UTF-8 characters in this case. If you
+do this, you'll lose the ability to match hi-bit non-UTF-8 characters;
+for instance, if your UTF-8 string contains C<v195.136>, and you skip
+that character, you can never match a C<chr(200)> in a non-UTF-8 string.
So don't do that!
-=head2 How does Perl store UTF8 strings?
+=head2 How does Perl store UTF-8 strings?
Currently, Perl deals with Unicode strings and non-Unicode strings
slightly differently. If a string has been identified as being UTF-8
undesirable results.
The problem comes when you have, for instance, a string that isn't
-flagged is UTF8, and contains a byte sequence that could be UTF8 -
-especially when combining non-UTF8 and UTF8 strings.
+flagged as UTF-8, and contains a byte sequence that could be UTF-8 -
+especially when combining non-UTF-8 and UTF-8 strings.
Never forget that the C<SVf_UTF8> flag is separate to the PV value; you
need be sure you don't accidentally knock it off while you're
The C<char*> string does not tell you the whole story, and you can't
copy or reconstruct an SV just by copying the string value. Check if the
-old SV has the UTF8 flag set, and act accordingly:
+old SV has the UTF-8 flag set, and act accordingly:
p = SvPV(sv, len);
frobnicate(p);
SvUTF8_on(nsv);
In fact, your C<frobnicate> function should be made aware of whether or
-not it's dealing with UTF8 data, so that it can handle the string
+not it's dealing with UTF-8 data, so that it can handle the string
appropriately.
Since just passing an SV to an XS function and copying the data of
-the SV is not enough to copy the UTF8 flags, even less right is just
+the SV is not enough to copy the UTF-8 flags, even less right is just
passing a C<char *> to an XS function.
-=head2 How do I convert a string to UTF8?
+=head2 How do I convert a string to UTF-8?
-If you're mixing UTF8 and non-UTF8 strings, you might find it necessary
-to upgrade one of the strings to UTF8. If you've got an SV, the easiest
+If you're mixing UTF-8 and non-UTF-8 strings, you might find it necessary
+to upgrade one of the strings to UTF-8. If you've got an SV, the easiest
way to do this is:
sv_utf8_upgrade(sv);
strings that came into the operator, and, while it shouldn't be noticeable
by the end user, it can cause problems.
-Instead, C<bytes_to_utf8> will give you a UTF8-encoded B<copy> of its
+Instead, C<bytes_to_utf8> will give you a UTF-8-encoded B<copy> of its
string argument. This is useful for having the data available for
comparisons and so on, without harming the original SV. There's also
C<utf8_to_bytes> to go the other way, but naturally, this will fail if
=item *
-There's no way to tell if a string is UTF8 or not. You can tell if an SV
-is UTF8 by looking at is C<SvUTF8> flag. Don't forget to set the flag if
-something should be UTF8. Treat the flag as part of the PV, even though
+There's no way to tell if a string is UTF-8 or not. You can tell if an SV
+is UTF-8 by looking at its C<SvUTF8> flag. Don't forget to set the flag if
+something should be UTF-8. Treat the flag as part of the PV, even though
it's not - if you pass on the PV to somewhere, pass on the flag too.
=item *
-If a string is UTF8, B<always> use C<utf8_to_uv> to get at the value,
+If a string is UTF-8, B<always> use C<utf8_to_uv> to get at the value,
unless C<UTF8_IS_INVARIANT(*s)> in which case you can use C<*s>.
=item *
-When writing a character C<uv> to a UTF8 string, B<always> use
+When writing a character C<uv> to a UTF-8 string, B<always> use
C<uv_to_utf8>, unless C<UTF8_IS_INVARIANT(uv)> in which case
you can use C<*s = uv>.
=item *
-Mixing UTF8 and non-UTF8 strings is tricky. Use C<bytes_to_utf8> to get
-a new string which is UTF8 encoded. There are tricks you can use to
-delay deciding whether you need to use a UTF8 string until you get to a
+Mixing UTF-8 and non-UTF-8 strings is tricky. Use C<bytes_to_utf8> to get
+a new string which is UTF-8 encoded. There are tricks you can use to
+delay deciding whether you need to use a UTF-8 string until you get to a
high character - C<HALF_UPGRADE> is one of those.
=back
Line 13 manipulates the flags; since we've changed the PV, any IV or NV
values will no longer be valid: if we have C<$a=10; $a.="6";> we don't
-want to use the old IV of 10. C<SvPOK_only_utf8> is a special UTF8-aware
+want to use the old IV of 10. C<SvPOK_only_UTF8> is a special UTF-8-aware
version of C<SvPOK_only>, a macro which turns off the IOK and NOK flags
and turns on POK. The final C<SvTAINT> is a macro which launders tainted
data if taint mode is turned on.
on and create a simple patch. Here's something Larry suggested: if a
C<U> is the first active format during a C<pack>, (for example,
C<pack "U3C8", @stuff>) then the resulting string should be treated as
-UTF8 encoded.
+UTF-8 encoded.
How do we prepare to fix this up? First we locate the code in question -
the C<pack> happens at runtime, so it's going to be in one of the F<pp>
while (pat < patend) {
Now if we see a C<U> which was at the start of the string, we turn on
-the UTF8 flag for the output SV, C<cat>:
+the C<UTF8> flag for the output SV, C<cat>:
+ if (datumtype == 'U' && pat==patcopy+1)
+ SvUTF8_on(cat);
=item *
If the pattern begins with a C<U>, the resulting string will be treated
- as Unicode-encoded. You can force UTF8 encoding on in a string with an
- initial C<U0>, and the bytes that follow will be interpreted as Unicode
- characters. If you don't want this to happen, you can begin your pattern
- with C<C0> (or anything else) to force Perl not to UTF8 encode your
+ as UTF-8-encoded Unicode. You can force UTF-8 encoding on in a string
+ with an initial C<U0>, and the bytes that follow will be interpreted as
+ Unicode characters. If you don't want this to happen, you can begin your
+ pattern with C<C0> (or anything else) to force Perl not to UTF-8 encode your
string, and then follow this with a C<U*> somewhere in your pattern.
All done. Now let's create the patch. F<Porting/patching.pod> tells us
=item "=encoding encodingname"
This command, which should occur early in the document (at least
-before any non-USASCII data!), declares that this document is
+before any non-US-ASCII data!), declares that this document is
encoded in the encoding I<encodingname>, which must be
an encoding name that L<Encoding> recognizes. (Encoding's list
of supported encodings, in L<Encoding::Supported>, is useful here.)
(e.g., if there is a "=encoding utf8" early in the document and
"=encoding big5" later). Pod processors that recognize BOMs
may also complain if they see an "=encoding" line
-that contradicts the BOM (e.g., if a document with a UTF16LE BOM
-has an "=encoding shiftjis" line).
+that contradicts the BOM (e.g., if a document with a UTF-16LE
+BOM has an "=encoding shiftjis" line).
=back
0xEF 0xBB 0xBF
=for comment
- If toke.c is modified to support UTF32, add mention of those here.
+ If toke.c is modified to support UTF-32, add mention of those here.
=item *
=item Are Perl regexes DFAs or NFAs? Are they POSIX compliant?
-=item What's wrong with using grep or map in a void context?
+=item What's wrong with using grep in a void context?
=item How can I match strings with multibyte characters?
FILEHANDLE, stat EXPR, stat, study SCALAR, study, sub NAME BLOCK, sub NAME
(PROTO) BLOCK, sub NAME : ATTRS BLOCK, sub NAME (PROTO) : ATTRS BLOCK,
substr EXPR,OFFSET,LENGTH,REPLACEMENT, substr EXPR,OFFSET,LENGTH, substr
-EXPR,OFFSET, symlink OLDFILE,NEWFILE, syscall LIST, sysopen
+EXPR,OFFSET, symlink OLDFILE,NEWFILE, syscall NUMBER, LIST, sysopen
FILEHANDLE,FILENAME,MODE, sysopen FILEHANDLE,FILENAME,MODE,PERMS, sysread
FILEHANDLE,SCALAR,LENGTH,OFFSET, sysread FILEHANDLE,SCALAR,LENGTH, sysseek
FILEHANDLE,POSITION,WHENCE, system LIST, system PROGRAM LIST, syswrite
C<=head1 I<Heading Text>>, C<=head2 I<Heading Text>>, C<=head3 I<Heading
Text>>, C<=head4 I<Heading Text>>, C<=over I<indentlevel>>, C<=item
I<stuff...>>, C<=back>, C<=cut>, C<=pod>, C<=begin I<formatname>>, C<=end
-I<formatname>>, C<=for I<formatname> I<text...>>
+I<formatname>>, C<=for I<formatname> I<text...>>, C<=encoding
+I<encodingname>>
=item Formatting Codes
=item Pod Commands
"=head1", "=head2", "=head3", "=head4", "=pod", "=cut", "=over", "=item",
-"=back", "=begin formatname", "=end formatname", "=for formatname text..."
+"=back", "=begin formatname", "=end formatname", "=for formatname text...",
+"=encoding encodingname"
=item Pod Formatting Codes
HANDLE->autoflush(EXPR), $OUTPUT_AUTOFLUSH, $|,
IO::Handle->output_field_separator EXPR, $OUTPUT_FIELD_SEPARATOR, $OFS, $,,
IO::Handle->output_record_separator EXPR, $OUTPUT_RECORD_SEPARATOR, $ORS,
-$\, $LIST_SEPARATOR, $", $SUBSCRIPT_SEPARATOR, $SUBSEP, $;, $OFMT, $#,
+$\, $LIST_SEPARATOR, $", $SUBSCRIPT_SEPARATOR, $SUBSEP, $;, $#,
HANDLE->format_page_number(EXPR), $FORMAT_PAGE_NUMBER, $%,
HANDLE->format_lines_per_page(EXPR), $FORMAT_LINES_PER_PAGE, $=,
HANDLE->format_lines_left(EXPR), $FORMAT_LINES_LEFT, $-, @LAST_MATCH_START,
=item What B<is> Unicode, anyway?
-=item How can I recognise a UTF8 string?
+=item How can I recognise a UTF-8 string?
-=item How does UTF8 represent Unicode characters?
+=item How does UTF-8 represent Unicode characters?
-=item How does Perl store UTF8 strings?
+=item How does Perl store UTF-8 strings?
-=item How do I convert a string to UTF8?
+=item How do I convert a string to UTF-8?
=item Is there anything else I need to know?
POPpbytex, POPpx, POPs, PUSHi, PUSHMARK, PUSHn, PUSHp, PUSHs, PUSHu,
PUTBACK, SP, SPAGAIN, XPUSHi, XPUSHn, XPUSHp, XPUSHs, XPUSHu, XSRETURN,
XSRETURN_IV, XSRETURN_NO, XSRETURN_NV, XSRETURN_PV, XSRETURN_UNDEF,
-XSRETURN_YES, XST_mIV, XST_mNO, XST_mNV, XST_mPV, XST_mUNDEF, XST_mYES
+XSRETURN_UV, XSRETURN_YES, XST_mIV, XST_mNO, XST_mNV, XST_mPV, XST_mUNDEF,
+XST_mYES
=item SV Flags
newSVpv, newSVpvf, newSVpvn, newSVpvn_share, newSVrv, newSVsv, newSVuv,
SvCUR, SvCUR_set, SvEND, SvGROW, SvIOK, SvIOKp, SvIOK_notUV, SvIOK_off,
SvIOK_on, SvIOK_only, SvIOK_only_UV, SvIOK_UV, SvIsCOW,
-SvIsCOW_shared_hash, SvIV, SvIVx, SvIVX, SvLEN, SvNIOK, SvNIOKp,
-SvNIOK_off, SvNOK, SvNOKp, SvNOK_off, SvNOK_on, SvNOK_only, SvNV, SvNVX,
-SvNVx, SvOK, SvOOK, SvPOK, SvPOKp, SvPOK_off, SvPOK_on, SvPOK_only,
+SvIsCOW_shared_hash, SvIV, SvIVX, SvIVx, SvLEN, SvNIOK, SvNIOKp,
+SvNIOK_off, SvNOK, SvNOKp, SvNOK_off, SvNOK_on, SvNOK_only, SvNV, SvNVx,
+SvNVX, SvOK, SvOOK, SvPOK, SvPOKp, SvPOK_off, SvPOK_on, SvPOK_only,
SvPOK_only_UTF8, SvPV, SvPVbyte, SvPVbytex, SvPVbytex_force,
SvPVbyte_force, SvPVbyte_nolen, SvPVutf8, SvPVutf8x, SvPVutf8x_force,
-SvPVutf8_force, SvPVutf8_nolen, SvPVx, SvPVX, SvPV_force, SvPV_force_nomg,
+SvPVutf8_force, SvPVutf8_nolen, SvPVX, SvPVx, SvPV_force, SvPV_force_nomg,
SvPV_nolen, SvREFCNT, SvREFCNT_dec, SvREFCNT_inc, SvROK, SvROK_off,
SvROK_on, SvRV, SvSTASH, SvTAINT, SvTAINTED, SvTAINTED_off, SvTAINTED_on,
SvTRUE, SvTYPE, SvUNLOCK, SvUOK, SvUPGRADE, SvUTF8, SvUTF8_off, SvUTF8_on,
=item Unicode Support
bytes_from_utf8, bytes_to_utf8, ibcmp_utf8, is_utf8_char, is_utf8_string,
-pv_uni_display, sv_cat_decode, sv_recode_to_utf8, sv_uni_display,
-to_utf8_case, to_utf8_fold, to_utf8_lower, to_utf8_title, to_utf8_upper,
-utf8n_to_uvchr, utf8n_to_uvuni, utf8_distance, utf8_hop, utf8_length,
-utf8_to_bytes, utf8_to_uvchr, utf8_to_uvuni, uvchr_to_utf8,
+is_utf8_string_loc, pv_uni_display, sv_cat_decode, sv_recode_to_utf8,
+sv_uni_display, to_utf8_case, to_utf8_fold, to_utf8_lower, to_utf8_title,
+to_utf8_upper, utf8n_to_uvchr, utf8n_to_uvuni, utf8_distance, utf8_hop,
+utf8_length, utf8_to_bytes, utf8_to_uvchr, utf8_to_uvuni, uvchr_to_utf8,
uvuni_to_utf8_flags
=item Variables created by C<xsubpp> and C<xsubpp> internal functions
=item Mmap for input
-=item Byte to/from UTF8 and UTF8 to/from local conversion
+=item Byte to/from UTF-8 and UTF-8 to/from local conversion
=item Add sockatmark support
=item The IBM ANSI C Compiler
+=item The usenm option
+
=item Using GNU's gcc for building perl
=item Using Large Files with Perl
=item Floating point anomalies on BS2000
+=item Using PerlIO and different encodings on ASCII and EBCDIC partitions
+
=back
=item AUTHORS
=back
+=item Starting From Scratch
+
=item AUTHOR
=item DATE
=back
+=item SunOS 4.x
+
=item AUTHOR
=item LAST MODIFIED
=item HISTORY
+=item CAVEATS
+
=item SEE ALSO
=back
=item DESCRIPTION
+=item LIMITATIONS
+
=item SEE ALSO
=back
=back
-=head2 ExtUtils::Miniperl, writemain - write the C code for perlmain.c
-
-=over 4
-
-=item SYNOPSIS
-
-=item DESCRIPTION
-
-=item SEE ALSO
-
-=back
-
=head2 ExtUtils::Mkbootstrap - make a bootstrap file for use by DynaLoader
=over 4
B<lock_hash>, B<unlock_hash>
+B<hash_seed>
+
=over 4
=item CAVEATS
=item DESCRIPTION
-Canonical notation, Input, Output
+Input, Output
=item METHODS
=back
-=head2 PerlIO::scalar - support module for in-memory IO.
+=head2 PerlIO::scalar - in-memory IO, scalar IO
=over 4
=item DESCRIPTION
+=item IMPLEMENTATION NOTE
+
=back
=head2 PerlIO::via - Helper class for PerlIO layers implemented in perl
=item DESCRIPTION
+=item BUGS
+
=back
=head2 Sys::Hostname - Try every conceivable way to get hostname
This would be useful for printing warnings, or data and regex dumping,
not_a_number(), and so on.
-Requirements: should handle both byte and UTF8 strings. isPRINT()
+Requirements: should handle both byte and UTF-8 strings. isPRINT()
characters printed as-is, character less than 256 as \xHH, Unicode
characters as \x{HHH}. Don't assume ASCII-like, either, get somebody
on EBCDIC to test the output.
Nick Ing-Simmons' C<perlio> supports an C<mmap> IO method.
-=head2 Byte to/from UTF8 and UTF8 to/from local conversion
+=head2 Byte to/from UTF-8 and UTF-8 to/from local conversion
C<Encode> provides this.
=item *
-UTF-16, UTF-16BE, UTF16-LE, Surrogates, and BOMs (Byte Order Marks)
+UTF-16, UTF-16BE, UTF-16LE, Surrogates, and BOMs (Byte Order Marks)
The following items are mostly for reference and general Unicode
knowledge; Perl doesn't use these constructs internally.
=item *
-UTF-32, UTF-32BE, UTF32-LE
+UTF-32, UTF-32BE, UTF-32LE
The UTF-32 family is pretty much like the UTF-16 family, except that
the units are 32-bit, and therefore the surrogate scheme is not
perl -MDevel::Peek -e 'Dump(chr(0x100))'
-That shows the UTF8 flag in FLAGS and both the UTF-8 bytes
+That shows the C<UTF8> flag in FLAGS and both the UTF-8 bytes
and Unicode characters in C<PV>. See also later in this document
the discussion about the C<utf8::is_utf8()> function.
as a single byte encoding. If the flag is on, the bytes in the scalar
are interpreted as the (multi-byte, variable-length) UTF-8 encoded code
points of the characters. Bytes added to a UTF-8 encoded string are
-automatically upgraded to UTF-8. If mixed non-UTF8 and UTF-8 scalars
+automatically upgraded to UTF-8. If mixed non-UTF-8 and UTF-8 scalars
are merged (double-quoted interpolation, explicit concatenation, and
printf/sprintf parameter substitution), the result will be UTF-8 encoded
as if copies of the byte strings were upgraded to UTF-8: for example,
if (TARG == right && right != left) {
right = sv_2mortal(newSVpvn(rpv, rlen));
- rpv = SvPV(right, rlen); /* no point setting UTF8 here */
+ rpv = SvPV(right, rlen); /* no point setting UTF-8 here */
rcopied = TRUE;
}
}
if ((fp_utf8 = PerlIO_isutf8(IoIFP(io))) && !IN_BYTES) {
buffer = SvPVutf8_force(bufsv, blen);
- /* UTF8 may not have been set if they are all low bytes */
+ /* UTF-8 may not have been set if they are all low bytes */
SvUTF8_on(bufsv);
}
else {
=for apidoc sv_2pvbyte_nolen
Return a pointer to the byte-encoded representation of the SV.
-May cause the SV to be downgraded from UTF8 as a side-effect.
+May cause the SV to be downgraded from UTF-8 as a side-effect.
Usually accessed via the C<SvPVbyte_nolen> macro.
=for apidoc sv_2pvbyte
Return a pointer to the byte-encoded representation of the SV, and set *lp
-to its length. May cause the SV to be downgraded from UTF8 as a
+to its length. May cause the SV to be downgraded from UTF-8 as a
side-effect.
Usually accessed via the C<SvPVbyte> macro.
/*
=for apidoc sv_2pvutf8_nolen
-Return a pointer to the UTF8-encoded representation of the SV.
-May cause the SV to be upgraded to UTF8 as a side-effect.
+Return a pointer to the UTF-8-encoded representation of the SV.
+May cause the SV to be upgraded to UTF-8 as a side-effect.
Usually accessed via the C<SvPVutf8_nolen> macro.
/*
=for apidoc sv_2pvutf8
-Return a pointer to the UTF8-encoded representation of the SV, and set *lp
-to its length. May cause the SV to be upgraded to UTF8 as a side-effect.
+Return a pointer to the UTF-8-encoded representation of the SV, and set *lp
+to its length. May cause the SV to be upgraded to UTF-8 as a side-effect.
Usually accessed via the C<SvPVutf8> macro.
/*
=for apidoc sv_utf8_upgrade
-Convert the PV of an SV to its UTF8-encoded form.
+Convert the PV of an SV to its UTF-8-encoded form.
Forces the SV to string form if it is not already.
Always sets the SvUTF8 flag to avoid future validity checks even
if all the bytes have hibit clear.
=for apidoc sv_utf8_upgrade_flags
-Convert the PV of an SV to its UTF8-encoded form.
+Convert the PV of an SV to its UTF-8-encoded form.
Forces the SV to string form if it is not already.
Always sets the SvUTF8 flag to avoid future validity checks even
if all the bytes have hibit clear. If C<flags> has C<SV_GMAGIC> bit set,
/*
=for apidoc sv_utf8_downgrade
-Attempt to convert the PV of an SV from UTF8-encoded to byte encoding.
+Attempt to convert the PV of an SV from UTF-8-encoded to byte encoding.
This may not be possible if the PV contains non-byte encoding characters;
if this is the case, either returns false or, if C<fail_ok> is not
true, croaks.
/*
=for apidoc sv_utf8_encode
-Convert the PV of an SV to UTF8-encoded, but then turn off the C<SvUTF8>
+Convert the PV of an SV to UTF-8-encoded, but then turn off the C<SvUTF8>
flag so that it looks like octets again. Used as a building block
for encode_utf8 in Encode.xs
=for apidoc sv_catpvn
Concatenates the string onto the end of the string which is in the SV. The
-C<len> indicates number of bytes to copy. If the SV has the UTF8
-status set, then the bytes appended should be valid UTF8.
+C<len> indicates number of bytes to copy. If the SV has the UTF-8
+status set, then the bytes appended should be valid UTF-8.
Handles 'get' magic, but not 'set' magic. See C<sv_catpvn_mg>.
=for apidoc sv_catpvn_flags
Concatenates the string onto the end of the string which is in the SV. The
-C<len> indicates number of bytes to copy. If the SV has the UTF8
-status set, then the bytes appended should be valid UTF8.
+C<len> indicates number of bytes to copy. If the SV has the UTF-8
+status set, then the bytes appended should be valid UTF-8.
If C<flags> has C<SV_GMAGIC> bit set, will C<mg_get> on C<dsv> if
appropriate, else not. C<sv_catpvn> and C<sv_catpvn_nomg> are implemented
in terms of this function.
=for apidoc sv_catpv
Concatenates the string onto the end of the string which is in the SV.
-If the SV has the UTF8 status set, then the bytes appended should be
-valid UTF8. Handles 'get' magic, but not 'set' magic. See C<sv_catpv_mg>.
+If the SV has the UTF-8 status set, then the bytes appended should be
+valid UTF-8. Handles 'get' magic, but not 'set' magic. See C<sv_catpv_mg>.
=cut */
=for apidoc sv_len_utf8
Returns the number of characters in the string in an SV, counting wide
-UTF8 bytes as a single character. Handles magic and type coercion.
+UTF-8 bytes as a single character. Handles magic and type coercion.
=cut
*/
/*
=for apidoc sv_pos_u2b
-Converts the value pointed to by offsetp from a count of UTF8 chars from
+Converts the value pointed to by offsetp from a count of UTF-8 chars from
the start of the string, to a count of the equivalent number of bytes; if
lenp is non-zero, it does the same to lenp, but this time starting from
the offset, rather than from the start of the string. Handles magic and
=for apidoc sv_pos_b2u
Converts the value pointed to by offsetp from a count of bytes from the
-start of the string, to a count of the equivalent number of UTF8 chars.
+start of the string, to a count of the equivalent number of UTF-8 chars.
Handles magic and type coercion.
=cut
=for apidoc Am|void|SvPOK_only|SV* sv
Tells an SV that it is a string and disables all other OK bits.
-Will also turn off the UTF8 status.
+Will also turn off the UTF-8 status.
=for apidoc Am|bool|SvVOK|SV* sv
Returns a boolean indicating whether the SV contains a v-string.
Returns a boolean indicating whether the SV contains UTF-8 encoded data.
=for apidoc Am|void|SvUTF8_on|SV *sv
-Turn on the UTF8 status of an SV (the data is not changed, just the flag).
+Turn on the UTF-8 status of an SV (the data is not changed, just the flag).
Do not use frivolously.
=for apidoc Am|void|SvUTF8_off|SV *sv
-Unsets the UTF8 status of an SV.
+Unsets the UTF-8 status of an SV.
=for apidoc Am|void|SvPOK_only_UTF8|SV* sv
Tells an SV that it is a string and disables all other OK bits,
-and leaves the UTF8 status as it was.
+and leaves the UTF-8 status as it was.
=cut
*/
=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
-Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
+Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
Returns true if first C<len> bytes of the given string form a valid
-UTF8 string, false otherwise. Note that 'a valid UTF8 string' does
-not mean 'a string that contains code points above 0x7F encoded in
-UTF8' because a valid ASCII string is a valid UTF8 string.
+UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
+not mean 'a string that contains code points above 0x7F encoded in UTF-8'
+because a valid ASCII string is a valid UTF-8 string.
=cut
*/
Bottom level UTF-8 decode routine.
Returns the Unicode code point value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
C<retlen> will be set to the length, in bytes, of that character.
-If C<s> does not point to a well-formed UTF8 character, the behaviour
+If C<s> does not point to a well-formed UTF-8 character, the behaviour
is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
it is assumed that the caller will raise a warning, and this function
will silently just set C<retlen> to C<-1> and return zero. If the
=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
Returns the native character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
-If C<s> does not point to a well-formed UTF8 character, zero is
+If C<s> does not point to a well-formed UTF-8 character, zero is
returned and retlen is set, if possible, to -1.
=cut
=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
Returns the Unicode code point of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
This function should only be used when returned UV is considered
an index into the Unicode semantic tables (e.g. swashes).
-If C<s> does not point to a well-formed UTF8 character, zero is
+If C<s> does not point to a well-formed UTF-8 character, zero is
returned and retlen is set, if possible, to -1.
=cut
/*
=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
-Returns the number of UTF8 characters between the UTF-8 pointers C<a>
+Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
and C<b>.
WARNING: use only if you *know* that the pointers point inside the
/*
=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
-Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
Unlike C<bytes_to_utf8>, this over-writes the original string, and
updates len to contain the new length.
Returns zero on failure, setting C<len> to -1.
U8 *d;
U8 *save = s;
- /* ensure valid UTF8 and chars < 256 before updating string */
+ /* ensure valid UTF-8 and chars < 256 before updating string */
for (send = s + *len; s < send; ) {
U8 c = *s++;
/*
=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
-Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
the newly-created string, and updates C<len> to contain the new
length. Returns the original string if no conversion occurs, C<len>
if (!*is_utf8)
return start;
- /* ensure valid UTF8 and chars < 256 before converting string */
+ /* ensure valid UTF-8 and chars < 256 before converting string */
for (send = s + *len; s < send;) {
U8 c = *s++;
if (!UTF8_IS_INVARIANT(c)) {
/*
=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
-Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
+Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
Returns a pointer to the newly-created string, and sets C<len> to
reflect the new length.
-If you want to convert to UTF8 from other encodings than ASCII,
+If you want to convert to UTF-8 from other encodings than ASCII,
see sv_recode_to_utf8().
=cut
if (klen == 0)
{
/* If char is invariant then swatch is for all the invariant chars
- * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
+ * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
*/
needents = UTF_CONTINUATION_MARK;
off = NATIVE_TO_UTF(ptr[klen]);
/*
=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
-Adds the UTF8 representation of the Native codepoint C<uv> to the end
+Adds the UTF-8 representation of the Native codepoint C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
Returns the native character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
Allows length and flags to be passed to low level routine.
*/
/* Use UTF-8 as the default script encoding?
- * Turning this on will break scripts having non-UTF8 binary
+ * Turning this on will break scripts having non-UTF-8 binary
* data (such as Latin-1) in string literals. */
#ifdef USE_UTF8_SCRIPTS
# define USE_UTF8_IN_NAMES (!IN_BYTES)
#define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1)
#define isALNUM_lazy(p) isALNUM_lazy_if(p,1)
-/* how wide can a single UTF8 encoded character become */
+/* how wide can a single UTF-8 encoded character become */
#define UTF8_MAXLEN 13
/* how wide a character can become when upper/lowercased */
#define UTF8_MAXLEN_UCLC_MULT 3
#define UTF8SKIP(s) PL_utf8skip[*(U8*)s]
-/* EBCDIC-happy ways of converting native code to UTF8 */
+/* EBCDIC-happy ways of converting native code to UTF-8 */
/* Native to iso-8859-1 */
#define NATIVE_TO_ASCII(ch) PL_e2a[(U8)(ch)]