From: Steve Peters Date: Fri, 19 Dec 2008 17:38:31 +0000 (-0600) Subject: Subject: PATCH 5.10 documentation X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=2bbc8d558d247c6ef91207a12a4650c0bc292dd6;p=p5sagit%2Fp5-mst-13.2.git Subject: PATCH 5.10 documentation From: karl williamson Date: Tue, 16 Dec 2008 16:00:34 -0700 Message-ID: <49483312.80804@khwilliamson.com> --- diff --git a/autodoc.pl b/autodoc.pl index f97af93..25fabf0 100644 --- a/autodoc.pl +++ b/autodoc.pl @@ -238,7 +238,30 @@ Note that all Perl API global variables must be referenced with the C prefix. Some macros are provided for compatibility with the older, unadorned names, but this support may be disabled in a future release. -The listing is alphabetical, case insensitive. +Perl was originally written to handle US-ASCII only (that is characters +whose ordinal numbers are in the range 0 - 127). +And documentation and comments may still use the term ASCII, when +sometimes in fact the entire range from 0 - 255 is meant. + +Note that Perl can be compiled and run under EBCDIC (See L) +or ASCII. Most of the documentation (and even comments in the code) +ignore the EBCDIC possibility. +For almost all purposes the differences are transparent. +As an example, under EBCDIC, +instead of UTF-8, UTF-EBCDIC is used to encode Unicode strings, and so +whenever this documentation refers to C +(and variants of that name, including in function names), +it also (essentially transparently) means C. +But the ordinals of characters differ between ASCII, EBCDIC, and +the UTF- encodings, and a string encoded in UTF-EBCDIC may occupy more bytes +than in UTF-8. + +Also, on some EBCDIC machines, functions that are documented as operating on +US-ASCII (or Basic Latin in Unicode terminology) may in fact operate on all +256 characters in the EBCDIC range, not just the subset corresponding to +US-ASCII. + +The listing below is alphabetical, case insensitive. _EOB_ diff --git a/embed.fnc b/embed.fnc index 9b2a2ad..033bb5b 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1257,6 +1257,7 @@ AmdbR |char* |sv_pv |NN SV *sv AmdbR |char* |sv_pvutf8 |NN SV *sv AmdbR |char* |sv_pvbyte |NN SV *sv Amdb |STRLEN |sv_utf8_upgrade|NN SV *sv +Amdb |STRLEN |sv_utf8_upgrade_nomg|NN SV *sv ApdM |bool |sv_utf8_downgrade|NN SV *const sv|const bool fail_ok Apd |void |sv_utf8_encode |NN SV *const sv ApdM |bool |sv_utf8_decode |NN SV *const sv diff --git a/global.sym b/global.sym index fe26578..9598d52 100644 --- a/global.sym +++ b/global.sym @@ -663,6 +663,7 @@ Perl_sv_pv Perl_sv_pvutf8 Perl_sv_pvbyte Perl_sv_utf8_upgrade +Perl_sv_utf8_upgrade_nomg Perl_sv_utf8_downgrade Perl_sv_utf8_encode Perl_sv_utf8_decode diff --git a/handy.h b/handy.h index f2eeadc..e82a644 100644 --- a/handy.h +++ b/handy.h @@ -412,33 +412,36 @@ C). =head1 Character classes =for apidoc Am|bool|isALNUM|char ch -Returns a boolean indicating whether the C C is an ASCII alphanumeric -character (including underscore) or digit. +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +alphanumeric character (including underscore) or digit. =for apidoc Am|bool|isALPHA|char ch -Returns a boolean indicating whether the C C is an ASCII alphabetic -character. +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +alphabetic character. =for apidoc Am|bool|isSPACE|char ch -Returns a boolean indicating whether the C C is whitespace. +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +whitespace. 
=for apidoc Am|bool|isDIGIT|char ch -Returns a boolean indicating whether the C C is an ASCII +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) digit. =for apidoc Am|bool|isUPPER|char ch -Returns a boolean indicating whether the C C is an uppercase -character. +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +uppercase character. =for apidoc Am|bool|isLOWER|char ch -Returns a boolean indicating whether the C C is a lowercase -character. +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +lowercase character. =for apidoc Am|char|toUPPER|char ch -Converts the specified character to uppercase. +Converts the specified character to uppercase. Characters outside the +US-ASCII (Basic Latin) range are viewed as not having any case. =for apidoc Am|char|toLOWER|char ch -Converts the specified character to lowercase. +Converts the specified character to lowercase. Characters outside the +US-ASCII (Basic Latin) range are viewed as not having any case. =cut */ diff --git a/pod/perlapi.pod b/pod/perlapi.pod index f9eda90..cf85505 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -22,7 +22,30 @@ Note that all Perl API global variables must be referenced with the C prefix. Some macros are provided for compatibility with the older, unadorned names, but this support may be disabled in a future release. -The listing is alphabetical, case insensitive. +Perl was originally written to handle US-ASCII only (that is characters +whose ordinal numbers are in the range 0 - 127). +And documentation and comments may still use the term ASCII, when +sometimes in fact the entire range from 0 - 256 is meant. + +Note that Perl can be compiled and run under EBCDIC (See L) +or ASCII. Most of the documentation (and even comments in the code) +ignore the EBCDIC possibility. +For almost all purposes the differences are transparent. +As an example, under EBCDIC, +instead of UTF-8, UTF-EBCDIC is used to encode Unicode strings, and so +whenever this documentation refers to C +(and variants of that name, including in function names), +it also (essentially transparently) means C. +But the ordinals of characters differ between ASCII, EBCDIC, and +the UTF- encodings, and a string encoded in UTF-EBCDIC may occupy more bytes +than in UTF-8. + +Also, on some EBCDIC machines, functions that are documented as operating on +US-ASCII (or Basic Latin in Unicode terminology) may in fact operate on all +256 characters in the EBCDIC range, not just the subset corresponding to +US-ASCII. + +The listing below is alphabetical, case insensitive. =head1 "Gimme" Values @@ -510,8 +533,8 @@ Found in file scope.h =item isALNUM X -Returns a boolean indicating whether the C C is an ASCII alphanumeric -character (including underscore) or digit. +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +alphanumeric character (including underscore) or digit. bool isALNUM(char ch) @@ -521,8 +544,8 @@ Found in file handy.h =item isALPHA X -Returns a boolean indicating whether the C C is an ASCII alphabetic -character. +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +alphabetic character. bool isALPHA(char ch) @@ -532,7 +555,7 @@ Found in file handy.h =item isDIGIT X -Returns a boolean indicating whether the C C is an ASCII +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) digit. 
bool isDIGIT(char ch) @@ -543,8 +566,8 @@ Found in file handy.h =item isLOWER X -Returns a boolean indicating whether the C C is a lowercase -character. +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +lowercase character. bool isLOWER(char ch) @@ -554,7 +577,8 @@ Found in file handy.h =item isSPACE X -Returns a boolean indicating whether the C C is whitespace. +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +whitespace. bool isSPACE(char ch) @@ -564,8 +588,8 @@ Found in file handy.h =item isUPPER X -Returns a boolean indicating whether the C C is an uppercase -character. +Returns a boolean indicating whether the C C is a US-ASCII (Basic Latin) +uppercase character. bool isUPPER(char ch) @@ -575,7 +599,8 @@ Found in file handy.h =item toLOWER X -Converts the specified character to lowercase. +Converts the specified character to lowercase. Characters outside the +US-ASCII (Basic Latin) range are viewed as not having any case. char toLOWER(char ch) @@ -585,7 +610,8 @@ Found in file handy.h =item toUPPER X -Converts the specified character to uppercase. +Converts the specified character to uppercase. Characters outside the +US-ASCII (Basic Latin) range are viewed as not having any case. char toUPPER(char ch) @@ -6443,7 +6469,8 @@ Found in file sv.c X Attempts to convert the PV of an SV from characters to bytes. -If the PV contains a character beyond byte, this conversion will fail; +If the PV contains a character that cannot fit +in a byte, this conversion will fail; in this case, either returns false or, if C is not true, croaks. @@ -6474,8 +6501,10 @@ X Converts the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. +Will C on C if appropriate. Always sets the SvUTF8 flag to avoid future validity checks even -if all the bytes have hibit clear. +if the whole string is the same in UTF-8 as not. +Returns the number of bytes in the converted string This is not as a general purpose byte encoding to Unicode interface: use the Encode extension for that. @@ -6491,8 +6520,10 @@ X Converts the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. Always sets the SvUTF8 flag to avoid future validity checks even -if all the bytes have hibit clear. If C has C bit set, -will C on C if appropriate, else not. C and +if all the bytes are invariant in UTF-8. If C has C bit set, +will C on C if appropriate, else not. +Returns the number of bytes in the converted string +C and C are implemented in terms of this function. This is not as a general purpose byte encoding to Unicode interface: @@ -6503,6 +6534,16 @@ use the Encode extension for that. =for hackers Found in file sv.c +=item sv_utf8_upgrade_nomg +X + +Like sv_utf8_upgrade, but doesn't do magic on C + + STRLEN sv_utf8_upgrade_nomg(SV *sv) + +=for hackers +Found in file sv.c + =item sv_vcatpvf X @@ -6592,12 +6633,13 @@ Found in file sv.c =item bytes_from_utf8 X -Converts a string C of length C from UTF-8 into byte encoding. +Converts a string C of length C from UTF-8 into native byte encoding. Unlike C but like C, returns a pointer to the newly-created string, and updates C to contain the new length. Returns the original string if no conversion occurs, C is unchanged. Do nothing if C points to 0. Sets C to -0 if C is converted or contains all 7bit characters. +0 if C is converted or consisted entirely of characters that are invariant +in utf8 (i.e., US-ASCII on non-EBCDIC machines). 
NOTE: this function is experimental and may change or be removed without notice. @@ -6610,11 +6652,14 @@ Found in file utf8.c =item bytes_to_utf8 X -Converts a string C of length C from ASCII into UTF-8 encoding. +Converts a string C of length C from the native encoding into UTF-8. Returns a pointer to the newly-created string, and sets C to reflect the new length. -If you want to convert to UTF-8 from other encodings than ASCII, +A NUL character will be written after the end of the string. + +If you want to convert to UTF-8 from encodings other than +the native (Latin1 or EBCDIC), see sv_recode_to_utf8(). NOTE: this function is experimental and may change or be @@ -6658,9 +6703,9 @@ Found in file utf8.c X Tests if some arbitrary number of bytes begins in a valid UTF-8 -character. Note that an INVARIANT (i.e. ASCII) character is a valid -UTF-8 character. The actual number of bytes in the UTF-8 character -will be returned if it is valid, otherwise 0. +character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines) +character is a valid UTF-8 character. The actual number of bytes in the UTF-8 +character will be returned if it is valid, otherwise 0. STRLEN is_utf8_char(const U8 *s) @@ -6965,7 +7010,7 @@ Found in file utf8.c =item utf8_to_bytes X -Converts a string C of length C from UTF-8 into byte encoding. +Converts a string C of length C from UTF-8 into native byte encoding. Unlike C, this over-writes the original string, and updates len to contain the new length. Returns zero on failure, setting C to -1. @@ -7002,7 +7047,7 @@ Returns the Unicode code point of the first character in the string C which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. -This function should only be used when returned UV is considered +This function should only be used when the returned UV is considered an index into the Unicode semantic tables (e.g. swashes). If C does not point to a well-formed UTF-8 character, zero is diff --git a/pod/perlebcdic.pod b/pod/perlebcdic.pod index 942526b..ca695a6 100644 --- a/pod/perlebcdic.pod +++ b/pod/perlebcdic.pod @@ -15,9 +15,10 @@ Portions that are still incomplete are marked with XXX. =head2 ASCII -The American Standard Code for Information Interchange is a set of +The American Standard Code for Information Interchange (ASCII or US-ASCII) is a +set of integers running from 0 to 127 (decimal) that imply character -interpretation by the display and other system(s) of computers. +interpretation by the display and other systems of computers. The range 0..127 can be covered by setting the bits in a 7-bit binary digit, hence the set is sometimes referred to as a "7-bit ASCII". ASCII was described by the American National Standards Institute @@ -69,6 +70,9 @@ identification numbers (CCSID numbers) or code page numbers. Leading zero digits in CCSID numbers within this document are insignificant. E.g. CCSID 0037 may be referred to as 37 in places. +Perl can be compiled on platforms that run any of three commonly used EBCDIC +character sets, listed below. + =head2 13 variant characters Among IBM EBCDIC character code sets there are 13 characters that @@ -77,6 +81,13 @@ are known as the 13 "variant" characters and are: \ [ ] { } ^ ~ ! # | $ @ ` +When Perl is compiled for a platform, it looks at some of these characters to +guess which EBCDIC character set the platform uses, and adapts itself +accordingly to that platform. 
If the platform uses a character set that is not +one of the three Perl knows about, Perl will either fail to compile, or +mistakenly and silently choose one of the three. +They are: + =head2 0037 Character code set ID 0037 is a mapping of the ASCII plus Latin-1 @@ -123,10 +134,7 @@ equal I or chr(101), and unpack("U", "A") would equal =item * -Many of the remaining seem to be related to case-insensitive matching: -for example, C<< /[\x{131}]/ >> (LATIN SMALL LETTER DOTLESS I) does -not match "I" case-insensitively, as it should under Unicode. -(The match succeeds in ASCII-derived platforms.) +Many of the remaining problems seem to be related to case-insensitive matching =item * @@ -137,15 +145,37 @@ supported under EBCDIC, likewise for the encoding pragma. =head2 Unicode and UTF -UTF is a Unicode Transformation Format. UTF-8 is a Unicode conforming -representation of the Unicode standard that looks very much like ASCII. -UTF-EBCDIC is an attempt to represent Unicode characters in an EBCDIC -transparent manner. +UTF stands for C. +UTF-8 is an encoding of Unicode into a sequence of 8-bit byte chunks, based on +ASCII and Latin-1. +The length of a sequence required to represent a Unicode code point +depends on the ordinal number of that code point, +with larger numbers requiring more bytes. +UTF-EBCDIC is like UTF-8, but based on EBCDIC. + +In UTF-8, the code points corresponding to the lowest 128 +ordinal numbers (0 - 127) are the same (or C) +in UTF-8 or not. They occupy one byte each. All other Unicode code points +require more than one byte to be represented in UTF-8. +With UTF-EBCDIC, the term C has a somewhat different meaning. +(First, note that this is very different from the L +mentioned above.) +In UTF-EBCDIC, an C character or code point +is one which takes up exactly one byte encoded, regardless +of whether or not the encoding changes its value +(which it most likely will). +(If you care, the EBCDIC invariants are those characters +which correspond to the the ASCII characters, plus those that correspond to +the C1 controls (80..9f on ASCII platforms).) +A string encoded in UTF-EBCDIC may be longer (but never shorter) than +one encoded in UTF-8. =head2 Using Encode Starting from Perl 5.8 you can use the standard new module Encode -to translate from EBCDIC to Latin-1 code points +to translate from EBCDIC to Latin-1 code points. +Encode knows about more EBCDIC character sets than Perl can currently +be compiled to run on. use Encode 'from_to'; @@ -181,9 +211,11 @@ you to use different encodings per IO channel. For example you may use open($f, ">:encoding(utf8)", "test.utf8"); print $f "Hello World!\n"; -to get two files containing "Hello World!\n" in ASCII, CP 37 EBCDIC, -ISO 8859-1 (Latin-1) (in this example identical to ASCII) respective -UTF-EBCDIC (in this example identical to normal EBCDIC). See the +to get four files containing "Hello World!\n" in ASCII, CP 37 EBCDIC, +ISO 8859-1 (Latin-1) (in this example identical to ASCII since only ASCII +characters were printed), and +UTF-EBCDIC (in this example identical to normal EBCDIC since only characters +that don't differ between EBCDIC and UTF-EBCDIC were printed). See the documentation of Encode::PerlIO for details. 
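Spelled out, that four-file example might look like the following sketch (run on an EBCDIC platform; the file names and the C<cp37> encoding label are illustrative and assume a code page 0037 system):

    open(my $ascii,  ">:encoding(ascii)",      "test.ascii");   # ASCII bytes
    open(my $cp37,   ">:encoding(cp37)",       "test.ebcdic");  # CP 37 EBCDIC bytes
    open(my $latin1, ">:encoding(iso-8859-1)", "test.latin1");  # Latin-1 bytes
    open(my $utf8,   ">:encoding(utf8)",       "test.utf8");    # UTF-EBCDIC bytes

    print $ascii  "Hello World!\n";
    print $cp37   "Hello World!\n";
    print $latin1 "Hello World!\n";
    print $utf8   "Hello World!\n";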
As the PerlIO layer uses raw IO (bytes) internally, all this totally @@ -618,11 +650,11 @@ However, it would be unwise to write tests such as: $is_ascii = "\r" ne chr(13); # WRONG $is_ascii = "\n" ne chr(10); # ILL ADVISED -Obviously the first of these will fail to distinguish most ASCII machines -from either a CCSID 0037, a 1047, or a POSIX-BC EBCDIC machine since "\r" eq +Obviously the first of these will fail to distinguish most ASCII platforms +from either a CCSID 0037, a 1047, or a POSIX-BC EBCDIC platform since "\r" eq chr(13) under all of those coded character sets. But note too that because "\n" is chr(13) and "\r" is chr(10) on the MacIntosh (which is an -ASCII machine) the second C<$is_ascii> test will lead to trouble there. +ASCII platform) the second C<$is_ascii> test will lead to trouble there. To determine whether or not perl was built under an EBCDIC code page you can use the Config module like so: @@ -702,15 +734,15 @@ The OS/390 and z/OS C run time libraries provide _atoe() and _etoa() functions. =head1 OPERATOR DIFFERENCES The C<..> range operator treats certain character ranges with -care on EBCDIC machines. For example the following array -will have twenty six elements on either an EBCDIC machine -or an ASCII machine: +care on EBCDIC platforms. For example the following array +will have twenty six elements on either an EBCDIC platform +or an ASCII platform: @alphabet = ('A'..'Z'); # $#alphabet == 25 The bitwise operators such as & ^ | may return different results when operating on string or character data in a perl program running -on an EBCDIC machine than when run on an ASCII machine. Here is +on an EBCDIC platform than when run on an ASCII platform. Here is an example adapted from the one in L: # EBCDIC-based examples @@ -722,7 +754,7 @@ an example adapted from the one in L: An interesting property of the 32 C0 control characters in the ASCII table is that they can "literally" be constructed as control characters in perl, e.g. C<(chr(0) eq "\c@")> -C<(chr(1) eq "\cA")>, and so on. Perl on EBCDIC machines has been +C<(chr(1) eq "\cA")>, and so on. Perl on EBCDIC platforms has been ported to take "\c@" to chr(0) and "\cA" to chr(1) as well, but the thirty three characters that result depend on which code page you are using. The table below uses the character names from the previous table @@ -732,9 +764,9 @@ s/HORIZONTAL/HORIZ./; s/DEVICE CONTROL/D.C./; s/SEPARATOR/SEP./; s/NEGATIVE ACKNOWLEDGE/NEG. ACK./;. The POSIX-BC and 1047 sets are identical throughout this range and differ from the 0037 set at only one spot (21 decimal). Note that the C character -may be generated by "\cJ" on ASCII machines but by "\cU" on 1047 or POSIX-BC -machines and cannot be generated as a C<"\c.letter."> control character on -0037 machines. Note also that "\c\\" maps to two characters +may be generated by "\cJ" on ASCII platforms but by "\cU" on 1047 or POSIX-BC +platforms and cannot be generated as a C<"\c.letter."> control character on +0037 platforms. Note also that "\c\\" maps to two characters not one. chr ord 8859-1 0037 1047 && POSIX-BC @@ -781,13 +813,13 @@ not one. =item chr() chr() must be given an EBCDIC code number argument to yield a desired -character return value on an EBCDIC machine. For example: +character return value on an EBCDIC platform. For example: $CAPITAL_LETTER_A = chr(193); =item ord() -ord() will return EBCDIC code number values on an EBCDIC machine. +ord() will return EBCDIC code number values on an EBCDIC platform. 
For example: $the_number_193 = ord("A"); @@ -830,7 +862,7 @@ further details. The formats that can convert characters to numbers and vice versa will be different from their ASCII counterparts when executed -on an EBCDIC machine. Examples include: +on an EBCDIC platform. Examples include: printf("%c%c%c",193,194,195); # prints ABC @@ -866,7 +898,7 @@ is a gap character from the alphabetic viewpoint. If you do want to match the alphabet gap characters in a single octet regular expression try matching the hex or octal code such -as C on EBCDIC or C on ASCII machines to +as C on EBCDIC or C on ASCII platforms to have your regular expression match C. Another construct to be wary of is the inappropriate use of hex or @@ -900,7 +932,7 @@ set of subs: The above would be adequate if the concern was only with numeric code points. However, the concern may be with characters rather than code points -and on an EBCDIC machine it may be desirable for constructs such as +and on an EBCDIC platform it may be desirable for constructs such as C to print out the expected message. One way to represent the above collection of character classification subs that is capable of working across the @@ -994,7 +1026,7 @@ output. One big difference between ASCII based character sets and EBCDIC ones are the relative positions of upper and lower case letters and the -letters compared to the digits. If sorted on an ASCII based machine the +letters compared to the digits. If sorted on an ASCII based platform the two letter abbreviation for a physician comes before the two letter for drive, that is: @@ -1004,15 +1036,15 @@ for drive, that is: The property of lower case before uppercase letters in EBCDIC is even carried to the Latin 1 EBCDIC pages such as 0037 and 1047. An example would be that E C (203) comes -before E C (235) on an ASCII machine, but -the latter (83) comes before the former (115) on an EBCDIC machine. +before E C (235) on an ASCII platform, but +the latter (83) comes before the former (115) on an EBCDIC platform. (Astute readers will note that the upper case version of E C is simply "SS" and that the upper case version of E C is not in the 0..255 range but it is at U+x0178 in Unicode, or C<"\x{178}"> in a Unicode enabled Perl). The sort order will cause differences between results obtained on -ASCII machines versus EBCDIC machines. What follows are some suggestions +ASCII platforms versus EBCDIC platforms. What follows are some suggestions on how to deal with these differences. =head2 Ignore ASCII vs. EBCDIC sort differences. @@ -1035,7 +1067,7 @@ and include Latin-1 characters then apply: then sort(). Do note however that such Latin-1 manipulation does not address the E C character that will remain at -code point 255 on ASCII machines, but 223 on most EBCDIC machines +code point 255 on ASCII platforms, but 223 on most EBCDIC platforms where it will sort to a place less than the EBCDIC numerals. With a Unicode enabled Perl you might try: @@ -1049,7 +1081,7 @@ of the data and may not be acceptable for that reason. This is the most expensive proposition that does not employ a network connection. -=head2 Perform sorting on one type of machine only. +=head2 Perform sorting on one type of platform only. This strategy can employ a network connection. As such it would be computationally expensive. 
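The translation approach sketched above can also be written with Encode rather than with hand-built tr/// tables. A sketch, assuming an EBCDIC platform using code page 1047 and a hypothetical C<@input> array of native strings:

    use Encode 'from_to';

    sub by_ascii_order {
        my ($x, $y) = ($a, $b);               # work on copies, leave the originals alone
        from_to($x, 'cp1047', 'iso-8859-1');  # re-encode so cmp sees Latin-1/ASCII order
        from_to($y, 'cp1047', 'iso-8859-1');
        # characters with no Latin-1 mapping would need extra handling
        $x cmp $y;
    }

    my @sorted = sort by_ascii_order @input;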
@@ -1180,13 +1212,13 @@ that the @e2a array is filled in appropriately: =head2 Quoted-Printable encoding and decoding -On ASCII encoded machines it is possible to strip characters outside of +On ASCII encoded platforms it is possible to strip characters outside of the printable set using: # This QP encoder works on ASCII only $qp_string =~ s/([=\x00-\x1F\x80-\xFF])/sprintf("=%02X",ord($1))/ge; -Whereas a QP encoder that works on both ASCII and EBCDIC machines +Whereas a QP encoder that works on both ASCII and EBCDIC platforms would look somewhat like the following (where the EBCDIC branch @e2a array is omitted for brevity): @@ -1211,7 +1243,7 @@ Such QP strings can be decoded with: $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr hex $1/ge; $string =~ s/=[\n\r]+$//; -Whereas a QP decoder that works on both ASCII and EBCDIC machines +Whereas a QP decoder that works on both ASCII and EBCDIC platforms would look somewhat like the following (where the @a2e array is omitted for brevity): @@ -1229,7 +1261,7 @@ on the 26 letter English version of the Latin alphabet. Rot13 has the interesting property that alternate subsequent invocations are identity maps (thus rot13 is its own non-trivial inverse in the group of 26 alphabet rotations). Hence the following is a rot13 encoder and decoder that will -work on ASCII and EBCDIC machines: +work on ASCII and EBCDIC platforms: #!/usr/local/bin/perl @@ -1247,13 +1279,13 @@ In one-liner form: To the extent that it is possible to write code that depends on hashing order there may be differences between hashes as stored -on an ASCII based machine and hashes stored on an EBCDIC based machine. +on an ASCII based platform and hashes stored on an EBCDIC based platform. XXX =head1 I18N AND L10N Internationalization(I18N) and localization(L10N) are supported at least -in principle even on EBCDIC machines. The details are system dependent +in principle even on EBCDIC platforms. The details are system dependent and discussed under the L section below. =head1 MULTI OCTET CHARACTER SETS @@ -1349,13 +1381,13 @@ L, L, L, L. =head1 REFERENCES -http://anubis.dkuug.dk/i18n/charmaps +L -http://www.unicode.org/ +L -http://www.unicode.org/unicode/reports/tr16/ +L -http://www.wps.com/texts/codes/ +L B Tom Jennings, September 1999. @@ -1373,7 +1405,7 @@ B Fred B. Wrixon, ISBN 1-57912-040-7, Black Dog & Leventhal Publishers, 1998. -http://www.bobbemer.com/P-BIT.HTM +L B Robert Bemer. =head1 HISTORY diff --git a/pod/perlguts.pod b/pod/perlguts.pod index 33346c6..58e866d 100644 --- a/pod/perlguts.pod +++ b/pod/perlguts.pod @@ -2610,9 +2610,7 @@ you can use C<*s = uv>. =item * Mixing UTF-8 and non-UTF-8 strings is tricky. Use C to get -a new string which is UTF-8 encoded. There are tricks you can use to -delay deciding whether you need to use a UTF-8 string until you get to a -high character - C is one of those. +a new string which is UTF-8 encoded, and then combine them. =back diff --git a/pod/perlhack.pod b/pod/perlhack.pod index b2192d2..ef648e7 100644 --- a/pod/perlhack.pod +++ b/pod/perlhack.pod @@ -518,7 +518,7 @@ you should see something like this: (Then creating the symlinks...) The specifics may vary based on your operating system, of course. -After you see this, you can abort the F script, and you +After it's all done, you will see that the directory you are in has a tree of symlinks to the F directories and files. @@ -2646,6 +2646,61 @@ sizeof() of the field =item * +Assuming the character set is ASCIIish + +Perl can compile and run under EBCDIC platforms. See L. 
+This is transparent for the most part, but because the character sets +differ, you shouldn't use numeric (decimal, octal, nor hex) constants +to refer to characters. You can safely say 'A', but not 0x41. +You can safely say '\n', but not \012. +If a character doesn't have a trivial input form, you can +create a #define for it in both C and C, so that +it resolves to different values depending on the character set being used. +(There are three different EBCDIC character sets defined in C, +so it might be best to insert the #define three times in that file.) + +Also, the range 'A' - 'Z' in ASCII is an unbroken sequence of 26 upper case +alphabetic characters. That is not true in EBCDIC. Nor for 'a' to 'z'. +But '0' - '9' is an unbroken range in both systems. Don't assume anything +about other ranges. + +Many of the comments in the existing code ignore the possibility of EBCDIC, +and may be wrong therefore, even if the code works. +This is actually a tribute to the successful transparent insertion of being +able to handle EBCDIC. without having to change pre-existing code. + +UTF-8 and UTF-EBCDIC are two different encodings used to represent Unicode +code points as sequences of bytes. Macros +with the same names (but different definitions) +in C and C +are used to allow the calling code think that there is only one such encoding. +This is almost always referred to as C, but it means the EBCDIC +version as well. Comments in the code may well be wrong even if the code +itself is right. +For example, the concept of C differs between ASCII and +EBCDIC. +On ASCII platforms, only characters that do not have the high-order +bit set (i.e. whose ordinals are strict ASCII, 0 - 127) +are invariant, and the documentation and comments in the code +may assume that, +often referring to something like, say, C. +The situation differs and is not so simple on EBCDIC machines, but as long as +the code itself uses the C macro appropriately, it +works, even if the comments are wrong. + +=item * + +Assuming the character set is just ASCII + +ASCII is a 7 bit encoding, but bytes have 8 bits in them. The 128 extra +characters have different meanings depending on the locale. Absent a locale, +currently these extra characters are generally considered to be unassigned, +and this has presented some problems. +This is scheduled to be changed in 5.12 so that these characters will +be considered to be Latin-1 (ISO-8859-1). + +=item * + Mixing #define and #ifdef #define BURGLE(x) ... \ @@ -2660,7 +2715,7 @@ you need two separate BURGLE() #defines, one for each #ifdef branch. =item * -Adding stuff after #endif or #else +Adding non-comment stuff after #endif or #else #ifdef SNOSH ... @@ -2836,7 +2891,7 @@ admittedly use them if available to gain some extra speed =item * -Binding together several statements +Binding together several statements in a macro Use the macros STMT_START and STMT_END. diff --git a/pod/perlintern.pod b/pod/perlintern.pod index 0ffe8d1..e622841 100644 --- a/pod/perlintern.pod +++ b/pod/perlintern.pod @@ -484,7 +484,7 @@ semi-permanently (otherwise it might be deleted out from under you the next time the cache is invalidated). - AV* mro_get_linear_isa_c3(HV* stash, I32 level) + AV* mro_get_linear_isa_c3(HV* stash, U32 level) =for hackers Found in file mro.c @@ -503,7 +503,7 @@ semi-permanently (otherwise it might be deleted out from under you the next time the cache is invalidated). 
- AV* mro_get_linear_isa_dfs(HV* stash, I32 level) + AV* mro_get_linear_isa_dfs(HV* stash, U32 level) =for hackers Found in file mro.c diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 4e62fed..068b2f3 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -79,6 +79,16 @@ character semantics. For operations where this determination cannot be made without additional information from the user, Perl decides in favor of compatibility and chooses to use byte semantics. +Under byte semantics, when C is in effect, Perl uses the +semantics associated with the current locale. Absent a C, Perl +currently uses US-ASCII (or Basic Latin in Unicode terminology) byte semantics, +meaning that characters whose ordinal numbers are in the range 128 - 255 are +undefined except for their ordinal numbers. This means that none have case +(upper and lower), nor are any a member of character classes, like C<[:alpha:]> +or C<\w>. +(But all do belong to the C<\W> class or the Perl regular expression extension +C<[:^alpha:]>.) + This behavior preserves compatibility with earlier versions of Perl, which allowed byte semantics in Perl operations only if none of the program's inputs were marked as being as source of Unicode @@ -105,10 +115,8 @@ Otherwise, byte semantics are in effect. The C pragma should be used to force byte semantics on Unicode data. If strings operating under byte semantics and strings with Unicode -character data are concatenated, the new string will be created by -decoding the byte strings as I, even if the -old Unicode string used EBCDIC. This translation is done without -regard to the system's native 8-bit encoding. +character data are concatenated, the new string will have +character semantics. Under character semantics, many operations that formerly operated on bytes now operate on characters. A character in Perl is @@ -135,7 +143,7 @@ occur directly within the literal strings in UTF-8 encoding, or UTF-16. Unicode characters can also be added to a string by using the C<\x{...}> notation. The Unicode code for the desired character, in hexadecimal, should be placed in the braces. For instance, a smiley face is -C<\x{263A}>. This encoding scheme only works for all characters, but +C<\x{263A}>. This encoding scheme works for all characters, but for characters under 0x100, note that Perl may use an 8 bit encoding internally, for optimization and/or backward compatibility. @@ -939,8 +947,8 @@ Level 1 - Basic Unicode Support user-defined character properties [b] to emulate set operations [6] \b \B [7] note that Perl does Full case-folding in matching, not Simple: - for example U+1F88 is equivalent with U+1F00 U+03B9, - not with 1F80. This difference matters for certain Greek + for example U+1F88 is equivalent to U+1F00 U+03B9, + not with 1F80. This difference matters mainly for certain Greek capital letters with certain modifiers: the Full case-folding decomposes the letter, while the Simple case-folding would map it to a single character. @@ -1299,15 +1307,13 @@ readdir, readlink =head2 Forcing Unicode in Perl (Or Unforcing Unicode in Perl) Sometimes (see L) there are -situations where you simply need to force Perl to believe that a byte -string is UTF-8, or vice versa. The low-level calls -utf8::upgrade($bytestring) and utf8::downgrade($utf8string) are +situations where you simply need to force a byte +string into UTF-8, or vice versa. The low-level calls +utf8::upgrade($bytestring) and utf8::downgrade($utf8string[, FAIL_OK]) are the answers. 
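For example (a minimal sketch; the sample strings are illustrative):

    my $bytestring = "\xe4";           # stored as bytes, not marked as UTF-8
    utf8::upgrade($bytestring);        # now stored in UTF-8 internally

    my $utf8string = chr(0x100) . "!"; # contains a character above 255
    if (utf8::downgrade($utf8string, 1)) {   # 1 == FAIL_OK
        # every character fit into a byte
    }
    else {
        # chr(0x100) cannot fit in a byte; without FAIL_OK this would croak
    }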
-Do not use them without careful thought, though: Perl may easily get -very confused, angry, or even crash, if you suddenly change the 'nature' -of scalar like that. Especially careful you have to be if you use the -utf8::upgrade(): any random byte string is not valid UTF-8. +Note that utf8::downgrade() can fail if the string contains characters +that don't fit into a byte. =head2 Using Unicode in XS @@ -1321,7 +1327,7 @@ details. =item * C returns true if the C flag is on and the bytes -pragma is not in effect. C returns true is the C +pragma is not in effect. C returns true if the C flag is on; the bytes pragma is ignored. The C flag being on does B mean that there are any characters of code points greater than 255 (or 127) in the scalar or that there are even any characters @@ -1334,15 +1340,15 @@ Unicode model is not to use UTF-8 until it is absolutely necessary. =item * -C writes a Unicode character code point into +C writes a Unicode character code point into a buffer encoding the code point as UTF-8, and returns a pointer -pointing after the UTF-8 bytes. +pointing after the UTF-8 bytes. It works appropriately on EBCDIC machines. =item * -C reads UTF-8 encoded bytes from a buffer and +C reads UTF-8 encoded bytes from a buffer and returns the Unicode character code point and, optionally, the length of -the UTF-8 byte sequence. +the UTF-8 byte sequence. It works appropriately on EBCDIC machines. =item * @@ -1388,7 +1394,7 @@ two pointers pointing to the same UTF-8 encoded buffer. =item * -C will return a pointer to an UTF-8 encoded buffer +C will return a pointer to a UTF-8 encoded buffer that is C (positive or negative) Unicode characters displaced from the UTF-8 buffer C. Be careful not to overstep the buffer: C will merrily run off the end or the beginning of the @@ -1406,7 +1412,7 @@ output more readable. =item * -C can be used to +C can be used to compare two strings case-insensitively in Unicode. For case-sensitive comparisons you can just use C and C as usual. @@ -1426,6 +1432,27 @@ use characters above that range when mapped into Unicode. Perl's Unicode support will also tend to run slower. Use of locales with Unicode is discouraged. +=head2 Problems with characters whose ordinal numbers are in the range 128 - 255 with no Locale specified + +Without a locale specified, unlike all other characters or code points, +these characters have very different semantics in byte semantics versus +character semantics. +In character semantics they are interpreted as Unicode code points, which means +they are viewed as Latin-1 (ISO-8859-1). +In byte semantics, they are considered to be unassigned characters, +meaning that the only semantics they have is their +ordinal numbers, and that they are not members of various character classes. +None are considered to match C<\w> for example, but all match C<\W>. +Besides these class matches, +the known operations that this affects are those that change the case, +regular expression matching while ignoring case, +and B. +This can lead to unexpected results in which a string's semantics suddenly +change if a code point above 255 is appended to or removed from it, +which changes the string's semantics from byte to character or vice versa. +This behavior is scheduled to change in version 5.12, but in the meantime, +a workaround is to always call utf8::upgrade($string). 
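A short illustration of the difference, on an ASCII platform with no C<use locale> in effect and a perl older than 5.12:

    my $s = "\xe4";     # LATIN SMALL LETTER A WITH DIAERESIS in Latin-1
    print $s =~ /\w/ ? "word" : "not a word";   # "not a word": byte semantics

    utf8::upgrade($s);  # same character, now with character semantics
    print $s =~ /\w/ ? "word" : "not a word";   # "word"
    print uc $s;        # "\xc4" (LATIN CAPITAL LETTER A WITH DIAERESIS)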
+ =head2 Interaction with Extensions When Perl exchanges data with an extension, the extension should be diff --git a/pod/perlunifaq.pod b/pod/perlunifaq.pod index b291334..83edc7d 100644 --- a/pod/perlunifaq.pod +++ b/pod/perlunifaq.pod @@ -145,11 +145,13 @@ strings differently, depending on the internal state. Affected are C, C, C, C, C<\U>, C<\L>, C<\u>, C<\l>, C<\d>, C<\s>, C<\w>, C<\D>, C<\S>, C<\W>, C, C<(?i:...)>, -C. +C, and C (though this last should not cause any real +problems). To force Unicode semantics, you can upgrade the internal representation to -by doing C. This does not change strings that were -already upgraded. +by doing C. This can be used +safely on any string, as it checks and does not change strings that have +already been upgraded. For a more detailed discussion, see L on CPAN. diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index 86360d4..36f729c 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -24,7 +24,7 @@ Unicode 1.0 was released in October 1991, and 4.0 in April 2003. A Unicode I is an abstract entity. It is not bound to any particular integer width, especially not to the C language C. Unicode is language-neutral and display-neutral: it does not encode the -language of the text and it does not define fonts or other graphical +language of the text and it does not generally define fonts or other graphical layout details. Unicode operates on characters and on text built from those characters. @@ -125,8 +125,7 @@ serious Unicode work. The maintenance release 5.6.1 fixed many of the problems of the initial Unicode implementation, but for example regular expressions still do not work with Unicode in 5.6.1. -B is no longer -necessary.> In earlier releases the C pragma was used to declare +B is needed only in much more restricted circumstances.> In earlier releases the C pragma was used to declare that operations in the current block or file would be Unicode-aware. This model was found to be wrong, or at least clumsy: the "Unicodeness" is now carried with the data, instead of being attached to the @@ -514,8 +513,8 @@ CAPITAL LETTER As should be considered equal, or even As of any case. The long answer is that you need to consider character normalization and casing issues: see L, Unicode Technical Reports #15 and #21, I and I, http://www.unicode.org/unicode/reports/tr15/ and -http://www.unicode.org/unicode/reports/tr21/ +Mappings>, L and +L As of Perl 5.8.0, the "Full" case-folding of I is implemented. @@ -538,7 +537,7 @@ C<0x00C1> > C<0x00C0>. The long answer is that "it depends", and a good answer cannot be given without knowing (at the very least) the language context. See L, and I -http://www.unicode.org/unicode/reports/tr10/ +L =back @@ -552,7 +551,7 @@ Character Ranges and Classes Character ranges in regular expression character classes (C) and in the C (also known as C) operator are not magically -Unicode-aware. What this means that C<[A-Za-z]> will not magically start +Unicode-aware. What this means is that C<[A-Za-z]> will not magically start to mean "all alphabetic letters"; not that it does mean that even for 8-bit characters, you should be using C in that case. @@ -603,11 +602,12 @@ Unicode; for that, see the earlier I/O discussion. How Do I Know Whether My String Is In Unicode? -You shouldn't care. No, you really shouldn't. No, really. If you -have to care--beyond the cases described above--it means that we -didn't get the transparency of Unicode quite right. +You shouldn't have to care. 
But you may, because currently the semantics of the +characters whose ordinals are in the range 128 to 255 is different depending on +whether the string they are contained within is in Unicode or not. +(See L.) -Okay, if you insist: +To determine if a string is in Unicode, use: print utf8::is_utf8($string) ? 1 : 0, "\n"; @@ -634,8 +634,8 @@ C<$a> will stay byte-encoded. Sometimes you might really need to know the byte length of a string instead of the character length. For that use either the -C function or the C pragma and its only -defined function C: +C function or the C pragma and +the C function: my $unicode = chr(0x100); print length($unicode), "\n"; # will print 1 @@ -653,7 +653,7 @@ Use the C package to try converting it. For example, use Encode 'decode_utf8'; - + if (eval { decode_utf8($string, Encode::FB_CROAK); 1 }) { # $string is valid utf8 } else { @@ -724,18 +724,20 @@ or: $Unicode = pack("U0a*", $bytes); -You can convert well-formed UTF-8 to a sequence of bytes, but if -you just want to convert random binary data into UTF-8, you can't. -B. You can -use C for the former, and you can create -well-formed Unicode data by C. +You can find the bytes that make up a UTF-8 sequence with + + @bytes = unpack("C*", $Unicode_string) + +and you can create well-formed Unicode with + + $Unicode_string = pack("U*", 0xff, ...) =item * How Do I Display Unicode? How Do I Input Unicode? -See http://www.alanwood.net/unicode/ and -http://www.cl.cam.ac.uk/~mgk25/unicode.html +See L and +L =item * @@ -787,44 +789,44 @@ show a decimal number in hexadecimal. If you have just the Unicode Consortium -http://www.unicode.org/ +L =item * Unicode FAQ -http://www.unicode.org/unicode/faq/ +L =item * Unicode Glossary -http://www.unicode.org/glossary/ +L =item * Unicode Useful Resources -http://www.unicode.org/unicode/onlinedat/resources.html +L =item * Unicode and Multilingual Support in HTML, Fonts, Web Browsers and Other Applications -http://www.alanwood.net/unicode/ +L =item * UTF-8 and Unicode FAQ for Unix/Linux -http://www.cl.cam.ac.uk/~mgk25/unicode.html +L =item * Legacy Character Sets -http://www.czyborra.com/ -http://www.eki.ee/letter/ +L +L =item * diff --git a/proto.h b/proto.h index f152635..bf5e1ac 100644 --- a/proto.h +++ b/proto.h @@ -3969,6 +3969,11 @@ PERL_CALLCONV void Perl_reginitcolors(pTHX); #define PERL_ARGS_ASSERT_SV_UTF8_UPGRADE \ assert(sv) +/* PERL_CALLCONV STRLEN Perl_sv_utf8_upgrade_nomg(pTHX_ SV *sv) + __attribute__nonnull__(pTHX_1); */ +#define PERL_ARGS_ASSERT_SV_UTF8_UPGRADE_NOMG \ + assert(sv) + PERL_CALLCONV bool Perl_sv_utf8_downgrade(pTHX_ SV *const sv, const bool fail_ok) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_SV_UTF8_DOWNGRADE \ diff --git a/sv.c b/sv.c index 9fe0a3f..917c897 100644 --- a/sv.c +++ b/sv.c @@ -3146,19 +3146,27 @@ Perl_sv_2bool(pTHX_ register SV *const sv) Converts the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. +Will C on C if appropriate. Always sets the SvUTF8 flag to avoid future validity checks even -if all the bytes have hibit clear. +if the whole string is the same in UTF-8 as not. +Returns the number of bytes in the converted string This is not as a general purpose byte encoding to Unicode interface: use the Encode extension for that. +=for apidoc sv_utf8_upgrade_nomg + +Like sv_utf8_upgrade, but doesn't do magic on C + =for apidoc sv_utf8_upgrade_flags Converts the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. 
Always sets the SvUTF8 flag to avoid future validity checks even -if all the bytes have hibit clear. If C has C bit set, -will C on C if appropriate, else not. C and +if all the bytes are invariant in UTF-8. If C has C bit set, +will C on C if appropriate, else not. +Returns the number of bytes in the converted string +C and C are implemented in terms of this function. This is not as a general purpose byte encoding to Unicode interface: @@ -3199,7 +3207,7 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *const sv, const I32 flags) sv_recode_to_utf8(sv, PL_encoding); else { /* Assume Latin-1/EBCDIC */ /* This function could be much more efficient if we - * had a FLAG in SVs to signal if there are any hibit + * had a FLAG in SVs to signal if there are any variant * chars in the PV. Given that there isn't such a flag * make the loop as fast as possible. */ const U8 * const s = (U8 *) SvPVX_const(sv); @@ -3208,7 +3216,7 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *const sv, const I32 flags) while (t < e) { const U8 ch = *t++; - /* Check for hi bit */ + /* Check for variant */ if (!NATIVE_IS_INVARIANT(ch)) { STRLEN len = SvCUR(sv); /* *Currently* bytes_to_utf8() adds a '\0' after every string @@ -3228,7 +3236,7 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *const sv, const I32 flags) break; } } - /* Mark as UTF-8 even if no hibit - saves scanning loop */ + /* Mark as UTF-8 even if no variant - saves scanning loop */ SvUTF8_on(sv); } return SvCUR(sv); @@ -3238,7 +3246,8 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *const sv, const I32 flags) =for apidoc sv_utf8_downgrade Attempts to convert the PV of an SV from characters to bytes. -If the PV contains a character beyond byte, this conversion will fail; +If the PV contains a character that cannot fit +in a byte, this conversion will fail; in this case, either returns false or, if C is not true, croaks. diff --git a/t/uni/fold.t b/t/uni/fold.t index 789ba67..f6f467c 100644 --- a/t/uni/fold.t +++ b/t/uni/fold.t @@ -15,7 +15,9 @@ if (open(CF, $CF)) { my @CF; while () { - # Skip S since we are going for 'F'ull case folding + # Skip S since we are going for 'F'ull case folding. I is obsolete starting + # with Unicode 3.2, but leaving it in does no harm, and allows backward + # compatibility if (/^([0-9A-F]+); ([CFI]); ((?:[0-9A-F]+)(?: [0-9A-F]+)*); \# (.+)/) { next if EBCDIC && hex $1 < 0x100; push @CF, [$1, $2, $3, $4]; diff --git a/utf8.c b/utf8.c index ecbc4ea..8243793 100644 --- a/utf8.c +++ b/utf8.c @@ -247,9 +247,9 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len) =for apidoc is_utf8_char Tests if some arbitrary number of bytes begins in a valid UTF-8 -character. Note that an INVARIANT (i.e. ASCII) character is a valid -UTF-8 character. The actual number of bytes in the UTF-8 character -will be returned if it is valid, otherwise 0. +character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines) +character is a valid UTF-8 character. The actual number of bytes in the UTF-8 +character will be returned if it is valid, otherwise 0. =cut */ STRLEN @@ -648,7 +648,7 @@ Returns the Unicode code point of the first character in the string C which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. -This function should only be used when returned UV is considered +This function should only be used when the returned UV is considered an index into the Unicode semantic tables (e.g. swashes). 
If C does not point to a well-formed UTF-8 character, zero is @@ -772,7 +772,7 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off) /* =for apidoc utf8_to_bytes -Converts a string C of length C from UTF-8 into byte encoding. +Converts a string C of length C from UTF-8 into native byte encoding. Unlike C, this over-writes the original string, and updates len to contain the new length. Returns zero on failure, setting C to -1. @@ -817,12 +817,13 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) /* =for apidoc bytes_from_utf8 -Converts a string C of length C from UTF-8 into byte encoding. +Converts a string C of length C from UTF-8 into native byte encoding. Unlike C but like C, returns a pointer to the newly-created string, and updates C to contain the new length. Returns the original string if no conversion occurs, C is unchanged. Do nothing if C points to 0. Sets C to -0 if C is converted or contains all 7bit characters. +0 if C is converted or consists entirely of characters that are invariant +in UTF-8 (i.e., US-ASCII on non-EBCDIC machines). =cut */ @@ -874,11 +875,14 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) /* =for apidoc bytes_to_utf8 -Converts a string C of length C from ASCII into UTF-8 encoding. +Converts a string C of length C from the native encoding into UTF-8. Returns a pointer to the newly-created string, and sets C to reflect the new length. -If you want to convert to UTF-8 from other encodings than ASCII, +A NUL character will be written after the end of the string. + +If you want to convert to UTF-8 from encodings other than +the native (Latin-1 or EBCDIC), see sv_recode_to_utf8(). =cut