From: Nick Ing-Simmons Date: Sun, 18 Nov 2001 11:00:34 +0000 (+0000) Subject: Integrate mainline - a few Devel::Peak fails. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=07c0ef9631ba1cc35934b95864a88ebd642fda81;p=p5sagit%2Fp5-mst-13.2.git Integrate mainline - a few Devel::Peak fails. p4raw-id: //depot/perlio@13066 --- 07c0ef9631ba1cc35934b95864a88ebd642fda81 diff --cc lib/Pod/ParseLink.pm index 0000000,cced975..e812f24 mode 000000,100644..100644 --- a/lib/Pod/ParseLink.pm +++ b/lib/Pod/ParseLink.pm @@@ -1,0 -1,161 +1,161 @@@ + # Pod::ParseLink -- Parse an L<> formatting code in POD text. + # $Id: ParseLink.pm,v 1.1 2001/11/15 07:58:57 eagle Exp $ + # + # Copyright 2001 by Russ Allbery + # + # This program is free software; you may redistribute it and/or modify it + # under the same terms as Perl itself. + # + # This module implements parsing of the text of an L<> formatting code as + # defined in perlpodspec. It should be suitable for any POD formatter. It + # exports only one function, parselink(), which returns the five-item parse + # defined in perlpodspec. + # + # Perl core hackers, please note that this module is also separately + # maintained outside of the Perl core as part of the podlators. Please send + # me any patches at the address above in addition to sending them to the + # standard Perl mailing lists. + + ############################################################################## + # Modules and declarations + ############################################################################## + + package Pod::ParseLink; + + require 5.004; + + use strict; + use vars qw(@EXPORT @ISA $VERSION); + + use Exporter; + @ISA = qw(Exporter); + @EXPORT = qw(parselink); + + # Don't use the CVS revision as the version, since this module is also in Perl + # core and too many things could munge CVS magic revision strings. This + # number should ideally be the same as the CVS revision in podlators, however. 
+ $VERSION = 1.01; + + + ############################################################################## + # Implementation + ############################################################################## + + # Parse the name and section portion of a link into a name and section. + sub _parse_section { + my ($link) = @_; + $link =~ s/^\s+//; + $link =~ s/\s+$//; + + # If the whole link is enclosed in quotes, interpret it all as a section + # even if it contains a slash. + return (undef, $1) if (/^"\s*(.*?)\s*"$/); + + # Split into page and section on slash, and then clean up quoting in the + # section. If there is no section and the name contains spaces, also + # guess that it's an old section link. + my ($page, $section) = split (/\s*\/\s*/, $link, 2); - $section =~ s/^"\s*(.*?)\s*"$/$1/ if $section; - if ($page && $page =~ / / && !defined ($section)) { ++ $section =~ s/^"\s*(.*?)\s*"$/$1/; ++ if ($page =~ / / && !defined ($section)) { + $section = $page; + $page = undef; + } else { + $page = undef unless $page; + $section = undef unless $section; + } + return ($page, $section); + } + + # Infer link text from the page and section. + sub _infer_text { + my ($page, $section) = @_; + my $inferred; + if ($page && !$section) { + $inferred = $page; + } elsif (!$page && $section) { + $inferred = '"' . $section . '"'; + } elsif ($page && $section) { + $inferred = '"' . $section . '" in ' . $page; + } + return $inferred; + } + + # Given the contents of an L<> formatting code, parse it into the link text, + # the possibly inferred link text, the name or URL, the section, and the type + # of link (pod, man, or url). 
+ sub parselink { + my ($link) = @_; + $link =~ s/\s+/ /g; + if ($link =~ /\A\w+:[^:\s]\S*\Z/) { + return (undef, $link, $link, undef, 'url'); + } else { + my $text; + if ($link =~ /\|/) { + ($text, $link) = split (/\|/, $link, 2); + } + my ($name, $section) = _parse_section ($link); + my $inferred = $text || _infer_text ($name, $section); + my $type = ($name =~ /\(\S*\)/) ? 'man' : 'pod'; + return ($text, $inferred, $name, $section, $type); + } + } + + + ############################################################################## + # Module return value and documentation + ############################################################################## + + # Ensure we evaluate to true. + 1; + __END__ + + =head1 NAME + + Pod::ParseLink -- Parse an L<> formatting code in POD text + + =head1 SYNOPSIS + + use Pod::ParseLink; + my ($text, $inferred, $name, $section, $type) = parselink ($link); + + =head1 DESCRIPTION + + This module only provides a single function, parselink(), which takes the + text of an LEE formatting code and parses it. It returns the anchor + text for the link (if any was given), the anchor text possibly inferred from + the name and section, the name or URL, the section if any, and the type of + link. The type will be one of 'url', 'pod', or 'man', indicating a URL, a + link to a POD page, or a link to a Unix manual page. + + Parsing is implemented per L. For backward compatibility, + links where there is no section and name contains spaces, or links where the + entirety of the link (except for the anchor text if given) is enclosed in + double-quotes are interpreted as links to a section (LE/sectionE). + + The inferred anchor text is implemented per L: + + L => L + L => L<"section"|/section> + L => L<"section" in name|name/section> + + The name may contain embedded EEE and ZEE formatting codes, + and the section, anchor text, and inferred anchor text may contain any + formatting codes. 
Any double quotes around the name or section are removed + as part of the parsing, as are any leading or trailing whitespace. + + No attempt is made to resolve formatting codes. The caller must be prepared + to do that either before or after calling parselink(). (This is because + interpretation of EEE formatting codes may vary by formatter.) + + =head1 AUTHOR + + Russ Allbery . + + =head1 COPYRIGHT AND LICENSE + + Copyright 2001 by Russ Allbery . + + This program is free software; you may redistribute it and/or modify it + under the same terms as Perl itself. + + =cut diff --cc lib/Thread.pm index 0000000,fc39769..4e88706 mode 000000,100644..100644 --- a/lib/Thread.pm +++ b/lib/Thread.pm @@@ -1,0 -1,341 +1,339 @@@ + package Thread; + ++$VERSION = '2.00'; ++ + use strict; + -our($VERSION, $ithreads, $othreads); ++our $ithreads; ++our $othreads; + + BEGIN { - $VERSION = '2.00'; + use Config; + $ithreads = $Config{useithreads}; + $othreads = $Config{use5005threads}; + } + + require Exporter; + use XSLoader (); -our(@ISA, @EXPORT, @EXPORT_OK); ++our($VERSION, @ISA, @EXPORT, @EXPORT_OK); + + @ISA = qw(Exporter); + + BEGIN { + if ($ithreads) { + @EXPORT = qw(share cond_wait cond_broadcast cond_signal unlock) + } elsif ($othreads) { + @EXPORT_OK = qw(cond_signal cond_broadcast cond_wait); + } + push @EXPORT_OK, qw(async yield); + } + + =head1 NAME + + Thread - manipulate threads in Perl + + =head1 CAVEAT + + Perl has two thread models. + + In Perl 5.005 the thread model was that all data is implicitly shared + and shared access to data has to be explicitly synchronized. + This model is called "5005threads". + + In Perl 5.6 a new model was introduced in which all is was thread + local and shared access to data has to be explicitly declared. + This model is called "ithreads", for "interpreter threads". 
+ + In Perl 5.6 the ithreads model was not available as a public API, + only as an internal API that was available for extension writers, + and to implement fork() emulation on Win32 platforms. + + In Perl 5.8 the ithreads model became available through the C<threads> + module. + + Neither model is configured by default into Perl (except, as mentioned + above, in Win32 ithreads are always available.) You can see your + Perl's threading configuration by running C<perl -V> and looking for + the I<use...threads> variables, or inside script by C<use Config;> + and testing for C<$Config{use5005threads}> and C<$Config{useithreads}>. + + For old code and interim backwards compatibility, the Thread module + has been reworked to function as a frontend for both 5005threads and + ithreads. + + Note that the compatibility is not complete: because the data sharing + models are directly opposed, anything to do with data sharing has to + be thought differently. With the ithreads you must explicitly share() + variables between the threads. + + For new code the use of the C<Thread> module is discouraged and + the direct use of the C<threads> and C<threads::shared> modules + is encouraged instead. + + Finally, note that there are many known serious problems with the + 5005threads, one of the least of which is that regular expression + match variables like $1 are not threadsafe, that is, they easily get + corrupted by competing threads. Other problems include more insidious + data corruption and mysterious crashes. You are seriously urged to + use ithreads instead. + + =head1 SYNOPSIS + + use Thread; + + my $t = Thread->new(\&start_sub, @start_args); + + $result = $t->join; + $result = $t->eval; + $t->detach; + + if ($t->done) { + $t->join; + } + + if($t->equal($another_thread)) { + # ...
+ } + + yield(); + + my $tid = Thread->self->tid; + + lock($scalar); + lock(@array); + lock(%hash); + + lock(\&sub); # not available with ithreads + + $flags = $t->flags; # not available with ithreads + + my @list = Thread->list; # not available with ithreads + + unlock(...); # not available with the 5.005 threads + + use Thread 'async'; + + =head1 DESCRIPTION + + The C module provides multithreading support for perl. + + =head1 FUNCTIONS + + =over 8 + + =item $thread = Thread->new(\&start_sub) + + =item $thread = Thread->new(\&start_sub, LIST) + + C starts a new thread of execution in the referenced subroutine. The + optional list is passed as parameters to the subroutine. Execution + continues in both the subroutine and the code after the C call. + + C returns a thread object representing the newly created + thread. + + =item lock VARIABLE + + C places a lock on a variable until the lock goes out of scope + (with ithreads you can also explicitly unlock()). + + If the variable is locked by another thread, the C call will + block until it's available. C is recursive, so multiple calls + to C are safe--the variable will remain locked until the + outermost lock on the variable goes out of scope. + + Locks on variables only affect C calls--they do I affect normal + access to a variable. (Locks on subs are different, and covered in a bit.) + If you really, I want locks to block access, then go ahead and tie + them to something and manage this yourself. This is done on purpose. + While managing access to variables is a good thing, Perl doesn't force + you out of its living room... + + If a container object, such as a hash or array, is locked, all the + elements of that container are not locked. For example, if a thread + does a C, any other thread doing a C won't + block. + + With 5005threads you may also C a sub, using C. + Any calls to that sub from another thread will block until the lock + is released. 
This behaviour is not equivalent to declaring the sub + with the C<locked> attribute. The C<locked> attribute serializes + access to a subroutine, but allows different threads non-simultaneous + access. C<lock &sub>, on the other hand, will not allow I<any> other + thread access for the duration of the lock. + + Finally, C<lock> will traverse up references exactly I<one> level. + C<lock(\$a)> is equivalent to C<lock($a)>, while C<lock(\\$a)> is not. + + =item async BLOCK; + + C<async> creates a thread to execute the block immediately following + it. This block is treated as an anonymous sub, and so must have a + semi-colon after the closing brace. Like C<new>, C<async> + returns a thread object. + + =item Thread->self + + The C<Thread-E<gt>self> function returns a thread object that represents + the thread making the C<Thread-E<gt>self> call. + + =item cond_wait VARIABLE + + The C<cond_wait> function takes a B<locked> variable as + a parameter, unlocks the variable, and blocks until another thread + does a C<cond_signal> or C<cond_broadcast> for that same locked + variable. The variable that C<cond_wait> blocked on is relocked + after the C<cond_wait> is satisfied. If there are multiple threads + C<cond_wait>ing on the same variable, all but one will reblock waiting + to reacquire the lock on the variable. (So if you're only using + C<cond_wait> for synchronization, give up the lock as soon as + possible.) + + =item cond_signal VARIABLE + + The C<cond_signal> function takes a locked variable as a parameter and + unblocks one thread that's C<cond_wait>ing on that variable. If more than + one thread is blocked in a C<cond_wait> on that variable, only one (and + which one is indeterminate) will be unblocked. + + If there are no threads blocked in a C<cond_wait> on the variable, + the signal is discarded. + + =item cond_broadcast VARIABLE + + The C<cond_broadcast> function works similarly to C<cond_signal>. + C<cond_broadcast>, though, will unblock B<all> the threads that are + blocked in a C<cond_wait> on the locked variable, rather than only + one. + + =item yield + + The C<yield> function allows another thread to take control of the + CPU. The exact results are implementation-dependent.
+ + =back + + =head1 METHODS + + =over 8 + + =item join + + C waits for a thread to end and returns any values the thread + exited with. C will block until the thread has ended, though + it won't block if the thread has already terminated. + + If the thread being Ced Cd, the error it died with will + be returned at this time. If you don't want the thread performing + the C to die as well, you should either wrap the C in + an C or use the C thread method instead of C. + + =item eval + + The C method wraps an C around a C, and so waits for + a thread to exit, passing along any values the thread might have returned. + Errors, of course, get placed into C<$@>. (Not available with ithreads.) + + =item detach + + C tells a thread that it is never going to be joined i.e. + that all traces of its existence can be removed once it stops running. + Errors in detached threads will not be visible anywhere - if you want + to catch them, you should use $SIG{__DIE__} or something like that. + + =item equal + + C tests whether two thread objects represent the same thread and + returns true if they do. + + =item tid + + The C method returns the tid of a thread. The tid is + a monotonically increasing integer assigned when a thread is + created. The main thread of a program will have a tid of zero, + while subsequent threads will have tids assigned starting with one. + + =item flags + + The C method returns the flags for the thread. This is the + integer value corresponding to the internal flags for the thread, + and the value may not be all that meaningful to you. + (Not available with ithreads.) + + =item done + + The C method returns true if the thread you're checking has + finished, and false otherwise. (Not available with ithreads.) + + =back + + =head1 LIMITATIONS + + The sequence number used to assign tids is a simple integer, and no + checking is done to make sure the tid isn't currently in use. 
If a + program creates more than 2**32 - 1 threads in a single run, threads + may be assigned duplicate tids. This limitation may be lifted in + a future version of Perl. + + =head1 SEE ALSO + + L (not available with 5005threads) + + L, L, L, + L (not available with ithreads) + + =cut + + # + # Methods + # + + # + # Exported functions + # + + sub async (&) { + return Thread->new($_[0]); + } + + sub eval { + return eval { shift->join; }; + } + + sub unimplemented { + print $_[0], " unimplemented with ", + $Config{useithreads} ? "ithreads" : "5005threads", "\n"; + + } + + sub unimplement { + for my $m (@_) { + no strict 'refs'; + *{"Thread::$m"} = sub { unimplemented $m }; + } + } + + BEGIN { + if ($ithreads) { - if ($othreads) { - require Carp; - Carp::croak("This Perl has both ithreads and 5005threads (serious malconfiguration)"); - } + XSLoader::load 'threads'; + for my $m (qw(new join detach yield self tid equal)) { + no strict 'refs'; + *{"Thread::$m"} = \&{"threads::$m"}; + } + XSLoader::load 'threads::shared'; + for my $m (qw(cond_signal cond_broadcast cond_wait unlock share)) { + no strict 'refs'; + *{"Thread::$m"} = \&{"threads::shared::${m}_enabled"}; + } + # trying to unimplement eval gives redefined warning + unimplement(qw(list done flags)); + } elsif ($othreads) { + XSLoader::load 'Thread'; + unimplement(qw(unlock)); + } else { + require Carp; - Carp::croak("This Perl has neither ithreads nor 5005threads"); ++ Carp::croak("This Perl has neither ithreads not 5005threads"); + } + } + + 1; diff --cc pod/perluniintro.pod index 0000000,cd978d0..cdd0b40 mode 000000,100644..100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@@ -1,0 -1,698 +1,689 @@@ + =head1 NAME + + perluniintro - Perl Unicode introduction + + =head1 DESCRIPTION + + This document gives a general idea of Unicode and how to use Unicode + in Perl. 
+ + =head2 Unicode + + Unicode is a character set standard with plans to cover all of the + writing systems of the world, plus many other symbols. + + Unicode and ISO/IEC 10646 are coordinated standards that provide code + points for the characters in almost all modern character set standards, + covering more than 30 writing systems and hundreds of languages, + including all commercially important modern languages. All characters + in the largest Chinese, Japanese, and Korean dictionaries are also + encoded. The standards will eventually cover almost all characters in + more than 250 writing systems and thousands of languages. + + A Unicode I is an abstract entity. It is not bound to any + particular integer width, and especially not to the C language C. + Unicode is language neutral and display neutral: it doesn't encode the + language of the text, and it doesn't define fonts or other graphical + layout details. Unicode operates on characters and on text built from + those characters. + + Unicode defines characters like C or C, and then unique numbers for those, hexadecimal + 0x0041 or 0x03B1 for those particular characters. Such unique + numbers are called I. + + The Unicode standard prefers using hexadecimal notation for the code + points. (In case this notation, numbers like 0x0041, is unfamiliar to + you, take a peek at a later section, L.) + The Unicode standard uses the notation C, + which gives the hexadecimal code point, and the normative name of + the character. + + Unicode also defines various I for the characters, like + "uppercase" or "lowercase", "decimal digit", or "punctuation": + these properties are independent of the names of the characters. + Furthermore, various operations on the characters like uppercasing, + lowercasing, and collating (sorting), are defined. + + A Unicode character consists either of a single code point, or a + I (like C), followed by one or + more I (like C). This sequence of + a base character and modifiers is called a I. 
+ + Whether to call these combining character sequences, as a whole, + "characters" depends on your point of view. If you are a programmer, you + probably would tend towards seeing each element in the sequences as one + unit, one "character", but from the user viewpoint, the sequence as a + whole is probably considered one "character", since that's probably what + it looks like in the context of the user's language. + + With this "as a whole" view of characters, the number of characters is + open-ended. But in the programmer's "one unit is one character" point of + view, the concept of "characters" is more deterministic, and so we take + that point of view in this document: one "character" is one Unicode + code point, be it a base character or a combining character. + + For some of the combinations there are I characters, + for example C is defined as + a single code point. These precomposed characters are, however, + often available only for some combinations, and mainly they are + meant to support round-trip conversions between Unicode and legacy + standards (like the ISO 8859), and in general case the composing + method is more extensible. To support conversion between the + different compositions of the characters, various I are also defined. + + Because of backward compatibility with legacy encodings, the "a unique + number for every character" breaks down a bit: "at least one number + for every character" is closer to truth. (This happens when the same + character has been encoded in several legacy encodings.) The converse + is also not true: not every code point has an assigned character. + Firstly, there are unallocated code points within otherwise used + blocks. Secondly, there are special Unicode control characters that + do not represent true characters. + + A common myth about Unicode is that it would be "16-bit", that is, + 0x10000 (or 65536) characters from 0x0000 to 0xFFFF. 
B<This is untrue.> + Since Unicode 2.0 Unicode has been defined all the way up to 21 bits + (0x10FFFF), and since 3.1 characters have been defined beyond 0xFFFF. + The first 0x10000 characters are called the I<Plane 0>, or the I<Basic Multilingual Plane> (BMP). With the Unicode 3.1, 17 planes in all are + defined (but nowhere near full of defined characters yet). + + Another myth is that the 256-character blocks have something to do + with languages: a block per language. B<This is also untrue.> + The division into the blocks exists but it is almost completely + accidental, an artifact of how the characters have been historically + allocated. Instead, there is a concept called I<scripts>, which may + be more useful: there is C<Latin> script, C<Greek> script, and so on. + Scripts usually span several parts of several blocks. For further + information see L<Unicode::UCD>. + + The Unicode code points are just abstract numbers. To input and + output these abstract numbers, the numbers must be I<encoded> somehow. + Unicode defines several I<character encoding forms>, of which I<UTF-8> + is perhaps the most popular. UTF-8 is a variable length encoding that + encodes Unicode characters as 1 to 6 bytes (only 4 with the currently + defined characters). Other encodings are UTF-16 and UTF-32 and their + big and little endian variants (UTF-8 is byteorder independent). + The ISO/IEC 10646 defines the UCS-2 and UCS-4 encoding forms. + + For more information about encodings, for example to learn what + I<surrogates> and I<byte order marks> (BOMs) are, see L<perlunicode>. + + =head2 Perl's Unicode Support + + Starting from Perl 5.6.0, Perl has had the capability of handling + Unicode natively. The first recommended release for serious Unicode + work is Perl 5.8.0, however. The maintenance release 5.6.1 fixed many + of the problems of the initial implementation of Unicode, but for + example regular expressions didn't really work with Unicode. + + B<Starting from Perl 5.8.0 the use of C<use utf8;> is no longer + necessary.> In earlier releases the C<utf8> pragma was used to declare + that operations in the current block or file would be Unicode-aware.
+ This model was found to be wrong, or at least clumsy: the Unicodeness + is now carried with the data, not attached to the operations. (There + is one remaining case where an explicit C is needed: if your + Perl script is in UTF-8, you can use UTF-8 in your variable and + subroutine names, and in your string and regular expression literals, + by saying C. This is not the default because that would + break existing scripts having legacy 8-bit data in them.) + + =head2 Perl's Unicode Model + + Perl supports both the old, pre-5.6, model of strings of eight-bit + native bytes, and strings of Unicode characters. The principle is + that Perl tries to keep its data as eight-bit bytes for as long as + possible, but as soon as Unicodeness cannot be avoided, the data is + transparently upgraded to Unicode. + + The internal encoding of Unicode in Perl is UTF-8. The internal + encoding is normally hidden, however, and one need not and should not + worry about the internal encoding at all: it is all just characters. + + Perl 5.8.0 will also support Unicode on EBCDIC platforms. There the + support is somewhat harder to implement since additional conversions + are needed at every step. Because of these difficulties the Unicode + support won't be quite as full as in other, mainly ASCII-based, + platforms (the Unicode support will be better than in the 5.6 series, + which didn't work much at all for EBCDIC platform). On EBCDIC + platforms the internal encoding form used is UTF-EBCDIC. + + =head2 Creating Unicode + + To create Unicode literals, use the C<\x{...}> notation in + doublequoted strings: + + my $smiley = "\x{263a}"; + + Similarly for regular expression literals + + $smiley =~ /\x{263a}/; + + At run-time you can use C: + + my $hebrew_alef = chr(0x05d0); + + (See L for how to find all these numeric codes.) + + Naturally, C will do the reverse: turn a character to a code point. 
+ + Note that C<\x..>, C<\x{..}> and C for arguments less than + 0x100 (decimal 256) will generate an eight-bit character for backward + compatibility with older Perls. For arguments of 0x100 or more, + Unicode will always be produced. If you want UTF-8 always, use + C instead of C<\x..>, C<\x{..}>, or C. + + You can also use the C pragma to invoke characters + by name in doublequoted strings: + + use charnames ':full'; + my $arabic_alef = "\N{ARABIC LETTER ALEF}"; + + And, as mentioned above, you can also C numbers into Unicode + characters: + + my $georgian_an = pack("U", 0x10a0); + + =head2 Handling Unicode + + Handling Unicode is for the most part transparent: just use the + strings as usual. Functions like C, C, and + C will work on the Unicode characters; regular expressions + will work on the Unicode characters (see L and L). + + Note that Perl does B consider combining character sequences + to be characters, such for example + + use charnames ':full'; + print length("\N{LATIN CAPITAL LETTER A}\N{COMBINING ACUTE ACCENT}"), "\n"; + + will print 2, not 1. The only exception is that regular expressions + have C<\X> for matching a combining character sequence. + + When life is not quite so transparent is working with legacy + encodings, and I/O, and certain special cases. + + =head2 Legacy Encodings + + When you combine legacy data and Unicode the legacy data needs + to be upgraded to Unicode. Normally ISO 8859-1 (or EBCDIC, if + applicable) is assumed. You can override this assumption by + using the C pragma, for example + + use encoding 'latin2'; # ISO 8859-2 + + in which case literals (string or regular expression) and chr/ord + in your whole script are assumed to produce Unicode characters from + ISO 8859-2 code points. Note that the matching for the encoding + names is forgiving: instead of C you could have said + C, or C, and so forth. 
With just + + use encoding; + + first the environment variable C will be consulted, + and if that doesn't exist, ISO 8859-1 (Latin 1) will be assumed. + + The C module knows about many encodings and it has interfaces + for doing conversions between those encodings: + + use Encode 'from_to'; + from_to($data, "iso-8859-3", "utf-8"); # from legacy to utf-8 + + =head2 Unicode I/O + + Normally writing out Unicode data + - print FH chr(0x100), "\n"; ++ print chr(0x100), "\n"; + -will print out the raw UTF-8 bytes, but you will get a warning -out of that if you use C<-w> or C. To avoid the -warning open the stream explicitly in UTF-8: ++will print out the raw UTF-8 bytes. + - open FH, ">:utf8", "file"; - -and on already open streams use C: - - binmode(STDOUT, ":utf8"); - -Reading in correctly formed UTF-8 data will not magically turn ++But reading in correctly formed UTF-8 data will not magically turn + the data into Unicode in Perl's eyes. + + You can use either the C<':utf8'> I/O discipline when opening files + + open(my $fh,'<:utf8', 'anything'); + my $line_of_utf8 = <$fh>; + + The I/O disciplines can also be specified more flexibly with + the C pragma; see L: + - use open ':utf8'; # input and output default discipline will be UTF-8 - open X, ">file"; - print X chr(0x100), "\n"; ++ use open ':utf8'; # input and output will be UTF-8 ++ open X, ">utf8"; ++ print X chr(0x100), "\n"; # this would have been UTF-8 without the pragma + close X; - open Y, "); # this should print 0x100 + close Y; + + With the C pragma you can use the C<:locale> discipline + + $ENV{LANG} = 'ru_RU.KOI8-R'; + # the :locale will probe the locale environment variables like LANG + use open OUT => ':locale'; # russki parusski + open(O, ">koi8"); + print O chr(0x430); # Unicode CYRILLIC SMALL LETTER A = KOI8-R 0xc1 + close O; + open(I, "), "\n"; # this should print 0xc1 + close I; + + or you can also use the C<':encoding(...)'> discipline + + open(my $epic,'<:encoding(iso-8859-7)','iliad.greek'); + my 
$line_of_iliad = <$epic>; + + Both of these methods install a transparent filter on the I/O stream that + will convert data from the specified encoding when it is read in from the + stream. In the first example the F<anything> file is assumed to be UTF-8 + encoded Unicode, in the second example the F<iliad.greek> file is assumed + to be ISO-8859-7 encoded Greek, but the lines read in will be in both + cases Unicode. + + The L<open> pragma affects all the C<open()> calls after the pragma by + setting default disciplines. If you want to affect only certain + streams, use explicit disciplines directly in the C<open()> call. + + You can switch encodings on an already opened stream by using + C<binmode()>, see L<perlfunc/binmode>. + + The C<:locale> does not currently work with C<open()> and + C<binmode()>, only with the C<open> pragma. The C<:utf8> and + C<:encoding(...)> do work with all of C<open()>, C<binmode()>, + and the C<open> pragma. + + Similarly, you may use these I/O disciplines on output streams to + automatically convert data to the specified encoding when it is + written to the stream. + + open(my $unicode, '<:utf8', 'japanese.uni'); + open(my $nihongo, '>:encoding(iso2022-jp)', 'japanese.jp'); + while (<$unicode>) { print $nihongo } + + The naming of encodings, both by the C<open()> and by the C<open> + pragma, is as forgiving as with the C<encoding> pragma: + C<koi8-r> and C<KOI8R> will both be understood. + + Common encodings recognized by ISO, MIME, IANA, and various other + standardisation organisations are recognised; for a more detailed + list see L<Encode>. + + C<read()> reads characters and returns the number of characters. + C<seek()> and C<tell()> operate on byte counts, as do C<sysread()> + and C<sysseek()>. + + Notice that because of the default behaviour "input is not UTF-8" + it is easy to mistakenly write code that keeps on expanding a file + by repeatedly encoding it in UTF-8: + + # BAD CODE WARNING + open F, "file"; + local $/; # read in the whole file + $t = <F>; + close F; + open F, ">:utf8", "file"; + print F $t; + close F; + + If you run this code twice, the contents of the F<file> will be twice -UTF-8 encoded.
A C would have avoided the bug, or -explicitly opening also the F for input as UTF-8. ++UTF-8 encoded. A C would have avoided the bug. + + =head2 Special Cases + + =over 4 + + =item * + + Bit Complement Operator ~ And vec() + + The bit complement operator C<~> will produce surprising results if + used on strings containing Unicode characters. The results are + consistent with the internal UTF-8 encoding of the characters, but not + with much else. So don't do that. Similarly for vec(): you will be + operating on the UTF-8 bit patterns of the Unicode characters, not on + the bytes, which is very probably not what you want. + + =item * + + Peeking At UTF-8 + + One way of peeking inside the internal encoding of Unicode characters + is to use C to get the bytes, or C + to display the bytes: + + # this will print c4 80 for the UTF-8 bytes 0xc4 0x80 + print join(" ", unpack("H*", pack("U", 0x100))), "\n"; + + Yet another way would be to use the Devel::Peek module: + + perl -MDevel::Peek -e 'Dump(chr(0x100))' + + That will show the UTF8 flag in FLAGS and both the UTF-8 bytes + and Unicode characters in PV. See also later in this document + the discussion about the C function of the C module. + + =back + + =head2 Advanced Topics + + =over 4 + + =item * + + String Equivalence + + The question of string equivalence turns somewhat complicated + in Unicode: what do you mean by equal? + + Is C equal to + C? + + The short answer is that by default Perl compares equivalence + (C, C) based only on code points of the characters. + In the above case, no (because 0x00C1 != 0x0041). But sometimes any + CAPITAL LETTER As being considered equal, or even any As of any case, + would be desirable. 
+ + The long answer is that you need to consider character normalization + and casing issues: see L, and Unicode Technical + Reports #15 and #21, I and I, http://www.unicode.org/unicode/reports/tr15/ + http://www.unicode.org/unicode/reports/tr21/ + + As of Perl 5.8.0, Perl's regular expression case-ignoring matching + implements only 1:1 semantics: one character matches one character. + In I both 1:N and N:1 matches are defined. + + =item * + + String Collation + + People like to see their strings nicely sorted, or as the Unicode + parlance goes, collated. But again, what do you mean by collate? + + Does C come before or after + C? + + The short answer is that by default Perl compares strings (C, + C, C, C, C) based only on the code points of the + characters. In the above case, after, since 0x00C1 > 0x00C0. + + The long answer is that "it depends", and a good answer cannot be + given without knowing (at the very least) the language context. + See L, and I + http://www.unicode.org/unicode/reports/tr10/ + + =back + + =head2 Miscellaneous + + =over 4 + + =item * + + Character Ranges + + Character ranges in regular expression character classes (C) + and in the C (also known as C) operator are not magically + Unicode-aware. What this means is that C<[a-z]> will not magically start + to mean "all alphabetic letters" (not that it does mean that even for + 8-bit characters, you should be using C for that). + + For specifying things like that in regular expressions you can use the + various Unicode properties, C<\pL> in this particular case. You can + use Unicode code points as the end points of character ranges, but + that means that particular code point range, nothing more. For + further information, see L. + + =item * + + String-To-Number Conversions + + Unicode does define several other decimal (and numeric) characters + than just the familiar 0 to 9, such as the Arabic and Indic digits.
+ Perl does not support string-to-number conversion for digits other + than 0 to 9 (and a to f for hexadecimal). + + =back + + =head2 Questions With Answers + + =over 4 + + =item Will My Old Scripts Break? + + Very probably not. Unless you are generating Unicode characters + somehow, any old behaviour should be preserved. About the only + behaviour that has changed and which could start generating Unicode + is the old behaviour of C where supplying an argument more + than 255 produced a character modulo 255 (for example, C + was equal to C). + + =item How Do I Make My Scripts Work With Unicode? + + Very little work should be needed since nothing changes until you + somehow generate Unicode data. The greatest trick will be getting + input as Unicode, and for that see the earlier I/O discussion. + + =item How Do I Know Whether My String Is In Unicode? + + You shouldn't care. No, you really shouldn't. If you have + to care (beyond the cases described above), it means that we + didn't get the transparency of Unicode quite right. + + Okay, if you insist: + + use Encode 'is_utf8'; + print is_utf8($string) ? 1 : 0, "\n"; + + But note that this doesn't mean that any of the characters in the + string are necessarily UTF-8 encoded, or that any of the characters have + code points greater than 0xFF (255) or even 0x80 (128), or that the + string has any characters at all. All the C does is to + return the value of the internal "utf8ness" flag attached to the + $string. If the flag is on, characters added to that string will be + automatically upgraded to UTF-8 (and even then only if they really + need to be upgraded, that is, if their code point is greater than 0xFF). + + Sometimes you might really need to know the byte length of a string + instead of the character length.
For that use the C pragma + and its only defined function C: + + my $unicode = chr(0x100); + print length($unicode), "\n"; # will print 1 + use bytes; + print length($unicode), "\n"; # will print 2 (the 0xC4 0x80 of the UTF-8) + + =item How Do I Detect Invalid UTF-8? + + Either + + use Encode 'encode_utf8'; + if (encode_utf8($string)) { + # valid + } else { + # invalid + } + + or + + use warnings; + @chars = unpack("U0U*", "\xFF"); # will warn + + The warning will be C. The "U0" means "expect strictly UTF-8 encoded Unicode". + Without that the C would accept also data like + C). + + =item How Do I Convert Data Into UTF-8? Or Vice Versa? + + This probably isn't as useful (or simple) as you might think. + Also, normally you shouldn't need to. + + In one sense what you are asking doesn't make much sense: UTF-8 is + (intended as a) Unicode encoding, so converting "data" into UTF-8 + isn't meaningful unless you know in what character set and encoding + the binary data is, and in this case you can use C. + + use Encode 'from_to'; + from_to($data, "iso-8859-1", "utf-8"); # from latin-1 to utf-8 + + If you have ASCII (really 7-bit US-ASCII), you already have valid + UTF-8: the lowest 128 characters of UTF-8 encoded Unicode and US-ASCII + are equivalent. + + If you have Latin-1 (or want Latin-1), you can just use pack/unpack: + + $latin1 = pack("C*", unpack("U*", $utf8)); + $utf8 = pack("U*", unpack("C*", $latin1)); + + (The same works for EBCDIC.) + + If you have a sequence of bytes you B<know> is valid UTF-8, + but Perl doesn't know it yet, you can make Perl a believer, too: + + use Encode 'decode_utf8'; + $utf8 = decode_utf8($bytes); + + You can convert well-formed UTF-8 to a sequence of bytes, but if + you just want to convert random binary data into UTF-8, you can't. + A random collection of bytes is not necessarily well-formed UTF-8. You can + use C for the former, and you can create + well-formed Unicode/UTF-8 data by C. + + =item How Do I Display Unicode? How Do I Input Unicode?
+ + See http://www.hclrss.demon.co.uk/unicode/ and + http://www.cl.cam.ac.uk/~mgk25/unicode.html + + =item How Does Unicode Work With Traditional Locales? + + In Perl, not very well. Avoid using locales through the C + pragma. Use only one or the other. + + =back + + =head2 Hexadecimal Notation + + The Unicode standard prefers using hexadecimal notation because that + shows better the division of Unicode into blocks of 256 characters. + Hexadecimal is also simply shorter than decimal. You can use decimal + notation, too, but learning to use hexadecimal just makes life easier + with the Unicode standard. + + The C<0x> prefix means a hexadecimal number, the digits are 0-9 I + a-f (or A-F, case doesn't matter). Each hexadecimal digit represents + four bits, or half a byte. C will show a + hexadecimal number in decimal, and C will + show a decimal number in hexadecimal. If you have just the + "hexdigits" of a hexadecimal number, you can use the C + function. + + print 0x0009, "\n"; # 9 + print 0x000a, "\n"; # 10 + print 0x000f, "\n"; # 15 + print 0x0010, "\n"; # 16 + print 0x0011, "\n"; # 17 + print 0x0100, "\n"; # 256 + + print 0x0041, "\n"; # 65 + + printf "%x\n", 65; # 41 + printf "%#x\n", 65; # 0x41 + + print hex("41"), "\n"; # 65 + + =head2 Further Resources + + =over 4 + + =item * + + Unicode Consortium + + http://www.unicode.org/ + + =item * + + Unicode FAQ + + http://www.unicode.org/unicode/faq/ + + =item * + + Unicode Glossary + + http://www.unicode.org/glossary/ + + =item * + + Unicode Useful Resources + + http://www.unicode.org/unicode/onlinedat/resources.html + + =item * + + Unicode and Multilingual Support in HTML, Fonts, Web Browsers and Other Applications + + http://www.hclrss.demon.co.uk/unicode/ + + =item * + + UTF-8 and Unicode FAQ for Unix/Linux + + http://www.cl.cam.ac.uk/~mgk25/unicode.html + + =item * + + Legacy Character Sets + + http://www.czyborra.com/ + http://www.eki.ee/letter/ + + =item * + + The Unicode support files live within the Perl 
installation in the + directory + + $Config{installprivlib}/unicore + + in Perl 5.8.0 or newer, and + + $Config{installprivlib}/unicode + + in the Perl 5.6 series. (The renaming to F was done to + avoid naming conflicts with lib/Unicode in case-insensitive filesystems.) + The main Unicode data file is F (or F in + Perl 5.6.1.) You can find the C<$Config{installprivlib}> by + + perl "-V:installprivlib" + + Note that some of the files have been renamed from the Unicode + standard since the Perl installation tries to live by the "8.3" + filenaming restrictions. The renamings are shown in the + accompanying F file. + + You can explore various information from the Unicode data files using + the C module. + + =back + + =head1 SEE ALSO + + L, L, L, L, L, L, + L, L, L, L + + =head1 ACKNOWLEDGEMENTS + + Thanks to the kind readers of the perl5-porters@perl.org, + perl-unicode@perl.org, linux-utf8@nl.linux.org, and unicore@unicode.org + mailing lists for their valuable feedback. + + =head1 AUTHOR, COPYRIGHT, AND LICENSE + + Copyright 2001 Jarkko Hietaniemi + + This document may be distributed under the same terms as Perl itself.