Update Changes.
[p5sagit/p5-mst-13.2.git] / lib / utf8.pm
CommitLineData
a0ed51b3 1package utf8;
2
663b9db3 3if (ord('A') != 193) { # make things more pragmatic for EBCDIC folk
4
d5448623 5$utf8::hint_bits = 0x00800000; # bit mask that import() ORs into $^H and unimport() masks back out
6
b75c8c73 7our $VERSION = '1.00';
8
a0ed51b3 9sub import { # called for "use utf8": switches the pragma on
d5448623 10 $^H |= $utf8::hint_bits; # set the utf8 bit in the compile-time hints variable
a0ed51b3 11 $enc{caller()} = $_[1] if $_[1]; # a true argument after the class name is stored in %enc, keyed by caller() in scalar context (the calling package); NOTE(review): nothing in this file reads %enc -- confirm its consumer
12}
13
14sub unimport { # called for "no utf8": switches the pragma off
d5448623 15 $^H &= ~$utf8::hint_bits; # clear only the utf8 bit, leaving every other hint bit as it was
a0ed51b3 16}
17
18sub AUTOLOAD { # fallback for any utf8:: sub that is not defined in this file
19 require "utf8_heavy.pl"; # load the rest of the implementation on demand (require is a no-op after the first load)
daf4d4ea 20 goto &$AUTOLOAD if defined &$AUTOLOAD; # if loading defined the requested sub, tail-call it with the original @_
21 require Carp; Carp::croak("Undefined subroutine $AUTOLOAD called"); # this file never loads Carp itself, so require it here rather than rely on utf8_heavy.pl having done so
a0ed51b3 22}
23
663b9db3 24}
25
a0ed51b3 261;
27__END__
28
29=head1 NAME
30
393fec97 31utf8 - Perl pragma to enable/disable UTF-8 in source code
a0ed51b3 32
33=head1 SYNOPSIS
34
35 use utf8;
36 no utf8;
37
38=head1 DESCRIPTION
39
393fec97 40WARNING: The implementation of Unicode support in Perl is incomplete.
21bad921 41See L<perlunicode> for the exact details.
a0ed51b3 42
393fec97 43The C<use utf8> pragma tells the Perl parser to allow UTF-8 in the
44program text in the current lexical scope. The C<no utf8> pragma
45tells Perl to switch back to treating the source text as literal
46bytes in the current lexical scope.
a0ed51b3 47
393fec97 48This pragma is primarily a compatibility device. Perl versions
49earlier than 5.6 allowed arbitrary bytes in source code, whereas
50in future we would like to standardize on the UTF-8 encoding for
51source text. Until UTF-8 becomes the default format for source
52text, this pragma should be used to recognize UTF-8 in the source.
53When UTF-8 becomes the standard source format, this pragma will
663b9db3 54effectively become a no-op. This pragma is already a no-op on
55EBCDIC platforms (where it is all right to code Perl in EBCDIC
56rather than UTF-8).
a0ed51b3 57
393fec97 58Enabling the C<utf8> pragma has the following effects:
a0ed51b3 59
393fec97 60=over
a0ed51b3 61
62=item *
63
393fec97 64Bytes in the source text that have their high-bit set will be treated
65as being part of a literal UTF-8 character. This includes most literals
66such as identifiers, string constants, constant regular expression patterns
67and package names.
a0ed51b3 68
69=item *
70
393fec97 71In the absence of inputs marked as UTF-8, regular expressions within the
72scope of this pragma will default to using character semantics instead
73of byte semantics.
a0ed51b3 74
393fec97 75 @bytes_or_chars = split //, $data; # may split to bytes if
76 # $data isn't UTF-8
77 {
78 use utf8; # force char semantics
79 @chars = split //, $data; # splits characters
a0ed51b3 80 }
81
393fec97 82=head1 SEE ALSO
a0ed51b3 83
8058d7ab 84L<perlunicode>, L<bytes>
a0ed51b3 85
86=cut