UTF-X encoding invariance for Encode:
[p5sagit/p5-mst-13.2.git] / lib / utf8.pm
CommitLineData
# utf8 pragma module: import/unimport toggle a hint bit in $^H, and any other
# utf8:: function is resolved lazily through AUTOLOAD.
a0ed51b3 1package utf8;
2
# The whole implementation is skipped when 'A' has code point 193, i.e. on
# EBCDIC platforms, where the POD below describes this pragma as a no-op.
663b9db3 3if (ord('A') != 193) { # make things more pragmatic for EBCDIC folk
4
# Bit mask that import() sets in, and unimport() clears from, $^H.
d5448623 5$utf8::hint_bits = 0x00800000;
6
b75c8c73 7our $VERSION = '1.00';
8
# Called by "use utf8": turns on the utf8 hint bit in the compile-time hints
# variable $^H. If a true argument follows the package name, it is stored in
# %enc under the calling package's name (caller() in scalar context).
# NOTE(review): %enc is neither declared nor read in this view -- presumably a
# per-package encoding record; confirm against utf8_heavy.pl.
a0ed51b3 9sub import {
d5448623 10 $^H |= $utf8::hint_bits;
a0ed51b3 11 $enc{caller()} = $_[1] if $_[1];
12}
13
# Called by "no utf8": clears the utf8 hint bit from $^H again.
14sub unimport {
d5448623 15 $^H &= ~$utf8::hint_bits;
a0ed51b3 16}
17
# Fallback for any utf8:: sub not defined in this file: load utf8_heavy.pl,
# then jump to the requested sub (named in $AUTOLOAD) if loading defined it;
# otherwise croak with an "Undefined subroutine" message.
# NOTE(review): Carp is not loaded anywhere in this view. Unless utf8_heavy.pl
# or the caller has already loaded it, the Carp::croak call would itself fail;
# consider adding "require Carp;" before it.
18sub AUTOLOAD {
19 require "utf8_heavy.pl";
# goto &sub replaces this AUTOLOAD frame, passing the current @_ along.
daf4d4ea 20 goto &$AUTOLOAD if defined &$AUTOLOAD;
21 Carp::croak("Undefined subroutine $AUTOLOAD called");
a0ed51b3 22}
23
663b9db3 24}
25
# A module file must end with a true value.
a0ed51b3 261;
27__END__
28
29=head1 NAME
30
393fec97 31utf8 - Perl pragma to enable/disable UTF-8 in source code
a0ed51b3 32
33=head1 SYNOPSIS
34
35 use utf8;
36 no utf8;
37
38=head1 DESCRIPTION
39
393fec97 40WARNING: The implementation of Unicode support in Perl is incomplete.
21bad921 41See L<perlunicode> for the exact details.
a0ed51b3 42
393fec97 43The C<use utf8> pragma tells the Perl parser to allow UTF-8 in the
44program text in the current lexical scope. The C<no utf8> pragma
45tells Perl to switch back to treating the source text as literal
46bytes in the current lexical scope.
a0ed51b3 47
393fec97 48This pragma is primarily a compatibility device. Perl versions
49earlier than 5.6 allowed arbitrary bytes in source code, whereas
50in future we would like to standardize on the UTF-8 encoding for
51source text. Until UTF-8 becomes the default format for source
52text, this pragma should be used to recognize UTF-8 in the source.
53When UTF-8 becomes the standard source format, this pragma will
663b9db3 54effectively become a no-op. This pragma is already a no-op on
1b026014 55EBCDIC platforms (where it is all right to code perl in EBCDIC
663b9db3 56rather than UTF-8).
a0ed51b3 57
393fec97 58Enabling the C<utf8> pragma has the following effects:
a0ed51b3 59
4ac9195f 60=over 4
a0ed51b3 61
62=item *
63
393fec97 64Bytes in the source text that have their high-bit set will be treated
65as being part of a literal UTF-8 character. This includes most literals
66such as identifiers, string constants, constant regular expression patterns
67and package names.
a0ed51b3 68
69=item *
70
393fec97 71In the absence of inputs marked as UTF-8, regular expressions within the
72scope of this pragma will default to using character semantics instead
73of byte semantics.
a0ed51b3 74
393fec97 75 @bytes_or_chars = split //, $data; # may split to bytes if
76 # $data isn't UTF-8
77 {
78 use utf8; # force char semantics
79 @chars = split //, $data; # splits characters
a0ed51b3 80 }
81
4ac9195f 82=back
83
1b026014 84=head2 Utility functions
85
86The following functions are defined in the C<utf8::> package by the perl core.
87
88=over 4
89
90=item * $num_octets = utf8::upgrade($string);
91
92Converts the internal representation of the string to perl's internal UTF-X form.
93Returns the number of octets necessary to represent the string as UTF-X.
94
95=item * utf8::downgrade($string[, CHECK])
96
97Converts the internal representation of the string to un-encoded bytes.
98
99=item * utf8::encode($string)
100
101Converts (in-place) I<$string> from logical characters to the octet sequence
102representing it in perl's UTF-X encoding.
103
104=item * $flag = utf8::decode($string)
105
106Attempts to convert I<$string> in-place from perl's UTF-X encoding into logical characters.
107
108=back
109
393fec97 110=head1 SEE ALSO
a0ed51b3 111
8058d7ab 112L<perlunicode>, L<bytes>
a0ed51b3 113
114=cut