lib/utf8.pm

   # utf8 - pragma module that switches the UTF-8 (UTF-EBCDIC on EBCDIC
   # platforms) source-text hint on and off.  Only the cheap import/unimport
   # hooks live here; any other utf8:: function that reaches AUTOLOAD is
   # looked up in utf8_heavy.pl.
   package utf8;

   # Bit in $^H that import() sets and unimport() clears.
   $utf8::hint_bits = 0x00800000;

   our $VERSION = '1.00';

   # Package globals used by the subs below, declared so they are visible:
   #   %enc      - optional second argument of import(), keyed by the
   #               package that called import()
   #   $AUTOLOAD - filled in by perl with the fully qualified name of the
   #               sub that was called but not found
   our (%enc, $AUTOLOAD);

   # import(CLASS [, ARG])
   #
   # Invoked for "use utf8".  Sets the utf8 hint bit in $^H.  If a true
   # second argument is supplied it is stored in %enc under the name of the
   # calling package; a false or missing argument leaves %enc untouched.
   sub import {
       $^H |= $utf8::hint_bits;

       # caller in scalar context yields just the calling package's name.
       my $requesting_pkg = caller;
       $enc{$requesting_pkg} = $_[1] if $_[1];
   }

   # unimport(CLASS)
   #
   # Invoked for "no utf8".  Clears the utf8 hint bit in $^H and leaves every
   # other hint bit as it was.
   sub unimport {
       $^H &= ~$utf8::hint_bits;
   }

   # AUTOLOAD
   #
   # Fallback for any utf8:: sub that is not defined at call time.  Loads
   # utf8_heavy.pl (require is a no-op once it has been loaded) and, if the
   # requested sub now exists, jumps to it so that it sees the original @_.
   # Otherwise croaks with "Undefined subroutine NAME called".
   sub AUTOLOAD {
       require "utf8_heavy.pl";
       goto &$AUTOLOAD if defined &$AUTOLOAD;

       # Nothing else in this file loads Carp.  Without this require the
       # croak call itself would fail with "Undefined subroutine
       # &Carp::croak called" and hide the name of the sub that was missing.
       require Carp;
       Carp::croak("Undefined subroutine $AUTOLOAD called");
   }

   1;
  24 __END__
  25
  26 =head1 NAME
  27
  28 utf8 - Perl pragma to enable/disable UTF-8 (or UTF-EBCDIC) in source code
  29
  30 =head1 SYNOPSIS
  31
  32     use utf8;
  33     no utf8;
  34
  35 =head1 DESCRIPTION
  36
  37 WARNING: The implementation of Unicode support in Perl is incomplete.
  38 See L<perlunicode> for the exact details.
  39
  40 The C<use utf8> pragma tells the Perl parser to allow UTF-8 in the
  41 program text in the current lexical scope (allow UTF-EBCDIC on EBCDIC based
  42 platforms).  The C<no utf8> pragma tells Perl to switch back to treating
  43 the source text as literal bytes in the current lexical scope.
  44
  45 This pragma is primarily a compatibility device.  Perl versions
  46 earlier than 5.6 allowed arbitrary bytes in source code, whereas
  47 in future we would like to standardize on the UTF-8 encoding for
  48 source text.  Until UTF-8 becomes the default format for source
  49 text, this pragma should be used to recognize UTF-8 in the source.
  50 When UTF-8 becomes the standard source format, this pragma will
  51 effectively become a no-op.  For convenience in what follows the
  52 term UTF-X is used to refer to UTF-8 on ASCII and ISO Latin based
  53 platforms and UTF-EBCDIC on EBCDIC based platforms.
  54
  55 Enabling the C<utf8> pragma has the following effects:
  56
  57 =over 4
  58
  59 =item *
  60
  61 Bytes in the source text that have their high-bit set will be treated
  62 as being part of a literal UTF-8 character.  This includes most literals
  63 such as identifiers, string constants, constant regular expression patterns
  64 and package names.  On EBCDIC platforms, characters in the C1 control group
  65 and the Latin 1 character set are treated as being part of a literal
  66 UTF-EBCDIC character.
  67
  68 =item *
  69
  70 In the absence of inputs marked as UTF-X, regular expressions within the
  71 scope of this pragma will default to using character semantics instead
  72 of byte semantics.
  73
  74     @bytes_or_chars = split //, $data;  # may split to bytes if
  75                                         # $data isn't UTF-X
  76     {
  77         use utf8;                       # force char semantics
  78         @chars = split //, $data;       # splits characters
  79     }
  80
  81 =back
  82
  83 =head2 Utility functions
  84
  85 The following functions are defined in the C<utf8::> package by the perl core.
  86
  87 =over 4
  88
  89 =item * $num_octets = utf8::upgrade($string);
  90
  91 Converts the internal representation of the string to perl's internal UTF-X form.
  92 Returns the number of octets necessary to represent the string as UTF-X.
  93
  94 =item * utf8::downgrade($string[, CHECK])
  95
  96 Converts the internal representation of the string to un-encoded bytes.
  97
  98 =item * utf8::encode($string)
  99
 100 Converts (in-place) I<$string> from logical characters to the octet sequence
 101 representing it in perl's UTF-X encoding.
 102
 103 =item * $flag = utf8::decode($string)
 104
 105 Attempts to convert I<$string> in-place from perl's UTF-X encoding into logical characters.
 106
 107 =back
 108
 109 =head1 SEE ALSO
 110
 111 L<perlunicode>, L<bytes>
 112
 113 =cut