ext/Encode/Encode.pm

   1 package Encode;
   2
   3 $VERSION = 0.01;
   4
   5 require DynaLoader;
   6 require Exporter;
   7
   8 @ISA = qw(Exporter DynaLoader);
   9
  10 # Public, encouraged API is exported by default
  11 @EXPORT = qw (
  12   encode
  13   decode
  14   encode_utf8
  15   decode_utf8
  16   find_encoding
  17 );
  18
  19 @EXPORT_OK =
  20     qw(
  21        encodings
  22        from_to
  23        is_utf8
  24        is_8bit
  25        is_16bit
  26        utf8_upgrade
  27        utf8_downgrade
  28        _utf8_on
  29        _utf8_off
  30       );
  31
  32 bootstrap Encode ();
  33
  34 # Documentation moved after __END__ for speed - NI-S
  35
  36 use Carp;
  37
  38 # The global hash is declared in XS code
  39 $encoding{Unicode}      = bless({},'Encode::Unicode');
  40 $encoding{utf8}         = bless({},'Encode::utf8');
  41 $encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
  42
  43 sub encodings
  44 {
  45  my ($class) = @_;
  46  foreach my $dir (@INC)
  47   {
  48    if (opendir(my $dh,"$dir/Encode"))
  49     {
  50      while (defined(my $name = readdir($dh)))
  51       {
  52        if ($name =~ /^(.*)\.enc$/)
  53         {
  54          next if exists $encoding{$1};
  55          $encoding{$1} = "$dir/$name";
  56         }
  57       }
  58      closedir($dh);
  59     }
  60   }
  61  return keys %encoding;
  62 }
  63
  64 sub loadEncoding
  65 {
  66  my ($class,$name,$file) = @_;
  67  if (open(my $fh,$file))
  68   {
  69    my $type;
  70    while (1)
  71     {
  72      my $line = <$fh>;
  73      $type = substr($line,0,1);
  74      last unless $type eq '#';
  75     }
  76    $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
  77    #warn "Loading $file";
  78    return $class->read($fh,$name,$type);
  79   }
  80  else
  81   {
  82    return undef;
  83   }
  84 }
  85
  86 sub getEncoding
  87 {
  88  my ($class,$name) = @_;
  89  my $enc;
  90  unless (ref($enc = $encoding{$name}))
  91   {
  92    $enc = $class->loadEncoding($name,$enc) if defined $enc;
  93    unless (ref($enc))
  94     {
  95      foreach my $dir (@INC)
  96       {
  97        last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
  98       }
  99     }
 100    $encoding{$name} = $enc;
 101   }
 102  return $enc;
 103 }
 104
 105 sub find_encoding
 106 {
 107  my ($name) = @_;
 108  return __PACKAGE__->getEncoding($name);
 109 }
 110
 111 sub encode
 112 {
 113  my ($name,$string,$check) = @_;
 114  my $enc = find_encoding($name);
 115  croak("Unknown encoding '$name'") unless defined $enc;
 116  my $octets = $enc->encode($string,$check);
 117  return undef if ($check && length($string));
 118  return $octets;
 119 }
 120
 121 sub decode
 122 {
 123  my ($name,$octets,$check) = @_;
 124  my $enc = find_encoding($name);
 125  croak("Unknown encoding '$name'") unless defined $enc;
 126  my $string = $enc->decode($octets,$check);
 127  return undef if ($check && length($octets));
 128  return $string;
 129 }
 130
 131 sub from_to
 132 {
 133  my ($string,$from,$to,$check) = @_;
 134  my $f = find_encoding($from);
 135  croak("Unknown encoding '$from'") unless defined $f;
 136  my $t = find_encoding($to);
 137  croak("Unknown encoding '$to'") unless defined $t;
 138  my $uni = $f->decode($string,$check);
 139  return undef if ($check && length($string));
 140  $string = $t->encode($uni,$check);
 141  return undef if ($check && length($uni));
 142  return length($_[0] = $string);
 143 }
 144
 145 sub encode_utf8
 146 {
 147  my ($str) = @_;
 148  utf8_encode($str);
 149  return $str;
 150 }
 151
 152 sub decode_utf8
 153 {
 154  my ($str) = @_;
 155  return undef unless utf8_decode($str);
 156  return $str;
 157 }
 158
 159 package Encode::Encoding;
 160 # Base class for classes which implement encodings
 161 # Temporary legacy methods
 162 sub toUnicode   { shift->decode(@_) }
 163 sub fromUnicode { shift->encode(@_) }
 164
 165 package Encode::XS;
 166 use base 'Encode::Encoding';
 167
 168 package Encode::Unicode;
 169 use base 'Encode::Encoding';
 170
 171 # Dummy package that provides the encode interface but leaves data
 172 # as UTF-8 encoded. It is here so that from_to() works.
 173
 174 sub name { 'Unicode' }
 175
 176 sub decode
 177 {
 178  my ($obj,$str,$chk) = @_;
 179  Encode::utf8_upgrade($str);
 180  $_[1] = '' if $chk;
 181  return $str;
 182 }
 183
 184 *encode = \&decode;
 185
 186 package Encode::utf8;
 187 use base 'Encode::Encoding';
 188
 189 # package to allow long-hand
 190 #   $octets = encode( utf8 => $string );
 191 #
 192
 193 sub name { 'utf8' }
 194
 195 sub decode
 196 {
 197  my ($obj,$octets,$chk) = @_;
 198  my $str = Encode::decode_utf8($octets);
 199  if (defined $str)
 200   {
 201    $_[1] = '' if $chk;
 202    return $str;
 203   }
 204  return undef;
 205 }
 206
 207 sub encode
 208 {
 209  my ($obj,$string,$chk) = @_;
 210  my $octets = Encode::encode_utf8($string);
 211  $_[1] = '' if $chk;
 212  return $octets;
 213 }
 214
 215 package Encode::Table;
 216 use base 'Encode::Encoding';
 217
 218 sub read
 219 {
 220  my ($class,$fh,$name,$type) = @_;
 221  my $rep = $class->can("rep_$type");
 222  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 223  my @touni;
 224  my %fmuni;
 225  my $count = 0;
 226  $def = hex($def);
 227  while ($pages--)
 228   {
 229    my $line = <$fh>;
 230    chomp($line);
 231    my $page = hex($line);
 232    my @page;
 233    my $ch = $page * 256;
 234    for (my $i = 0; $i < 16; $i++)
 235     {
 236      my $line = <$fh>;
 237      for (my $j = 0; $j < 16; $j++)
 238       {
 239        my $val = hex(substr($line,0,4,''));
 240        if ($val || !$ch)
 241         {
 242          my $uch = chr($val);
 243          push(@page,$uch);
 244          $fmuni{$uch} = $ch;
 245          $count++;
 246         }
 247        else
 248         {
 249          push(@page,undef);
 250         }
 251        $ch++;
 252       }
 253     }
 254    $touni[$page] = \@page;
 255   }
 256
 257  return bless {Name  => $name,
 258                Rep   => $rep,
 259                ToUni => \@touni,
 260                FmUni => \%fmuni,
 261                Def   => $def,
 262                Num   => $count,
 263               },$class;
 264 }
 265
 266 sub name { shift->{'Name'} }
 267
 268 sub rep_S { 'C' }
 269
 270 sub rep_D { 'n' }
 271
 272 sub rep_M { ($_[0] > 255) ? 'n' : 'C' }
 273
 274 sub representation
 275 {
 276  my ($obj,$ch) = @_;
 277  $ch = 0 unless @_ > 1;
 278  $obj-{'Rep'}->($ch);
 279 }
 280
 281 sub decode
 282 {
 283  my ($obj,$str,$chk) = @_;
 284  my $rep   = $obj->{'Rep'};
 285  my $touni = $obj->{'ToUni'};
 286  my $uni   = '';
 287  while (length($str))
 288   {
 289    my $ch = ord(substr($str,0,1,''));
 290    my $x;
 291    if (&$rep($ch) eq 'C')
 292     {
 293      $x = $touni->[0][$ch];
 294     }
 295    else
 296     {
 297      $x = $touni->[$ch][ord(substr($str,0,1,''))];
 298     }
 299    unless (defined $x)
 300     {
 301      last if $chk;
 302      # What do we do here ?
 303      $x = '';
 304     }
 305    $uni .= $x;
 306   }
 307  $_[1] = $str if $chk;
 308  return $uni;
 309 }
 310
 311 sub encode
 312 {
 313  my ($obj,$uni,$chk) = @_;
 314  my $fmuni = $obj->{'FmUni'};
 315  my $str   = '';
 316  my $def   = $obj->{'Def'};
 317  my $rep   = $obj->{'Rep'};
 318  while (length($uni))
 319   {
 320    my $ch = substr($uni,0,1,'');
 321    my $x  = $fmuni->{chr(ord($ch))};
 322    unless (defined $x)
 323     {
 324      last if ($chk);
 325      $x = $def;
 326     }
 327    $str .= pack(&$rep($x),$x);
 328   }
 329  $_[1] = $uni if $chk;
 330  return $str;
 331 }
 332
 333 package Encode::iso10646_1;
 334 use base 'Encode::Encoding';
 335
 336 # Encoding is 16-bit network order Unicode
 337 # Used for X font encodings
 338
 339 sub name { 'iso10646-1' }
 340
 341 sub decode
 342 {
 343  my ($obj,$str,$chk) = @_;
 344  my $uni   = '';
 345  while (length($str))
 346   {
 347    my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
 348    $uni .= chr($code);
 349   }
 350  $_[1] = $str if $chk;
 351  Encode::utf8_upgrade($uni);
 352  return $uni;
 353 }
 354
 355 sub encode
 356 {
 357  my ($obj,$uni,$chk) = @_;
 358  my $str   = '';
 359  while (length($uni))
 360   {
 361    my $ch = substr($uni,0,1,'');
 362    my $x  = ord($ch);
 363    unless ($x < 32768)
 364     {
 365      last if ($chk);
 366      $x = 0;
 367     }
 368    $str .= pack('n',$x);
 369   }
 370  $_[1] = $uni if $chk;
 371  return $str;
 372 }
 373
 374
 375 package Encode::Escape;
 376 use base 'Encode::Encoding';
 377
 378 use Carp;
 379
 380 sub read
 381 {
 382  my ($class,$fh,$name) = @_;
 383  my %self = (Name => $name, Num => 0);
 384  while (<$fh>)
 385   {
 386    my ($key,$val) = /^(\S+)\s+(.*)$/;
 387    $val =~ s/^\{(.*?)\}/$1/g;
 388    $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
 389    $self{$key} = $val;
 390   }
 391  return bless \%self,$class;
 392 }
 393
 394 sub name { shift->{'Name'} }
 395
 396 sub decode
 397 {
 398  croak("Not implemented yet");
 399 }
 400
 401 sub encode
 402 {
 403  croak("Not implemented yet");
 404 }
 405
 406 # switch back to Encode package in case we ever add AutoLoader
 407 package Encode;
 408
 409 1;
 410
 411 __END__
 412
 413 =head1 NAME
 414
 415 Encode - character encodings
 416
 417 =head1 SYNOPSIS
 418
 419     use Encode;
 420
 421 =head1 DESCRIPTION
 422
 423 The C<Encode> module provides the interfaces between perl's strings
 424 and the rest of the system. Perl strings are sequences of B<characters>.
 425
 426 The repertoire of characters that Perl can represent is at least that
 427 defined by the Unicode Consortium. On most platforms the ordinal values
 428 of the  characters (as returned by C<ord(ch)>) is the "Unicode codepoint" for
 429 the character (the exceptions are those platforms where the legacy
 430 encoding is some variant of EBCDIC rather than a super-set of ASCII
 431 - see L<perlebcdic>).
 432
 433 Traditionaly computer data has been moved around in 8-bit chunks
 434 often called "bytes". These chunks are also known as "octets" in
 435 networking standards. Perl is widely used to manipulate data of
 436 many types - not only strings of characters representing human or
 437 computer languages but also "binary" data being the machines representation
 438 of numbers, pixels in an image - or just about anything.
 439
 440 When perl is processing "binary data" the programmer wants perl to process
 441 "sequences of bytes". This is not a problem for perl - as a byte has 256
 442 possible values it easily fits in perl's much larger "logical character".
 443
 444 =head2 TERMINOLOGY
 445
 446 =over
 447
 448 =item *
 449
 450 I<character>: a character in the range 0..(2**32-1) (or more).
 451 (What perl's strings are made of.)
 452
 453 =item *
 454
 455 I<byte>: a character in the range 0..255
 456 (A special case of a perl character.)
 457
 458 =item *
 459
 460 I<octet>: 8 bits of data, with ordinal values 0..255
 461 (Term for bytes passed to or from a non-perl context, e.g. disk file.)
 462
 463 =back
 464
 465 The marker [INTERNAL] marks Internal Implementation Details, in
 466 general meant only for those who think they know what they are doing,
 467 and such details may change in future releases.
 468
 469 =head1 ENCODINGS
 470
 471 =head2 Characteristics of an Encoding
 472
 473 An encoding has a "repertoire" of characters that it can represent,
 474 and for each representable character there is at least one sequence of
 475 octets that represents it.
 476
 477 =head2 Types of Encodings
 478
 479 Encodings can be divided into the following types:
 480
 481 =over 4
 482
 483 =item * Fixed length 8-bit (or less) encodings.
 484
 485 Each character is a single octet so may have a repertoire of up to
 486 256 characters. ASCII and iso-8859-* are typical examples.
 487
 488 =item * Fixed length 16-bit encodings
 489
 490 Each character is two octets so may have a repertoire of up to
 491 65,536 characters. Unicode's UCS-2 is an example. Also used for
 492 encodings for East Asian languages.
 493
 494 =item * Fixed length 32-bit encodings.
 495
 496 Not really very "encoded" encodings. The Unicode code points
 497 are just represented as 4-octet integers. None the less because
 498 different architectures use different representations of integers
 499 (so called "endian") there at least two disctinct encodings.
 500
 501 =item * Multi-byte encodings
 502
 503 The number of octets needed to represent a character varies.
 504 UTF-8 is a particularly complex but regular case of a multi-byte
 505 encoding. Several East Asian countries use a multi-byte encoding
 506 where 1-octet is used to cover western roman characters and Asian
 507 characters get 2-octets.
 508 (UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
 509 to represent a Unicode code point.)
 510
 511 =item * "Escape" encodings.
 512
 513 These encodings embed "escape sequences" into the octet sequence
 514 which describe how the following octets are to be interpreted.
 515 The iso-2022-* family is typical. Following the escape sequence
 516 octets are encoded by an "embedded" encoding (which will be one
 517 of the above types) until another escape sequence switches to
 518 a different "embedded" encoding.
 519
 520 These schemes are very flexible and can handle mixed languages but are
 521 very complex to process (and have state).
 522 No escape encodings are implemented for perl yet.
 523
 524 =back
 525
 526 =head2 Specifying Encodings
 527
 528 Encodings can be specified to the API described below in two ways:
 529
 530 =over 4
 531
 532 =item 1. By name
 533
 534 Encoding names are strings with characters taken from a restricted repertoire.
 535 See L</"Encoding Names">.
 536
 537 =item 2. As an object
 538
 539 Encoding objects are returned by C<find_encoding($name)>.
 540
 541 =back
 542
 543 =head2 Encoding Names
 544
 545 Encoding names are case insensitive. White space in names is ignored.
 546 In addition an encoding may have aliases. Each encoding has one "canonical" name.
 547 The "canonical" name is chosen from the names of the encoding by picking
 548 the first in the following sequence:
 549
 550 =over 4
 551
 552 =item * The MIME name as defined in IETF RFC-XXXX.
 553
 554 =item * The name in the IANA registry.
 555
 556 =item * The name used by the the organization that defined it.
 557
 558 =back
 559
 560 Because of all the alias issues, and because in the general case
 561 encodings have state C<Encode> uses the encoding object internally
 562 once an operation is in progress.
 563
 564 I<Aliasing is not yet implemented.>
 565
 566 =head1 PERL ENCODING API
 567
 568 =head2 Generic Encoding Interface
 569
 570 =over 4
 571
 572 =item *
 573
 574         $bytes  = encode(ENCODING, $string[, CHECK])
 575
 576 Encodes string from perl's internal form into I<ENCODING> and returns a
 577 sequence of octets.
 578 See L</"Handling Malformed Data">.
 579
 580 =item *
 581
 582         $string = decode(ENCODING, $bytes[, CHECK])
 583
 584 Decode sequence of octets assumed to be in I<ENCODING> into perls internal
 585 form and returns the resuting string.
 586 See L</"Handling Malformed Data">.
 587
 588 =back
 589
 590 =head2 Handling Malformed Data
 591
 592 If CHECK is not set, C<undef> is returned.  If the data is supposed to
 593 be UTF-8, an optional lexical warning (category utf8) is given.
 594 If CHECK is true but not a code reference, dies.
 595
 596 It would desirable to have a way to indicate that transform should use the
 597 encodings "replacement character" - no such mechanism is defined yet.
 598
 599 It is also planned to allow I<CHECK> to be a code reference.
 600
 601 This is not yet implemented as there are design issues with what its arguments
 602 should be and how it returns its results.
 603
 604 =over 4
 605
 606 =item Scheme 1
 607
 608 Passed remaining fragment of string being processed.
 609 Modifies it in place to remove bytes/characters it can understand
 610 and returns a string used to represent them.
 611 e.g.
 612
 613  sub fixup {
 614    my $ch = substr($_[0],0,1,'');
 615    return sprintf("\x{%02X}",ord($ch);
 616  }
 617
 618 This scheme is close to how underlying C code for Encode works, but gives
 619 the fixup routine very little context.
 620
 621 =item Scheme 2
 622
 623 Passed original string, and an index into it of the problem area,
 624 and output string so far.
 625 Appends what it will to output string and returns new index into
 626 original string.
 627 e.g.
 628
 629  sub fixup {
 630    # my ($s,$i,$d) = @_;
 631    my $ch = substr($_[0],$_[1],1);
 632    $_[2] .= sprintf("\x{%02X}",ord($ch);
 633    return $_[1]+1;
 634  }
 635
 636 This scheme gives maximal control to the fixup routine but is more complicated
 637 to code, and may need internals of Encode to be tweaked to keep original
 638 string intact.
 639
 640 =item Other Schemes
 641
 642 Hybrids of above.
 643
 644 Multiple return values rather than in-place modifications.
 645
 646 Index into the string could be pos($str) allowing s/\G...//.
 647
 648 =back
 649
 650 =head2 UTF-8 / utf8
 651
 652 The Unicode consortium defines the UTF-8 standard as a way of encoding
 653 the entire Unicode repertiore as sequences of octets. This encoding
 654 is expected to become very widespread. Perl can use this form internaly
 655 to represent strings, so conversions to and from this form are particularly
 656 efficient (as octets in memory do not have to change, just the meta-data
 657 that tells perl how to treat them).
 658
 659 =over 4
 660
 661 =item *
 662
 663         $bytes = encode_utf8($string);
 664
 665 The characters that comprise string are encoded in perl's superset of UTF-8
 666 and the resulting octets returned as a sequence of bytes. All possible
 667 characters have a UTF-8 representation so this function cannot fail.
 668
 669 =item *
 670
 671         $string = decode_utf8($bytes [,CHECK]);
 672
 673 The sequence of octets represented by $bytes is decoded from UTF-8 into
 674 a sequence of logical characters. Not all sequences of octets form valid
 675 UTF-8 encodings, so it is possible for this call to fail.
 676 See L</"Handling Malformed Data">.
 677
 678 =back
 679
 680 =head2 Other Encodings of Unicode
 681
 682 UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks.
 683 UCS-2 can only represent 0..0xFFFF, while UTF-16 has a "surogate pair"
 684 scheme which allows it to cover the whole Unicode range.
 685
 686 Encode implements big-endian UCS-2 as the encoding "iso10646-1" as that
 687 happens to be the name used by that representation when used with X11 fonts.
 688
 689 UTF-32 or UCS-4 is 32-bit or 4-byte chunks.  Perl's logical characters
 690 can be considered as being in this form without encoding. An encoding
 691 to transfer strings in this form (e.g. to write them to a file) would need to
 692
 693      pack('L',map(chr($_),split(//,$string)));   # native
 694   or
 695      pack('V',map(chr($_),split(//,$string)));   # little-endian
 696   or
 697      pack('N',map(chr($_),split(//,$string)));   # big-endian
 698
 699 depending on the endian required.
 700
 701 No UTF-32 encodings are not yet implemented.
 702
 703 Both UCS-2 and UCS-4 style encodings can have "byte order marks" by representing
 704 the code point 0xFFFE as the very first thing in a file.
 705
 706 =head1 Encoding and IO
 707
 708 It is very common to want to do encoding transformations when
 709 reading or writing files, network connections, pipes etc.
 710 If perl is configured to use the new 'perlio' IO system then
 711 C<Encode> provides a "layer" (See L<perliol>) which can transform
 712 data as it is read or written.
 713
 714      open(my $ilyad,'>:encoding(iso8859-7)','ilyad.greek');
 715      print $ilyad @epic;
 716
 717 In addition the new IO system can also be configured to read/write
 718 UTF-8 encoded characters (as noted above this is efficient):
 719
 720      open(my $fh,'>:utf8','anything');
 721      print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
 722
 723 Either of the above forms of "layer" specifications can be made the default
 724 for a lexical scope with the C<use open ...> pragma. See L<open>.
 725
 726 Once a handle is open is layers can be altered using C<binmode>.
 727
 728 Without any such configuration, or if perl itself is built using
 729 system's own IO, then write operations assume that file handle accepts
 730 only I<bytes> and will C<die> if a character larger than 255 is
 731 written to the handle. When reading, each octet from the handle
 732 becomes a byte-in-a-character. Note that this default is the same
 733 behaviour as bytes-only languages (including perl before v5.6) would have,
 734 and is sufficient to handle native 8-bit encodings e.g. iso-8859-1,
 735 EBCDIC etc. and any legacy mechanisms for handling other encodings
 736 and binary data.
 737
 738 In other cases it is the programs responsibility
 739 to transform characters into bytes using the API above before
 740 doing writes, and to transform the bytes read from a handle into characters
 741 before doing "character operations" (e.g. C<lc>, C</\W+/>, ...).
 742
 743 =head1 Encoding How to ...
 744
 745 To do:
 746
 747 =over 4
 748
 749 =item * IO with mixed content (faking iso-2020-*)
 750
 751 =item * MIME's Content-Length:
 752
 753 =item * UTF-8 strings in binary data.
 754
 755 =item * perl/Encode wrappers on non-Unicode XS modules.
 756
 757 =back
 758
 759 =head1 Messing with Perl's Internals
 760
 761 The following API uses parts of perl's internals in the current implementation.
 762 As such they are efficient, but may change.
 763
 764 =over 4
 765
 766 =item *
 767
 768         $num_octets = utf8_upgrade($string);
 769
 770 Converts internal representation of string to the UTF-8 form.
 771 Returns the number of octets necessary to represent the string as UTF-8.
 772
 773 =item * utf8_downgrade($string[, CHECK])
 774
 775 Converts internal representation of string to be un-encoded bytes.
 776
 777 =item * is_utf8(STRING [, CHECK])
 778
 779 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 780 If CHECK is true, also checks the data in STRING for being
 781 well-formed UTF-8.  Returns true if successful, false otherwise.
 782
 783 =item * valid_utf8(STRING)
 784
 785 [INTERNAL] Test whether STRING is in a consistent state.
 786 Will return true if string is held as bytes, or is well-formed UTF-8
 787 and has the UTF-8 flag on.
 788 Main reason for this routine is to allow perl's testsuite to check
 789 that operations have left strings in a consistent state.
 790
 791 =item *
 792
 793         _utf8_on(STRING)
 794
 795 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 796 B<not> checked for being well-formed UTF-8.  Do not use unless you
 797 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 798 state of the UTF-8 flag (so please don't test the return value as
 799 I<not> success or failure), or C<undef> if STRING is not a string.
 800
 801 =item *
 802
 803         _utf8_off(STRING)
 804
 805 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 806 Returns the previous state of the UTF-8 flag (so please don't test the
 807 return value as I<not> success or failure), or C<undef> if STRING is
 808 not a string.
 809
 810 =back
 811
 812 =head1 SEE ALSO
 813
 814 L<perlunicode>, L<perlebcdic>, L<perlfunc/open>
 815
 816 =cut
 817
 818
 819