Chip didn't time travel.

[p5sagit/p5-mst-13.2.git] / pod / perlebcdic.pod
diff --git a/pod/perlebcdic.pod b/pod/perlebcdic.pod

index c98b46c..a9f1d0f 100644 (file)
--- a/pod/perlebcdic.pod
+++ b/pod/perlebcdic.pod
@@ -97,6 +97,44 @@ for VM/ESA.  CCSID 1047 differs from CCSID 0037 in eight places.
 The EBCDIC code page in use on Siemens' BS2000 system is distinct from
 1047 and 0037.  It is identified below as the POSIX-BC set.
 
+=head2 Unicode code points versus EBCDIC code points
+
+In Unicode terminology a I<code point> is the number assigned to a
+character: for example, in EBCDIC the character "A" is usually assigned
+the number 193.  In Unicode the character "A" is assigned the number 65.
+This causes a problem with the semantics of the pack/unpack "U" template,
+which is supposed to pack Unicode code points to characters and unpack
+them back to numbers.
+The problem is: which code points to use for code points less than 256?
+(For 256 and over there's no problem: Unicode code points are used.)
+In EBCDIC, for the low 256 the EBCDIC code points are used.  This
+means that the equivalences
+
+       pack("U", ord($character)) eq $character
+       unpack("U", $character) == ord $character
+
+will hold.  (If Unicode code points were applied consistently over
+all the possible code points, pack("U",ord("A")) would in EBCDIC
+equal I<A with acute> or chr(101), and unpack("U", "A") would equal
+65, or I<non-breaking space>, not 193, or ord "A".)
+
+=head2 Remaining Perl Unicode problems in EBCDIC
+
+=over 4
+
+=item *
+
+Many of the remaining problems seem to be related to case-insensitive
+matching: for example, C<< /[\x{131}]/ >> (LATIN SMALL LETTER DOTLESS I)
+does not match "I" case-insensitively, as it should under Unicode.
+(The match succeeds on ASCII-derived platforms.)
+
+=item *
+
+The extensions Unicode::Collate and Unicode::Normalize are not
+supported under EBCDIC, likewise for the encoding pragma.
+
+=back
+
 =head2 Unicode and UTF
 
 UTF is a Unicode Transformation Format.  UTF-8 is a Unicode conforming
@@ -104,6 +142,32 @@ representation of the Unicode standard that looks very much like ASCII.
 UTF-EBCDIC is an attempt to represent Unicode characters in an EBCDIC
 transparent manner.
 
+=head2 Using Encode
+
+Starting from Perl 5.8 you can use the new standard module Encode
+to translate from EBCDIC to Latin-1 code points:
+
+       use Encode 'from_to';
+
+       my %ebcdic = ( 176 => 'cp37', 95 => 'cp1047', 106 => 'posix-bc' );
+
+       # $a is in EBCDIC code points
+       from_to($a, $ebcdic{ord '^'}, 'latin1');
+       # $a is in ISO 8859-1 code points
+
+and from Latin-1 code points to EBCDIC code points:
+
+       use Encode 'from_to';
+
+       my %ebcdic = ( 176 => 'cp37', 95 => 'cp1047', 106 => 'posix-bc' );
+
+       # $a is in ISO 8859-1 code points
+       from_to($a, 'latin1', $ebcdic{ord '^'});
+       # $a is in EBCDIC code points
+
+For doing I/O it is suggested that you use the autotranslating features
+of PerlIO; see L<perluniintro>.
+
 =head1 SINGLE OCTET TABLES
 
 The following tables list the ASCII and Latin 1 ordered sets including
@@ -1054,7 +1118,7 @@ following will print "Yes indeed\n" on either an ASCII or EBCDIC computer:
     $all_byte_chrs = '';
     for (0..255) { $all_byte_chrs .= chr($_); }
     $uuencode_byte_chrs = pack('u', $all_byte_chrs);
-    ($uu = <<'    ENDOFHEREDOC') =~ s/^\s*//gm;
+    ($uu = <<'ENDOFHEREDOC') =~ s/^\s*//gm;
     M``$"`P0%!@<("0H+#`T.#Q`1$A,4%187&!D:&QP='A\@(2(C)"4F)R@I*BLL
     M+2XO,#$R,S0U-C<X.3H[/#T^/T!!0D-$149'2$E*2TQ-3D]045)35%565UA9
     M6EM<75Y?8&%B8V1E9F=H:6IK;&UN;W!Q<G-T=79W>'EZ>WQ]?G^`@8*#A(6&