lib/encoding.t

   1 BEGIN {
   2     require Config; import Config;
   3     if ($Config{'extensions'} !~ /\bEncode\b/) {
   4       print "1..0 # Skip: Encode was not built\n";
   5       exit 0;
   6     }
   7     if (ord("A") == 193) {
   8         print "1..0 # encoding pragma does not support EBCDIC platforms\n";
   9         exit(0);
  10     }
  11 }
  12
  13 print "1..29\n";
  14
  15 use encoding "latin1"; # ignored (overwritten by the next line)
  16 use encoding "greek";  # iso 8859-7 (no "latin" alias, surprise...)
  17
  18 # "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is
  19 # \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS),
  20 # instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S)
  21
  22 $a = "\xDF";
  23 $b = "\x{100}";
  24
  25 print "not " unless ord($a) == 0x3af;
  26 print "ok 1\n";
  27
  28 print "not " unless ord($b) == 0x100;
  29 print "ok 2\n";
  30
  31 my $c;
  32
  33 $c = $a . $b;
  34
  35 print "not " unless ord($c) == 0x3af;
  36 print "ok 3\n";
  37
  38 print "not " unless length($c) == 2;
  39 print "ok 4\n";
  40
  41 print "not " unless ord(substr($c, 1, 1)) == 0x100;
  42 print "ok 5\n";
  43
  44 print "not " unless ord(chr(0xdf)) == 0x3af; # spooky
  45 print "ok 6\n";
  46
  47 print "not " unless ord(pack("C", 0xdf)) == 0x3af;
  48 print "ok 7\n";
  49
  50 # we didn't break pack/unpack, I hope
  51
  52 print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf;
  53 print "ok 8\n";
  54
  55 # the first octet of UTF-8 encoded 0x3af
  56 print "not " unless unpack("C", chr(0xdf)) == 0xce;
  57 print "ok 9\n";
  58
  59 print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf;
  60 print "ok 10\n";
  61
  62 print "not " unless unpack("U", chr(0xdf)) == 0x3af;
  63 print "ok 11\n";
  64
  65 # charnames must still work
  66 use charnames ':full';
  67 print "not " unless ord("\N{LATIN SMALL LETTER SHARP S}") == 0xdf;
  68 print "ok 12\n";
  69
  70 # combine
  71
  72 $c = "\xDF\N{LATIN SMALL LETTER SHARP S}" . chr(0xdf);
  73
  74 print "not " unless ord($c) == 0x3af;
  75 print "ok 13\n";
  76
  77 print "not " unless ord(substr($c, 1, 1)) == 0xdf;
  78 print "ok 14\n";
  79
  80 print "not " unless ord(substr($c, 2, 1)) == 0x3af;
  81 print "ok 15\n";
  82
  83 # regex literals
  84
  85 print "not " unless "\xDF"    =~ /\x{3AF}/;
  86 print "ok 16\n";
  87
  88 print "not " unless "\x{3AF}" =~ /\xDF/;
  89 print "ok 17\n";
  90
  91 print "not " unless "\xDF"    =~ /\xDF/;
  92 print "ok 18\n";
  93
  94 print "not " unless "\x{3AF}" =~ /\x{3AF}/;
  95 print "ok 19\n";
  96
  97 # eq, cmp
  98
  99 my ($byte,$bytes,$U,$Ub,$g1,$g2,$l) = (
 100     pack("C*", 0xDF ),       # byte
 101     pack("C*", 0xDF, 0x20),  # ($bytes2 cmp $U) > 0
 102     pack("U*", 0x3AF),       # $U eq $byte
 103     pack("U*", 0xDF ),       # $Ub would eq $bytev w/o use encoding
 104     pack("U*", 0x3B1),       # ($g1 cmp $byte) > 0; === chr(0xe1)
 105     pack("U*", 0x3AF, 0x20), # ($g2 cmp $byte) > 0;
 106     pack("U*", 0x3AB),       # ($l  cmp $byte) < 0; === chr(0xdb)
 107 );
 108
 109 # all the tests in this section that compare a byte encoded string
 110 # ato UTF-8 encoded are run in all possible vairants
 111 # all of the eq, ne, cmp operations tested,
 112 # $v z $u tested as well as $u z $v
 113
 114 sub alleq($$){
 115     my ($a,$b)    =    (shift, shift);
 116      $a  eq  $b        &&     $b  eq  $a         &&
 117   !( $a  ne  $b )      &&  !( $b  ne  $a )       &&
 118    ( $a  cmp $b ) == 0 &&   ( $b  cmp $a ) == 0;
 119 }
 120
 121 sub anyeq($$){
 122     my ($a,$b)    =    (shift, shift);
 123      $a  eq  $b        ||     $b  eq  $a         ||
 124   !( $a  ne  $b )      ||  !( $b  ne  $a )       ||
 125    ( $a  cmp $b ) == 0 ||   ( $b  cmp $a ) == 0;
 126 }
 127
 128 sub allgt($$){
 129     my ($a,$b)    =    (shift, shift);
 130     ( $a cmp $b ) == 1 && ( $b cmp $a ) == -1;
 131 }
 132 #match the correct UTF-8 string
 133 print "not " unless  alleq($byte, $U);
 134 print "ok 20\n";
 135
 136 #do not match a wrong UTF-8 string
 137 print "not " if anyeq($byte, $Ub);
 138 print "ok 21\n";
 139
 140 #string ordering
 141 print "not " unless allgt ( $g1,    $byte  )  &&
 142                     allgt ( $g2,    $byte  )  &&
 143                     allgt ( $byte,  $l     )  &&
 144                     allgt ( $bytes, $U     );
 145 print "ok 22\n";
 146
 147 # upgrade, downgrade
 148
 149 my ($u,$v,$v2);
 150 $u = $v = $v2 = pack("C*", 0xDF);
 151 utf8::upgrade($v);                   #explicit upgrade
 152 $v2 = substr( $v2."\x{410}", 0, -1); #implicit upgrade
 153
 154 # implicit upgrade === explicit upgrade
 155 print "not "  if do{{use bytes; $v ne $v2}} || $v ne $v2;
 156 print "ok 23\n";
 157
 158 # utf8::upgrade is transparent and does not break equality
 159 print "not " unless alleq( $u, $v );
 160 print "ok 24\n";
 161
 162 $u = $v = pack("C*", 0xDF);
 163 utf8::upgrade($v);
 164 #test for a roundtrip, we should get back from where we left
 165 eval {utf8::downgrade( $v )};
 166 print "not " if $@ !~ /^Wide / || do{{use bytes; $u eq $v}} || $u ne $v;
 167 print "ok 25\n";
 168
 169 # some more eq, cmp
 170
 171 my $byte=pack("C*", 0xDF);
 172
 173 print "not " unless pack("U*", 0x3AF) eq $byte;
 174 print "ok 26\n";
 175
 176 print "not " if chr(0xDF) cmp $byte;
 177 print "ok 27\n";
 178
 179 print "not " unless ((pack("U*", 0x3B0)       cmp $byte) ==  1) &&
 180                     ((pack("U*", 0x3AE)       cmp $byte) == -1) &&
 181                     ((pack("U*", 0x3AF, 0x20) cmp $byte) ==  1) &&
 182                     ((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1);
 183 print "ok 28\n";
 184
 185 # Used to core dump in 5.7.3
 186 print ord undef == 0 ? "ok 29\n" : "not ok 29\n";