ext/Encode/lib/Encode/CN/HZ.pm

   1 package Encode::CN::HZ;
   2
   3 use strict;
   4 use warnings;
   5 use utf8 ();
   6
   7 use vars qw($VERSION);
   8 $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
   9
  10 use Encode qw(:fallbacks);
  11
  12 use base qw(Encode::Encoding);
  13 __PACKAGE__->Define('hz');
  14
  15 # HZ is a combination of ASCII and escaped GB, so we implement it
  16 # with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843.
  17
  18 # not ported for EBCDIC.  Which should be used, "~" or "\x7E"?
  19
  20 sub needs_lines { 1 }
  21
  22 sub decode ($$;$) {
  23     my ( $obj, $str, $chk ) = @_;
  24
  25     my $GB  = Encode::find_encoding('gb2312-raw');
  26     my $ret = '';
  27     my $in_ascii = 1;    # default mode is ASCII.
  28
  29     while ( length $str ) {
  30         if ($in_ascii) {    # ASCII mode
  31             if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) {    # no '~' => ASCII
  32                 $ret .= $1;
  33
  34                 # EBCDIC should need ascii2native, but not ported.
  35             }
  36             elsif ( $str =~ s/^\x7E\x7E// ) {           # escaped tilde
  37                 $ret .= '~';
  38             }
  39             elsif ( $str =~ s/^\x7E\cJ// ) {    # '\cJ' == LF in ASCII
  40                 1;                              # no-op
  41             }
  42             elsif ( $str =~ s/^\x7E\x7B// ) {    # '~{'
  43                 $in_ascii = 0;                   # to GB
  44             }
  45             else {    # encounters an invalid escape, \x80 or greater
  46                 last;
  47             }
  48         }
  49         else {        # GB mode; the byte ranges are as in RFC 1843.
  50             no warnings 'uninitialized';
  51             if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) {
  52                 $ret .= $GB->decode( $1, $chk );
  53             }
  54             elsif ( $str =~ s/^\x7E\x7D// ) {    # '~}'
  55                 $in_ascii = 1;
  56             }
  57             else {                               # invalid
  58                 last;
  59             }
  60         }
  61     }
  62     $_[1] = '' if $chk;    # needs_lines guarantees no partial character
  63     return $ret;
  64 }
  65
  66 sub cat_decode {
  67     my ( $obj, undef, $src, $pos, $trm, $chk ) = @_;
  68     my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ];
  69
  70     my $GB  = Encode::find_encoding('gb2312-raw');
  71     my $ret = '';
  72     my $in_ascii = 1;      # default mode is ASCII.
  73
  74     my $ini_pos = pos($$rsrc);
  75
  76     substr( $src, 0, $pos ) = '';
  77
  78     my $ini_len = bytes::length($src);
  79
  80     # $trm is the first of the pair '~~', then 2nd tilde is to be removed.
  81     # XXX: Is better C<$src =~ s/^\x7E// or die if ...>?
  82     $src =~ s/^\x7E// if $trm eq "\x7E";
  83
  84     while ( length $src ) {
  85         my $now;
  86         if ($in_ascii) {    # ASCII mode
  87             if ( $src =~ s/^([\x00-\x7D\x7F])// ) {    # no '~' => ASCII
  88                 $now = $1;
  89             }
  90             elsif ( $src =~ s/^\x7E\x7E// ) {          # escaped tilde
  91                 $now = '~';
  92             }
  93             elsif ( $src =~ s/^\x7E\cJ// ) {    # '\cJ' == LF in ASCII
  94                 next;
  95             }
  96             elsif ( $src =~ s/^\x7E\x7B// ) {    # '~{'
  97                 $in_ascii = 0;                   # to GB
  98                 next;
  99             }
 100             else {    # encounters an invalid escape, \x80 or greater
 101                 last;
 102             }
 103         }
 104         else {        # GB mode; the byte ranges are as in RFC 1843.
 105             if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) {
 106                 $now = $GB->decode( $1, $chk );
 107             }
 108             elsif ( $src =~ s/^\x7E\x7D// ) {    # '~}'
 109                 $in_ascii = 1;
 110                 next;
 111             }
 112             else {                               # invalid
 113                 last;
 114             }
 115         }
 116
 117         next if !defined $now;
 118
 119         $ret .= $now;
 120
 121         if ( $now eq $trm ) {
 122             $$rdst .= $ret;
 123             $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
 124             pos($$rsrc) = $ini_pos;
 125             return 1;
 126         }
 127     }
 128
 129     $$rdst .= $ret;
 130     $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
 131     pos($$rsrc) = $ini_pos;
 132     return '';    # terminator not found
 133 }
 134
 135 sub encode($$;$) {
 136     my ( $obj, $str, $chk ) = @_;
 137
 138     my $GB  = Encode::find_encoding('gb2312-raw');
 139     my $ret = '';
 140     my $in_ascii = 1;    # default mode is ASCII.
 141
 142     no warnings 'utf8';  # $str may be malformed UTF8 at the end of a chunk.
 143
 144     while ( length $str ) {
 145         if ( $str =~ s/^([[:ascii:]]+)// ) {
 146             my $tmp = $1;
 147             $tmp =~ s/~/~~/g;    # escapes tildes
 148             if ( !$in_ascii ) {
 149                 $ret .= "\x7E\x7D";    # '~}'
 150                 $in_ascii = 1;
 151             }
 152             $ret .= pack 'a*', $tmp;    # remove UTF8 flag.
 153         }
 154         elsif ( $str =~ s/(.)// ) {
 155             my $s = $1;
 156             my $tmp = $GB->encode( $s, $chk );
 157             last if !defined $tmp;
 158             if ( length $tmp == 2 ) {    # maybe a valid GB char (XXX)
 159                 if ($in_ascii) {
 160                     $ret .= "\x7E\x7B";    # '~{'
 161                     $in_ascii = 0;
 162                 }
 163                 $ret .= $tmp;
 164             }
 165             elsif ( length $tmp ) {        # maybe FALLBACK in ASCII (XXX)
 166                 if ( !$in_ascii ) {
 167                     $ret .= "\x7E\x7D";    # '~}'
 168                     $in_ascii = 1;
 169                 }
 170                 $ret .= $tmp;
 171             }
 172         }
 173         else {    # if $str is malformed UTF8 *and* if length $str != 0.
 174             last;
 175         }
 176     }
 177     $_[1] = $str if $chk;
 178
 179     # The state at the end of the chunk is discarded, even if in GB mode.
 180     # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
 181     # Parhaps it is harmless, but further investigations may be required...
 182
 183     if ( !$in_ascii ) {
 184         $ret .= "\x7E\x7D";    # '~}'
 185         $in_ascii = 1;
 186     }
 187     utf8::encode($ret); # https://rt.cpan.org/Ticket/Display.html?id=35120
 188     return $ret;
 189 }
 190
 191 1;
 192 __END__
 193
 194 =head1 NAME
 195
 196 Encode::CN::HZ -- internally used by Encode::CN
 197
 198 =cut